Subzero: Emit functions and global initializers in a separate thread.

(This is a continuation of https://codereview.chromium.org/876083007/ .)

Emission is done in a separate thread when -threads=N with N>0 is specified.  This includes both functions and global initializers.

Emission is deterministic.  The parser assigns sequence numbers, and the emitter thread reassembles work units into their original order, regardless of the number of threads.

Dump output, however, is not intended to be in deterministic, reassembled order.  As such, lit tests that test dump output (i.e., '-verbose inst') are explicitly run with -threads=0.

For -elf-writer and -ias=1, the translator thread invokes Cfg::emitIAS() and the assembler buffer is passed to the emitter thread.  For -ias=0, the translator thread passes the Cfg to the emitter thread, which then invokes Cfg::emit() to produce the textual asm.

Minor cleanup along the way:
  * Removed Flags from the Ice::Translator object and ctor, since it was redundant with Ctx->getFlags().
  * Cfg::getAssembler<> is the same as Cfg::getAssembler<Assembler> and is useful for just passing the assembler around.
  * Removed the redundant Ctx argument from TargetDataLowering::lowerConstants() .

BUG= https://code.google.com/p/nativeclient/issues/detail?id=4075
R=jvoung@chromium.org

Review URL: https://codereview.chromium.org/916653004
diff --git a/src/IceCfg.cpp b/src/IceCfg.cpp
index 6526da2..bae6c77 100644
--- a/src/IceCfg.cpp
+++ b/src/IceCfg.cpp
@@ -31,10 +31,10 @@
   return Cfg::getCurrentCfgAllocator();
 }
 
-Cfg::Cfg(GlobalContext *Ctx)
-    : Ctx(Ctx), VMask(Ctx->getVerbose()), FunctionName(""),
-      ReturnType(IceType_void), IsInternalLinkage(false), HasError(false),
-      FocusedTiming(false), ErrorMessage(""), Entry(nullptr),
+Cfg::Cfg(GlobalContext *Ctx, uint32_t SequenceNumber)
+    : Ctx(Ctx), SequenceNumber(SequenceNumber), VMask(Ctx->getVerbose()),
+      FunctionName(""), ReturnType(IceType_void), IsInternalLinkage(false),
+      HasError(false), FocusedTiming(false), ErrorMessage(""), Entry(nullptr),
       NextInstNumber(Inst::NumberInitial), Allocator(new ArenaAllocator<>()),
       Live(nullptr),
       Target(TargetLowering::createLowering(Ctx->getTargetArch(), this)),
@@ -418,17 +418,20 @@
 
 // ======================== Dump routines ======================== //
 
-void Cfg::emitTextHeader(const IceString &MangledName) {
+// emitTextHeader() is not target-specific (apart from what is
+// abstracted by the Assembler), so it is defined here rather than in
+// the target lowering class.
+void Cfg::emitTextHeader(const IceString &MangledName, GlobalContext *Ctx,
+                         const Assembler *Asm) {
   // Note: Still used by emit IAS.
   Ostream &Str = Ctx->getStrEmit();
   Str << "\t.text\n";
   if (Ctx->getFlags().getFunctionSections())
     Str << "\t.section\t.text." << MangledName << ",\"ax\",@progbits\n";
-  if (!getInternal() || Ctx->getFlags().getDisableInternal()) {
+  if (!Asm->getInternal() || Ctx->getFlags().getDisableInternal()) {
     Str << "\t.globl\t" << MangledName << "\n";
     Str << "\t.type\t" << MangledName << ",@function\n";
   }
-  Assembler *Asm = getAssembler<Assembler>();
   Str << "\t.p2align " << Asm->getBundleAlignLog2Bytes() << ",0x";
   for (uint8_t I : Asm->getNonExecBundlePadding())
     Str.write_hex(I);
@@ -449,7 +452,7 @@
   OstreamLocker L(Ctx);
   Ostream &Str = Ctx->getStrEmit();
   IceString MangledName = getContext()->mangleName(getFunctionName());
-  emitTextHeader(MangledName);
+  emitTextHeader(MangledName, Ctx, getAssembler<>());
   for (CfgNode *Node : Nodes)
     Node->emit(this);
   Str << "\n";
@@ -458,22 +461,10 @@
 void Cfg::emitIAS() {
   TimerMarker T(TimerStack::TT_emit, this);
   assert(!Ctx->getFlags().getDecorateAsm());
-  IceString MangledName = getContext()->mangleName(getFunctionName());
   // The emitIAS() routines emit into the internal assembler buffer,
-  // so there's no need to lock the streams until we're ready to call
-  // emitIASBytes().
+  // so there's no need to lock the streams.
   for (CfgNode *Node : Nodes)
     Node->emitIAS(this);
-  // Now write the function to the file and track.
-  if (Ctx->getFlags().getUseELFWriter()) {
-    getAssembler<Assembler>()->alignFunction();
-    Ctx->getObjectWriter()->writeFunctionCode(MangledName, getInternal(),
-                                              getAssembler<Assembler>());
-  } else {
-    OstreamLocker L(Ctx);
-    emitTextHeader(MangledName);
-    getAssembler<Assembler>()->emitIASBytes(Ctx);
-  }
 }
 
 // Dumps the IR with an optional introductory message.
diff --git a/src/IceCfg.h b/src/IceCfg.h
index 605dcf7..737bb64 100644
--- a/src/IceCfg.h
+++ b/src/IceCfg.h
@@ -30,8 +30,9 @@
 public:
   ~Cfg();
 
-  static std::unique_ptr<Cfg> create(GlobalContext *Ctx) {
-    return std::unique_ptr<Cfg>(new Cfg(Ctx));
+  static std::unique_ptr<Cfg> create(GlobalContext *Ctx,
+                                     uint32_t SequenceNumber) {
+    return std::unique_ptr<Cfg>(new Cfg(Ctx, SequenceNumber));
   }
   // Gets a pointer to the current thread's Cfg.
   static const Cfg *getCurrentCfg() { return ICE_TLS_GET_FIELD(CurrentCfg); }
@@ -45,6 +46,7 @@
   }
 
   GlobalContext *getContext() const { return Ctx; }
+  uint32_t getSequenceNumber() const { return SequenceNumber; }
 
   // Returns true if any of the specified options in the verbose mask
   // are set.  If the argument is omitted, it checks if any verbose
@@ -121,9 +123,10 @@
   TargetLowering *getTarget() const { return Target.get(); }
   VariablesMetadata *getVMetadata() const { return VMetadata.get(); }
   Liveness *getLiveness() const { return Live.get(); }
-  template <typename T> T *getAssembler() const {
+  template <typename T = Assembler> T *getAssembler() const {
     return static_cast<T *>(TargetAssembler.get());
   }
+  Assembler *releaseAssembler() { return TargetAssembler.release(); }
   bool hasComputedFrame() const;
   bool getFocusedTiming() const { return FocusedTiming; }
   void setFocusedTiming() { FocusedTiming = true; }
@@ -159,7 +162,8 @@
 
   void emit();
   void emitIAS();
-  void emitTextHeader(const IceString &MangledName);
+  static void emitTextHeader(const IceString &MangledName, GlobalContext *Ctx,
+                             const Assembler *Asm);
   void dump(const IceString &Message = "");
 
   // Allocate data of type T using the per-Cfg allocator.
@@ -181,9 +185,10 @@
   }
 
 private:
-  Cfg(GlobalContext *Ctx);
+  Cfg(GlobalContext *Ctx, uint32_t SequenceNumber);
 
   GlobalContext *Ctx;
+  uint32_t SequenceNumber; // output order for emission
   VerboseMask VMask;
   IceString FunctionName;
   Type ReturnType;
diff --git a/src/IceCfgNode.cpp b/src/IceCfgNode.cpp
index 077866b..d269ee2 100644
--- a/src/IceCfgNode.cpp
+++ b/src/IceCfgNode.cpp
@@ -893,7 +893,7 @@
 
 void CfgNode::emitIAS(Cfg *Func) const {
   Func->setCurrentNode(this);
-  Assembler *Asm = Func->getAssembler<Assembler>();
+  Assembler *Asm = Func->getAssembler<>();
   Asm->BindCfgNodeLabel(getIndex());
   for (const Inst &I : Phis) {
     if (I.isDeleted())
diff --git a/src/IceClFlags.h b/src/IceClFlags.h
index 774a40f..43527b8 100644
--- a/src/IceClFlags.h
+++ b/src/IceClFlags.h
@@ -140,6 +140,7 @@
   // size_t accessors.
 
   size_t getNumTranslationThreads() const { return NumTranslationThreads; }
+  bool isSequential() const { return NumTranslationThreads == 0; }
   void setNumTranslationThreads(size_t NewValue) {
     NumTranslationThreads = NewValue;
   }
diff --git a/src/IceConverter.cpp b/src/IceConverter.cpp
index 7cc8d5a..cc523fa 100644
--- a/src/IceConverter.cpp
+++ b/src/IceConverter.cpp
@@ -84,7 +84,9 @@
       : LLVM2ICEConverter(Converter), Func(nullptr) {}
 
   void convertFunction(const Function *F) {
-    Func = Ice::Cfg::create(Ctx);
+    if (Ctx->isIRGenerationDisabled())
+      return;
+    Func = Ice::Cfg::create(Ctx, Converter.getNextSequenceNumber());
     Ice::Cfg::setCurrentCfg(Func.get());
 
     VarMap.clear();
@@ -658,10 +660,10 @@
       : LLVM2ICEConverter(Converter) {}
 
   /// Converts global variables, and their initializers into ICE
-  /// global variable declarations, for module Mod. Puts corresponding
-  /// converted declarations into VariableDeclarations.
-  void convertGlobalsToIce(Module *Mod,
-                           Ice::VariableDeclarationList &VariableDeclarations);
+  /// global variable declarations, for module Mod. Returns the set of
+  /// converted declarations.
+  std::unique_ptr<Ice::VariableDeclarationList>
+  convertGlobalsToIce(Module *Mod);
 
 private:
   // Adds the Initializer to the list of initializers for the Global
@@ -696,8 +698,10 @@
   }
 };
 
-void LLVM2ICEGlobalsConverter::convertGlobalsToIce(
-    Module *Mod, Ice::VariableDeclarationList &VariableDeclarations) {
+std::unique_ptr<Ice::VariableDeclarationList>
+LLVM2ICEGlobalsConverter::convertGlobalsToIce(Module *Mod) {
+  std::unique_ptr<Ice::VariableDeclarationList> VariableDeclarations(
+      new Ice::VariableDeclarationList);
   for (Module::const_global_iterator I = Mod->global_begin(),
                                      E = Mod->global_end();
        I != E; ++I) {
@@ -706,7 +710,7 @@
 
     Ice::GlobalDeclaration *Var = getConverter().getGlobalDeclaration(GV);
     Ice::VariableDeclaration *VarDecl = cast<Ice::VariableDeclaration>(Var);
-    VariableDeclarations.push_back(VarDecl);
+    VariableDeclarations->push_back(VarDecl);
 
     if (!GV->hasInternalLinkage() && GV->hasInitializer()) {
       std::string Buffer;
@@ -739,6 +743,7 @@
       addGlobalInitializer(*VarDecl, Initializer);
     }
   }
+  return std::move(VariableDeclarations);
 }
 
 void LLVM2ICEGlobalsConverter::addGlobalInitializer(
@@ -801,7 +806,7 @@
 namespace Ice {
 
 void Converter::nameUnnamedGlobalVariables(Module *Mod) {
-  const IceString &GlobalPrefix = Flags.getDefaultGlobalPrefix();
+  const IceString &GlobalPrefix = Ctx->getFlags().getDefaultGlobalPrefix();
   if (GlobalPrefix.empty())
     return;
   uint32_t NameIndex = 0;
@@ -816,7 +821,7 @@
 }
 
 void Converter::nameUnnamedFunctions(Module *Mod) {
-  const IceString &FunctionPrefix = Flags.getDefaultFunctionPrefix();
+  const IceString &FunctionPrefix = Ctx->getFlags().getDefaultFunctionPrefix();
   if (FunctionPrefix.empty())
     return;
   uint32_t NameIndex = 0;
@@ -882,10 +887,7 @@
 }
 
 void Converter::convertGlobals(Module *Mod) {
-  LLVM2ICEGlobalsConverter GlobalsConverter(*this);
-  VariableDeclarationList VariableDeclarations;
-  GlobalsConverter.convertGlobalsToIce(Mod, VariableDeclarations);
-  lowerGlobals(VariableDeclarations);
+  lowerGlobals(LLVM2ICEGlobalsConverter(*this).convertGlobalsToIce(Mod));
 }
 
 void Converter::convertFunctions() {
diff --git a/src/IceConverter.h b/src/IceConverter.h
index 623a4f0..26b647c 100644
--- a/src/IceConverter.h
+++ b/src/IceConverter.h
@@ -29,8 +29,8 @@
   Converter &operator=(const Converter &) = delete;
 
 public:
-  Converter(llvm::Module *Mod, GlobalContext *Ctx, const Ice::ClFlags &Flags)
-      : Translator(Ctx, Flags), Mod(Mod) {}
+  Converter(llvm::Module *Mod, GlobalContext *Ctx)
+      : Translator(Ctx), Mod(Mod) {}
 
   ~Converter() {}
 
diff --git a/src/IceGlobalContext.cpp b/src/IceGlobalContext.cpp
index 3ca8524..d0f84d0 100644
--- a/src/IceGlobalContext.cpp
+++ b/src/IceGlobalContext.cpp
@@ -134,8 +134,10 @@
     : ConstPool(new ConstantPool()), ErrorStatus(), StrDump(OsDump),
       StrEmit(OsEmit), VMask(Mask), Arch(Arch), Opt(Opt),
       TestPrefix(TestPrefix), Flags(Flags), RNG(""), ObjectWriter(),
-      CfgQ(/*MaxSize=*/Flags.getNumTranslationThreads(),
-           /*Sequential=*/(Flags.getNumTranslationThreads() == 0)) {
+      OptQ(/*Sequential=*/Flags.isSequential(),
+           /*MaxSize=*/Flags.getNumTranslationThreads()),
+      // EmitQ is allowed unlimited size.
+      EmitQ(/*Sequential=*/Flags.isSequential()) {
   // Make sure thread_local fields are properly initialized before any
   // accesses are made.  Do this here instead of at the start of
   // main() so that all clients (e.g. unit tests) can benefit for
@@ -162,7 +164,7 @@
 }
 
 void GlobalContext::translateFunctions() {
-  while (std::unique_ptr<Cfg> Func = cfgQueueBlockingPop()) {
+  while (std::unique_ptr<Cfg> Func = optQueueBlockingPop()) {
     // Install Func in TLS for Cfg-specific container allocators.
     Cfg::setCurrentCfg(Func.get());
     // Reset per-function stats being accumulated in TLS.
@@ -180,26 +182,133 @@
         !matchSymbolName(Func->getFunctionName(),
                          getFlags().getTranslateOnly())) {
       Func->dump();
+      Cfg::setCurrentCfg(nullptr);
+      continue; // Func goes out of scope and gets deleted
+    }
+    Func->translate();
+    EmitterWorkItem *Item = nullptr;
+    if (Func->hasError()) {
+      getErrorStatus()->assign(EC_Translation);
+      OstreamLocker L(this);
+      getStrDump() << "ICE translation error: " << Func->getError() << "\n";
+      Item = new EmitterWorkItem(Func->getSequenceNumber());
     } else {
-      Func->translate();
-      if (Func->hasError()) {
-        getErrorStatus()->assign(EC_Translation);
-        OstreamLocker L(this);
-        getStrDump() << "ICE translation error: " << Func->getError() << "\n";
+      if (getFlags().getUseIntegratedAssembler()) {
+        Func->emitIAS();
+        // The Cfg has already emitted into the assembly buffer, so
+        // stats have been fully collected into this thread's TLS.
+        // Dump them before TLS is reset for the next Cfg.
+        dumpStats(Func->getFunctionName());
+        Assembler *Asm = Func->releaseAssembler();
+        // Copy relevant fields into Asm before Func is deleted.
+        Asm->setFunctionName(Func->getFunctionName());
+        Asm->setInternal(Func->getInternal());
+        Item = new EmitterWorkItem(Func->getSequenceNumber(), Asm);
       } else {
-        if (getFlags().getUseIntegratedAssembler())
-          Func->emitIAS();
-        else
-          Func->emit();
-        // TODO(stichnot): actually add to emit queue
+        // The Cfg has not been emitted yet, so stats are not ready
+        // to be dumped.
+        Item = new EmitterWorkItem(Func->getSequenceNumber(), Func.release());
       }
-      dumpStats(Func->getFunctionName());
     }
     Cfg::setCurrentCfg(nullptr);
+    assert(Item);
+    emitQueueBlockingPush(Item);
     // The Cfg now gets deleted as Func goes out of scope.
   }
 }
 
+namespace {
+
+void lowerGlobals(GlobalContext *Ctx,
+                  std::unique_ptr<VariableDeclarationList> VariableDeclarations,
+                  TargetDataLowering *DataLowering) {
+  TimerMarker T(TimerStack::TT_emitGlobalInitializers, Ctx);
+  const bool DumpGlobalVariables = ALLOW_DUMP && Ctx->getVerbose() &&
+                                   Ctx->getFlags().getVerboseFocusOn().empty();
+  if (DumpGlobalVariables) {
+    OstreamLocker L(Ctx);
+    Ostream &Stream = Ctx->getStrDump();
+    for (const Ice::VariableDeclaration *Global : *VariableDeclarations) {
+      Global->dump(Ctx, Stream);
+    }
+  }
+  if (Ctx->getFlags().getDisableTranslation())
+    return;
+  DataLowering->lowerGlobals(std::move(VariableDeclarations));
+}
+
+// Ensure Pending is large enough that Pending[Index] is valid.
+void resizePending(std::vector<EmitterWorkItem *> &Pending, uint32_t Index) {
+  if (Index >= Pending.size())
+    Pending.resize(Index + 1);
+}
+
+} // end of anonymous namespace
+
+void GlobalContext::emitItems() {
+  const bool Threaded = !getFlags().isSequential();
+  // Pending is a vector containing the reassembled, ordered list of
+  // work items.  When we're ready for the next item, we first check
+  // whether it's in the Pending list.  If not, we take an item from
+  // the work queue, and if it's not the item we're waiting for, we
+  // insert it into Pending and repeat.  The work item is deleted
+  // after it is processed.
+  std::vector<EmitterWorkItem *> Pending;
+  uint32_t DesiredSequenceNumber = getFirstSequenceNumber();
+  while (true) {
+    resizePending(Pending, DesiredSequenceNumber);
+    // See if Pending contains DesiredSequenceNumber.
+    EmitterWorkItem *RawItem = Pending[DesiredSequenceNumber];
+    if (RawItem == nullptr)
+      RawItem = emitQueueBlockingPop();
+    if (RawItem == nullptr)
+      return;
+    uint32_t ItemSeq = RawItem->getSequenceNumber();
+    if (Threaded && ItemSeq != DesiredSequenceNumber) {
+      resizePending(Pending, ItemSeq);
+      Pending[ItemSeq] = RawItem;
+      continue;
+    }
+
+    std::unique_ptr<EmitterWorkItem> Item(RawItem);
+    ++DesiredSequenceNumber;
+    switch (Item->getKind()) {
+    case EmitterWorkItem::WI_Nop:
+      break;
+    case EmitterWorkItem::WI_GlobalInits: {
+      lowerGlobals(this, Item->getGlobalInits(),
+                   TargetDataLowering::createLowering(this).get());
+    } break;
+    case EmitterWorkItem::WI_Asm: {
+      std::unique_ptr<Assembler> Asm = Item->getAsm();
+      Asm->alignFunction();
+      IceString MangledName = mangleName(Asm->getFunctionName());
+      if (getFlags().getUseELFWriter()) {
+        getObjectWriter()->writeFunctionCode(MangledName, Asm->getInternal(),
+                                             Asm.get());
+      } else {
+        OstreamLocker L(this);
+        Cfg::emitTextHeader(MangledName, this, Asm.get());
+        Asm->emitIASBytes(this);
+      }
+    } break;
+    case EmitterWorkItem::WI_Cfg: {
+      if (!ALLOW_DUMP)
+        llvm::report_fatal_error("WI_Cfg work item created inappropriately");
+      assert(!getFlags().getUseIntegratedAssembler());
+      std::unique_ptr<Cfg> Func = Item->getCfg();
+      // Unfortunately, we have to temporarily install the Cfg in TLS
+      // because Variable::asType() uses the allocator to create the
+      // differently-typed copy.
+      Cfg::setCurrentCfg(Func.get());
+      Func->emit();
+      Cfg::setCurrentCfg(nullptr);
+      dumpStats(Func->getFunctionName());
+    } break;
+    }
+  }
+}
+
 // Scan a string for S[0-9A-Z]*_ patterns and replace them with
 // S<num>_ where <num> is the next base-36 value.  If a type name
 // legitimately contains that pattern, then the substitution will be
@@ -550,17 +659,31 @@
   Timers->at(StackID).setName(NewName);
 }
 
-// Note: cfgQueueBlockingPush and cfgQueueBlockingPop use unique_ptr
+// Note: optQueueBlockingPush and optQueueBlockingPop use unique_ptr
 // at the interface to take and transfer ownership, but they
 // internally store the raw Cfg pointer in the work queue.  This
 // allows e.g. future queue optimizations such as the use of atomics
 // to modify queue elements.
-void GlobalContext::cfgQueueBlockingPush(std::unique_ptr<Cfg> Func) {
-  CfgQ.blockingPush(Func.release());
+void GlobalContext::optQueueBlockingPush(std::unique_ptr<Cfg> Func) {
+  assert(Func);
+  OptQ.blockingPush(Func.release());
+  if (getFlags().isSequential())
+    translateFunctions();
 }
 
-std::unique_ptr<Cfg> GlobalContext::cfgQueueBlockingPop() {
-  return std::unique_ptr<Cfg>(CfgQ.blockingPop());
+std::unique_ptr<Cfg> GlobalContext::optQueueBlockingPop() {
+  return std::unique_ptr<Cfg>(OptQ.blockingPop());
+}
+
+void GlobalContext::emitQueueBlockingPush(EmitterWorkItem *Item) {
+  assert(Item);
+  EmitQ.blockingPush(Item);
+  if (getFlags().isSequential())
+    emitItems();
+}
+
+EmitterWorkItem *GlobalContext::emitQueueBlockingPop() {
+  return EmitQ.blockingPop();
 }
 
 void GlobalContext::dumpStats(const IceString &Name, bool Final) {
diff --git a/src/IceGlobalContext.h b/src/IceGlobalContext.h
index 280bbd0..f7f5a5a 100644
--- a/src/IceGlobalContext.h
+++ b/src/IceGlobalContext.h
@@ -23,6 +23,7 @@
 #include "IceClFlags.h"
 #include "IceIntrinsics.h"
 #include "IceRNG.h"
+#include "IceThreading.h"
 #include "IceTimerTree.h"
 #include "IceTypes.h"
 #include "IceUtils.h"
@@ -31,6 +32,7 @@
 
 class ClFlags;
 class ConstantPool;
+class EmitterWorkItem;
 class FuncSigType;
 
 // LockedPtr is a way to provide automatically locked access to some object.
@@ -276,18 +278,28 @@
   void resetTimer(TimerStackIdT StackID);
   void setTimerName(TimerStackIdT StackID, const IceString &NewName);
 
+  // This is the first work item sequence number that the parser
+  // produces, and correspondingly the first sequence number that the
+  // emitter thread will wait for.  Start numbering at 1 to leave room
+  // for a sentinel, in case e.g. we wish to inject items with a
+  // special sequence number that may be executed out of order.
+  static uint32_t getFirstSequenceNumber() { return 1; }
   // Adds a newly parsed and constructed function to the Cfg work
   // queue.  Notifies any idle workers that a new function is
   // available for translating.  May block if the work queue is too
   // large, in order to control memory footprint.
-  void cfgQueueBlockingPush(std::unique_ptr<Cfg> Func);
+  void optQueueBlockingPush(std::unique_ptr<Cfg> Func);
   // Takes a Cfg from the work queue for translating.  May block if
   // the work queue is currently empty.  Returns nullptr if there is
   // no more work - the queue is empty and either end() has been
   // called or the Sequential flag was set.
-  std::unique_ptr<Cfg> cfgQueueBlockingPop();
+  std::unique_ptr<Cfg> optQueueBlockingPop();
   // Notifies that no more work will be added to the work queue.
-  void cfgQueueNotifyEnd() { CfgQ.notifyEnd(); }
+  void optQueueNotifyEnd() { OptQ.notifyEnd(); }
+
+  void emitQueueBlockingPush(EmitterWorkItem *Item);
+  EmitterWorkItem *emitQueueBlockingPop();
+  void emitQueueNotifyEnd() { EmitQ.notifyEnd(); }
 
   void startWorkerThreads() {
     size_t NumWorkers = getFlags().getNumTranslationThreads();
@@ -300,18 +312,29 @@
           &GlobalContext::translateFunctionsWrapper, this, WorkerTLS));
     }
     if (NumWorkers) {
-      // TODO(stichnot): start a new thread for the emitter queue worker.
+      ThreadContext *WorkerTLS = new ThreadContext();
+      Timers->initInto(WorkerTLS->Timers);
+      AllThreadContexts.push_back(WorkerTLS);
+      EmitterThreads.push_back(
+          std::thread(&GlobalContext::emitterWrapper, this, WorkerTLS));
     }
   }
 
   void waitForWorkerThreads() {
-    cfgQueueNotifyEnd();
-    // TODO(stichnot): call end() on the emitter work queue.
+    optQueueNotifyEnd();
     for (std::thread &Worker : TranslationThreads) {
       Worker.join();
     }
     TranslationThreads.clear();
-    // TODO(stichnot): join the emitter thread.
+
+    // Only notify the emit queue to end after all the translation
+    // threads have ended.
+    emitQueueNotifyEnd();
+    for (std::thread &Worker : EmitterThreads) {
+      Worker.join();
+    }
+    EmitterThreads.clear();
+
     if (ALLOW_DUMP) {
       auto Timers = getTimers();
       for (ThreadContext *TLS : AllThreadContexts)
@@ -334,6 +357,15 @@
   // Translate functions from the Cfg queue until the queue is empty.
   void translateFunctions();
 
+  // Emitter thread startup routine.
+  void emitterWrapper(ThreadContext *MyTLS) {
+    ICE_TLS_SET_FIELD(TLS, MyTLS);
+    emitItems();
+  }
+  // Emit functions and global initializers from the emitter queue
+  // until the queue is empty.
+  void emitItems();
+
   // Utility function to match a symbol name against a match string.
   // This is used in a few cases where we want to take some action on
   // a particular function or symbol based on a command-line argument,
@@ -390,7 +422,8 @@
   const ClFlags &Flags;
   RandomNumberGenerator RNG; // TODO(stichnot): Move into Cfg.
   std::unique_ptr<ELFObjectWriter> ObjectWriter;
-  BoundedProducerConsumerQueue<Cfg> CfgQ;
+  BoundedProducerConsumerQueue<Cfg> OptQ;
+  BoundedProducerConsumerQueue<EmitterWorkItem> EmitQ;
 
   LockedPtr<ArenaAllocator<>> getAllocator() {
     return LockedPtr<ArenaAllocator<>>(&Allocator, &AllocLock);
@@ -405,8 +438,9 @@
     return LockedPtr<TimerList>(&Timers, &TimerLock);
   }
 
-  std::vector<ThreadContext *> AllThreadContexts;
-  std::vector<std::thread> TranslationThreads;
+  llvm::SmallVector<ThreadContext *, 128> AllThreadContexts;
+  llvm::SmallVector<std::thread, 128> TranslationThreads;
+  llvm::SmallVector<std::thread, 128> EmitterThreads;
   // Each thread has its own TLS pointer which is also held in
   // AllThreadContexts.
   ICE_TLS_DECLARE_FIELD(ThreadContext *, TLS);
diff --git a/src/IceTargetLowering.cpp b/src/IceTargetLowering.cpp
index 8b7c257..8169ae1 100644
--- a/src/IceTargetLowering.cpp
+++ b/src/IceTargetLowering.cpp
@@ -252,19 +252,20 @@
   LinearScan.scan(RegMask, RandomizeRegisterAllocation);
 }
 
-TargetDataLowering *TargetDataLowering::createLowering(GlobalContext *Ctx) {
+std::unique_ptr<TargetDataLowering>
+TargetDataLowering::createLowering(GlobalContext *Ctx) {
   // These statements can be #ifdef'd to specialize the code generator
   // to a subset of the available targets.  TODO: use CRTP.
   TargetArch Target = Ctx->getTargetArch();
   if (Target == Target_X8632)
-    return TargetDataX8632::create(Ctx);
+    return std::unique_ptr<TargetDataLowering>(TargetDataX8632::create(Ctx));
 #if 0
   if (Target == Target_X8664)
-    return TargetDataX8664::create(Ctx);
+    return std::unique_ptr<TargetDataLowering>(TargetDataX8664::create(Ctx));
   if (Target == Target_ARM32)
-    return TargetDataARM32::create(Ctx);
+    return std::unique_ptr<TargetDataLowering>(TargetDataARM32::create(Ctx));
   if (Target == Target_ARM64)
-    return TargetDataARM64::create(Ctx);
+    return std::unique_ptr<TargetDataLowering>(TargetDataARM64::create(Ctx));
 #endif
   llvm_unreachable("Unsupported target");
   return nullptr;
diff --git a/src/IceTargetLowering.h b/src/IceTargetLowering.h
index 2dda5c5..92eed51 100644
--- a/src/IceTargetLowering.h
+++ b/src/IceTargetLowering.h
@@ -7,11 +7,14 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file declares the TargetLowering and LoweringContext
-// classes.  TargetLowering is an abstract class used to drive the
-// translation/lowering process.  LoweringContext maintains a
-// context for lowering each instruction, offering conveniences such
-// as iterating over non-deleted instructions.
+// This file declares the TargetLowering, LoweringContext, and
+// TargetDataLowering classes.  TargetLowering is an abstract class
+// used to drive the translation/lowering process.  LoweringContext
+// maintains a context for lowering each instruction, offering
+// conveniences such as iterating over non-deleted instructions.
+// TargetDataLowering is an abstract class used to drive the
+// lowering/emission of global initializers, external global
+// declarations, and internal constant pools.
 //
 //===----------------------------------------------------------------------===//
 
@@ -247,12 +250,12 @@
   TargetDataLowering &operator=(const TargetDataLowering &) = delete;
 
 public:
-  static TargetDataLowering *createLowering(GlobalContext *Ctx);
+  static std::unique_ptr<TargetDataLowering> createLowering(GlobalContext *Ctx);
   virtual ~TargetDataLowering();
 
-  virtual void lowerGlobal(const VariableDeclaration &Var) const = 0;
-  virtual void lowerGlobalsELF(const VariableDeclarationList &Vars) const = 0;
-  virtual void lowerConstants(GlobalContext *Ctx) const = 0;
+  virtual void
+  lowerGlobals(std::unique_ptr<VariableDeclarationList> Vars) const = 0;
+  virtual void lowerConstants() const = 0;
 
 protected:
   TargetDataLowering(GlobalContext *Ctx) : Ctx(Ctx) {}
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index 9fd0106..51f1f3b 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -4639,10 +4639,20 @@
   Str << "\t.size\t" << MangledName << ", " << Size << "\n";
 }
 
-void
-TargetDataX8632::lowerGlobalsELF(const VariableDeclarationList &Vars) const {
-  ELFObjectWriter *Writer = Ctx->getObjectWriter();
-  Writer->writeDataSection(Vars, llvm::ELF::R_386_32);
+void TargetDataX8632::lowerGlobals(
+    std::unique_ptr<VariableDeclarationList> Vars) const {
+  if (Ctx->getFlags().getUseELFWriter()) {
+    ELFObjectWriter *Writer = Ctx->getObjectWriter();
+    Writer->writeDataSection(*Vars, llvm::ELF::R_386_32);
+  } else {
+    const IceString &TranslateOnly = Ctx->getFlags().getTranslateOnly();
+    OstreamLocker L(Ctx);
+    for (const VariableDeclaration *Var : *Vars) {
+      if (GlobalContext::matchSymbolName(Var->getName(), TranslateOnly)) {
+        lowerGlobal(*Var);
+      }
+    }
+  }
 }
 
 template <typename T> struct PoolTypeConverter {};
@@ -4701,7 +4711,7 @@
   }
 }
 
-void TargetDataX8632::lowerConstants(GlobalContext *Ctx) const {
+void TargetDataX8632::lowerConstants() const {
   if (Ctx->getFlags().getDisableTranslation())
     return;
   // No need to emit constants from the int pool since (for x86) they
diff --git a/src/IceTargetLoweringX8632.h b/src/IceTargetLoweringX8632.h
index fca29c4..543183e 100644
--- a/src/IceTargetLoweringX8632.h
+++ b/src/IceTargetLoweringX8632.h
@@ -497,14 +497,14 @@
     return new TargetDataX8632(Ctx);
   }
 
-  void lowerGlobal(const VariableDeclaration &Var) const final;
-  void lowerGlobalsELF(const VariableDeclarationList &Vars) const final;
-  void lowerConstants(GlobalContext *Ctx) const final;
+  void lowerGlobals(std::unique_ptr<VariableDeclarationList> Vars) const final;
+  void lowerConstants() const final;
 
 protected:
   TargetDataX8632(GlobalContext *Ctx);
 
 private:
+  void lowerGlobal(const VariableDeclaration &Var) const;
   ~TargetDataX8632() override {}
   template <typename T> static void emitConstantPool(GlobalContext *Ctx);
 };
diff --git a/src/IceThreading.cpp b/src/IceThreading.cpp
new file mode 100644
index 0000000..5576abb
--- /dev/null
+++ b/src/IceThreading.cpp
@@ -0,0 +1,48 @@
+//===- subzero/src/IceThreading.cpp - Threading function definitions ------===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines threading-related functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "IceCfg.h"
+#include "IceDefs.h"
+#include "IceThreading.h"
+
+namespace Ice {
+
+EmitterWorkItem::EmitterWorkItem(uint32_t Seq)
+    : Sequence(Seq), Kind(WI_Nop), GlobalInits(nullptr), Function(nullptr),
+      RawFunc(nullptr) {}
+EmitterWorkItem::EmitterWorkItem(uint32_t Seq, VariableDeclarationList *D)
+    : Sequence(Seq), Kind(WI_GlobalInits), GlobalInits(D), Function(nullptr),
+      RawFunc(nullptr) {}
+EmitterWorkItem::EmitterWorkItem(uint32_t Seq, Assembler *A)
+    : Sequence(Seq), Kind(WI_Asm), GlobalInits(nullptr), Function(A),
+      RawFunc(nullptr) {}
+EmitterWorkItem::EmitterWorkItem(uint32_t Seq, Cfg *F)
+    : Sequence(Seq), Kind(WI_Cfg), GlobalInits(nullptr), Function(nullptr),
+      RawFunc(F) {}
+
+std::unique_ptr<VariableDeclarationList> EmitterWorkItem::getGlobalInits() {
+  assert(getKind() == WI_GlobalInits);
+  return std::move(GlobalInits);
+}
+
+std::unique_ptr<Assembler> EmitterWorkItem::getAsm() {
+  assert(getKind() == WI_Asm);
+  return std::move(Function);
+}
+
+std::unique_ptr<Cfg> EmitterWorkItem::getCfg() {
+  assert(getKind() == WI_Cfg);
+  return std::move(RawFunc);
+}
+
+} // end of namespace Ice
diff --git a/src/IceThreading.h b/src/IceThreading.h
new file mode 100644
index 0000000..9ae3b67
--- /dev/null
+++ b/src/IceThreading.h
@@ -0,0 +1,207 @@
+//===- subzero/src/IceThreading.h - Threading functions ---------*- C++ -*-===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares threading-related functions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SUBZERO_SRC_ICETHREADING_H
+#define SUBZERO_SRC_ICETHREADING_H
+
+#include <condition_variable>
+#include <mutex>
+
+#include "IceDefs.h"
+
+namespace Ice {
+
+// BoundedProducerConsumerQueue is a work queue that allows multiple
+// producers and multiple consumers.  A producer adds entries using
+// blockingPush(), and may block if the queue is "full".  A producer
+// uses notifyEnd() to indicate that no more entries will be added.  A
+// consumer removes an item using blockingPop(), which will return
+// nullptr if notifyEnd() has been called and the queue is empty (it
+// never returns nullptr if the queue contained any items).
+//
+// The MaxSize ctor arg controls the maximum size the queue can grow
+// to (subject to a hard limit of MaxStaticSize).  The Sequential
+// arg indicates purely sequential execution in which the single
+// thread should never wait().
+//
+// Two condition variables are used in the implementation.
+// GrewOrEnded signals a waiting worker that a producer has changed
+// the state of the queue.  Shrunk signals a blocked producer that a
+// consumer has changed the state of the queue.
+//
+// In the Sequential case the lock is still acquired, but the wait()
+// predicates are immediately true, so the single thread never blocks
+// on either condition variable.
+//
+// Internally, the queue is implemented as a circular array of size
+// MaxStaticSize, where the queue boundaries are denoted by the Front
+// and Back fields.  Front==Back indicates an empty queue.
+template <typename T, size_t MaxStaticSize = 128>
+class BoundedProducerConsumerQueue {
+  BoundedProducerConsumerQueue() = delete;
+  BoundedProducerConsumerQueue(const BoundedProducerConsumerQueue &) = delete;
+  BoundedProducerConsumerQueue &
+  operator=(const BoundedProducerConsumerQueue &) = delete;
+
+public:
+  BoundedProducerConsumerQueue(bool Sequential, size_t MaxSize = MaxStaticSize)
+      : Back(0), Front(0), MaxSize(std::min(MaxSize, MaxStaticSize)),
+        Sequential(Sequential), IsEnded(false) {}
+  void blockingPush(T *Item) {
+    {
+      std::unique_lock<GlobalLockType> L(Lock);
+      // If the work queue is already "full", wait for a consumer to
+      // grab an element and shrink the queue.
+      Shrunk.wait(L, [this] { return size() < MaxSize || Sequential; });
+      push(Item);
+    }
+    GrewOrEnded.notify_one();
+  }
+  T *blockingPop() {
+    T *Item = nullptr;
+    bool ShouldNotifyProducer = false;
+    {
+      std::unique_lock<GlobalLockType> L(Lock);
+      GrewOrEnded.wait(L, [this] { return IsEnded || !empty() || Sequential; });
+      if (!empty()) {
+        Item = pop();
+        ShouldNotifyProducer = !IsEnded;
+      }
+    }
+    if (ShouldNotifyProducer)
+      Shrunk.notify_one();
+    return Item;
+  }
+  void notifyEnd() {
+    {
+      std::lock_guard<GlobalLockType> L(Lock);
+      IsEnded = true;
+    }
+    GrewOrEnded.notify_all();
+  }
+
+private:
+  const static size_t MaxStaticSizeMask = MaxStaticSize - 1;
+  static_assert(!(MaxStaticSize & (MaxStaticSize - 1)),
+                "MaxStaticSize must be a power of 2");
+
+  // WorkItems and Lock are read/written by all.
+  ICE_CACHELINE_BOUNDARY;
+  T *WorkItems[MaxStaticSize];
+  ICE_CACHELINE_BOUNDARY;
+  // Lock guards access to WorkItems, Front, Back, and IsEnded.
+  GlobalLockType Lock;
+
+  ICE_CACHELINE_BOUNDARY;
+  // GrewOrEnded is written by the producers and read by the
+  // consumers.  It is notified (by the producer) when something is
+  // added to the queue or when notifyEnd() is called, in case
+  // consumers are waiting for a non-empty queue.
+  std::condition_variable GrewOrEnded;
+  // Back is the index into WorkItems[] of where the next element will
+  // be pushed.  (More precisely, Back&MaxStaticSizeMask is the index.)
+  // It is written by the producers, and read by all via size() and
+  // empty().
+  size_t Back;
+
+  ICE_CACHELINE_BOUNDARY;
+  // Shrunk is notified (by the consumer) when something is removed
+  // from the queue, in case a producer is waiting for the queue to
+  // drop below maximum capacity.  It is written by the consumers and
+  // read by the producers.
+  std::condition_variable Shrunk;
+  // Front is the index into WorkItems[] of the oldest element,
+  // i.e. the next to be popped.  (More precisely Front&MaxStaticSizeMask
+  // is the index.)  It is written by the consumers, and read by all
+  // via size() and empty().
+  size_t Front;
+
+  ICE_CACHELINE_BOUNDARY;
+
+  // MaxSize and Sequential are read by all and written by none.
+  const size_t MaxSize;
+  const bool Sequential;
+  // IsEnded is read by the consumers, and only written once by the
+  // producer.
+  bool IsEnded;
+
+  // The lock must be held when the following methods are called.
+  bool empty() const { return Front == Back; }
+  size_t size() const { return Back - Front; }
+  void push(T *Item) {
+    WorkItems[Back++ & MaxStaticSizeMask] = Item;
+    assert(size() <= MaxStaticSize);
+  }
+  T *pop() {
+    assert(!empty());
+    return WorkItems[Front++ & MaxStaticSizeMask];
+  }
+};
+
+// EmitterWorkItem is a simple wrapper around a pointer that
+// represents a work item to be emitted, i.e. a function or a set of
+// global declarations and initializers, and it includes a sequence
+// number so that work items can be emitted in a particular order for
+// deterministic output.  It acts like an interface class, but instead
+// of making the classes of interest inherit from EmitterWorkItem, it
+// wraps pointers to these classes.  Some space is wasted compared to
+// storing the pointers in a union, but not too much due to the work
+// granularity.
+class EmitterWorkItem {
+  EmitterWorkItem() = delete;
+  EmitterWorkItem(const EmitterWorkItem &) = delete;
+  EmitterWorkItem &operator=(const EmitterWorkItem &) = delete;
+
+public:
+  // ItemKind can be one of the following:
+  //
+  // WI_Nop: No actual work.  This is a placeholder to maintain
+  // sequence numbers in case there is a translation error.
+  //
+  // WI_GlobalInits: A list of global declarations and initializers.
+  //
+  // WI_Asm: A function that has already had emitIAS() called on it.
+  // The work is transferred via the Assembler buffer, and the
+  // originating Cfg has been deleted (to recover lots of memory).
+  //
+  // WI_Cfg: A Cfg that has not yet had emit() or emitIAS() called on
+  // it.  This is only used as a debugging configuration when we want
+  // to emit "readable" assembly code, possibly annotated with
+  // liveness and other information only available in the Cfg and not
+  // in the Assembler buffer.
+  enum ItemKind { WI_Nop, WI_GlobalInits, WI_Asm, WI_Cfg };
+  // Constructor for a WI_Nop work item.
+  explicit EmitterWorkItem(uint32_t Seq);
+  // Constructor for a WI_GlobalInits work item.
+  EmitterWorkItem(uint32_t Seq, VariableDeclarationList *D);
+  // Constructor for a WI_Asm work item.
+  EmitterWorkItem(uint32_t Seq, Assembler *A);
+  // Constructor for a WI_Cfg work item.
+  EmitterWorkItem(uint32_t Seq, Cfg *F);
+  uint32_t getSequenceNumber() const { return Sequence; }
+  ItemKind getKind() const { return Kind; }
+  std::unique_ptr<VariableDeclarationList> getGlobalInits();
+  std::unique_ptr<Assembler> getAsm();
+  std::unique_ptr<Cfg> getCfg();
+
+private:
+  const uint32_t Sequence;
+  const ItemKind Kind;
+  std::unique_ptr<VariableDeclarationList> GlobalInits;
+  std::unique_ptr<Assembler> Function;
+  std::unique_ptr<Cfg> RawFunc;
+};
+
+} // end of namespace Ice
+
+#endif // SUBZERO_SRC_ICETHREADING_H
diff --git a/src/IceTranslator.cpp b/src/IceTranslator.cpp
index c47769d..c78bbcb 100644
--- a/src/IceTranslator.cpp
+++ b/src/IceTranslator.cpp
@@ -21,9 +21,9 @@
 
 using namespace Ice;
 
-Translator::Translator(GlobalContext *Ctx, const ClFlags &Flags)
-    : Ctx(Ctx), Flags(Flags),
-      DataLowering(TargetDataLowering::createLowering(Ctx)), ErrorStatus() {}
+Translator::Translator(GlobalContext *Ctx)
+    : Ctx(Ctx), NextSequenceNumber(GlobalContext::getFirstSequenceNumber()),
+      ErrorStatus() {}
 
 Translator::~Translator() {}
 
@@ -54,15 +54,12 @@
 }
 
 void Translator::translateFcn(std::unique_ptr<Cfg> Func) {
-  Ctx->cfgQueueBlockingPush(std::move(Func));
-  if (Ctx->getFlags().getNumTranslationThreads() == 0) {
-    Ctx->translateFunctions();
-  }
+  Ctx->optQueueBlockingPush(std::move(Func));
 }
 
 void Translator::emitConstants() {
   if (!getErrorStatus())
-    DataLowering->lowerConstants(Ctx);
+    TargetDataLowering::createLowering(Ctx)->lowerConstants();
 }
 
 void Translator::transferErrorCode() const {
@@ -70,33 +67,9 @@
     Ctx->getErrorStatus()->assign(getErrorStatus().value());
 }
 
-void
-Translator::lowerGlobals(const VariableDeclarationList &VariableDeclarations) {
-  TimerMarker T(TimerStack::TT_emitGlobalInitializers, Ctx);
-  bool DisableTranslation = Ctx->getFlags().getDisableTranslation();
-  const bool DumpGlobalVariables = ALLOW_DUMP && Ctx->getVerbose() &&
-                                   Ctx->getFlags().getVerboseFocusOn().empty();
-  if (Ctx->getFlags().getUseELFWriter()) {
-    // Dump all globals if requested, but don't interleave w/ emission.
-    if (DumpGlobalVariables) {
-      OstreamLocker L(Ctx);
-      Ostream &Stream = Ctx->getStrDump();
-      for (const Ice::VariableDeclaration *Global : VariableDeclarations) {
-        Global->dump(getContext(), Stream);
-      }
-    }
-    DataLowering->lowerGlobalsELF(VariableDeclarations);
-  } else {
-    const IceString &TranslateOnly = Ctx->getFlags().getTranslateOnly();
-    OstreamLocker L(Ctx);
-    Ostream &Stream = Ctx->getStrDump();
-    for (const Ice::VariableDeclaration *Global : VariableDeclarations) {
-      // Interleave dump output w/ emit output.
-      if (DumpGlobalVariables)
-        Global->dump(getContext(), Stream);
-      if (!DisableTranslation &&
-          GlobalContext::matchSymbolName(Global->getName(), TranslateOnly))
-        DataLowering->lowerGlobal(*Global);
-    }
-  }
+void Translator::lowerGlobals(
+    std::unique_ptr<VariableDeclarationList> VariableDeclarations) {
+  EmitterWorkItem *Item = new EmitterWorkItem(getNextSequenceNumber(),
+                                              VariableDeclarations.release());
+  Ctx->emitQueueBlockingPush(Item);
 }
diff --git a/src/IceTranslator.h b/src/IceTranslator.h
index 6b35b25..bc9a933 100644
--- a/src/IceTranslator.h
+++ b/src/IceTranslator.h
@@ -34,14 +34,14 @@
   Translator &operator=(const Translator &) = delete;
 
 public:
-  Translator(GlobalContext *Ctx, const ClFlags &Flags);
+  explicit Translator(GlobalContext *Ctx);
 
   ~Translator();
   const ErrorCode &getErrorStatus() const { return ErrorStatus; }
 
   GlobalContext *getContext() const { return Ctx; }
 
-  const ClFlags &getFlags() const { return Flags; }
+  const ClFlags &getFlags() const { return Ctx->getFlags(); }
 
   /// Translates the constructed ICE function Fcn to machine code.
   /// Takes ownership of Func.
@@ -56,7 +56,8 @@
 
   /// Lowers the given list of global addresses to target. Generates
   /// list of corresponding variable declarations.
-  void lowerGlobals(const VariableDeclarationList &VariableDeclarations);
+  void
+  lowerGlobals(std::unique_ptr<VariableDeclarationList> VariableDeclarations);
 
   /// Creates a name using the given prefix and corresponding index.
   std::string createUnnamedName(const IceString &Prefix, SizeT Index);
@@ -67,10 +68,11 @@
   bool checkIfUnnamedNameSafe(const IceString &Name, const char *Kind,
                               const IceString &Prefix);
 
+  uint32_t getNextSequenceNumber() { return NextSequenceNumber++; }
+
 protected:
   GlobalContext *Ctx;
-  const ClFlags &Flags;
-  std::unique_ptr<TargetDataLowering> DataLowering;
+  uint32_t NextSequenceNumber;
   // Exit status of the translation. False is successful. True otherwise.
   ErrorCode ErrorStatus;
 };
diff --git a/src/IceUtils.h b/src/IceUtils.h
index dc3a5ff..7b1ab81 100644
--- a/src/IceUtils.h
+++ b/src/IceUtils.h
@@ -13,9 +13,7 @@
 
 #ifndef SUBZERO_SRC_ICEUTILS_H
 #define SUBZERO_SRC_ICEUTILS_H
-
 #include <climits>
-#include <condition_variable>
 
 namespace Ice {
 
@@ -63,133 +61,6 @@
   }
 };
 
-// BoundedProducerConsumerQueue is a work queue that allows multiple
-// producers and multiple consumers.  A producer adds entries using
-// blockingPush(), and may block if the queue is "full".  A producer
-// uses notifyEnd() to indicate that no more entries will be added.  A
-// consumer removes an item using blockingPop(), which will return
-// nullptr if notifyEnd() has been called and the queue is empty (it
-// never returns nullptr if the queue contained any items).
-//
-// The MaxSize ctor arg controls the maximum size the queue can grow
-// to (subject to a hard limit of MaxStaticSize-1).  The Sequential
-// arg indicates purely sequential execution in which the single
-// thread should never wait().
-//
-// Two condition variables are used in the implementation.
-// GrewOrEnded signals a waiting worker that a producer has changed
-// the state of the queue.  Shrunk signals a blocked producer that a
-// consumer has changed the state of the queue.
-//
-// The methods begin with Sequential-specific code to be most clear.
-// The lock and condition variables are not used in the Sequential
-// case.
-//
-// Internally, the queue is implemented as a circular array of size
-// MaxStaticSize, where the queue boundaries are denoted by the Front
-// and Back fields.  Front==Back indicates an empty queue.
-template <typename T, size_t MaxStaticSize = 128>
-class BoundedProducerConsumerQueue {
-  BoundedProducerConsumerQueue() = delete;
-  BoundedProducerConsumerQueue(const BoundedProducerConsumerQueue &) = delete;
-  BoundedProducerConsumerQueue &
-  operator=(const BoundedProducerConsumerQueue &) = delete;
-
-public:
-  BoundedProducerConsumerQueue(size_t MaxSize, bool Sequential)
-      : Back(0), Front(0), MaxSize(std::min(MaxSize, MaxStaticSize)),
-        Sequential(Sequential), IsEnded(false) {}
-  void blockingPush(T *Item) {
-    {
-      std::unique_lock<GlobalLockType> L(Lock);
-      // If the work queue is already "full", wait for a consumer to
-      // grab an element and shrink the queue.
-      Shrunk.wait(L, [this] { return size() < MaxSize || Sequential; });
-      push(Item);
-    }
-    GrewOrEnded.notify_one();
-  }
-  T *blockingPop() {
-    T *Item = nullptr;
-    bool ShouldNotifyProducer = false;
-    {
-      std::unique_lock<GlobalLockType> L(Lock);
-      GrewOrEnded.wait(L, [this] { return IsEnded || !empty() || Sequential; });
-      if (!empty()) {
-        Item = pop();
-        ShouldNotifyProducer = !IsEnded;
-      }
-    }
-    if (ShouldNotifyProducer)
-      Shrunk.notify_one();
-    return Item;
-  }
-  void notifyEnd() {
-    {
-      std::lock_guard<GlobalLockType> L(Lock);
-      IsEnded = true;
-    }
-    GrewOrEnded.notify_all();
-  }
-
-private:
-  const static size_t MaxStaticSizeMask = MaxStaticSize - 1;
-  static_assert(!(MaxStaticSize & (MaxStaticSize - 1)),
-                "MaxStaticSize must be a power of 2");
-
-  // WorkItems and Lock are read/written by all.
-  ICE_CACHELINE_BOUNDARY;
-  T *WorkItems[MaxStaticSize];
-  ICE_CACHELINE_BOUNDARY;
-  // Lock guards access to WorkItems, Front, Back, and IsEnded.
-  GlobalLockType Lock;
-
-  ICE_CACHELINE_BOUNDARY;
-  // GrewOrEnded is written by the producers and read by the
-  // consumers.  It is notified (by the producer) when something is
-  // added to the queue, in case consumers are waiting for a non-empty
-  // queue.
-  std::condition_variable GrewOrEnded;
-  // Back is the index into WorkItems[] of where the next element will
-  // be pushed.  (More precisely, Back&MaxStaticSize is the index.)
-  // It is written by the producers, and read by all via size() and
-  // empty().
-  size_t Back;
-
-  ICE_CACHELINE_BOUNDARY;
-  // Shrunk is notified (by the consumer) when something is removed
-  // from the queue, in case a producer is waiting for the queue to
-  // drop below maximum capacity.  It is written by the consumers and
-  // read by the producers.
-  std::condition_variable Shrunk;
-  // Front is the index into WorkItems[] of the oldest element,
-  // i.e. the next to be popped.  (More precisely Front&MaxStaticSize
-  // is the index.)  It is written by the consumers, and read by all
-  // via size() and empty().
-  size_t Front;
-
-  ICE_CACHELINE_BOUNDARY;
-
-  // MaxSize and Sequential are read by all and written by none.
-  const size_t MaxSize;
-  const bool Sequential;
-  // IsEnded is read by the consumers, and only written once by the
-  // producer.
-  bool IsEnded;
-
-  // The lock must be held when the following methods are called.
-  bool empty() const { return Front == Back; }
-  size_t size() const { return Back - Front; }
-  void push(T *Item) {
-    WorkItems[Back++ & MaxStaticSizeMask] = Item;
-    assert(size() <= MaxStaticSize);
-  }
-  T *pop() {
-    assert(!empty());
-    return WorkItems[Front++ & MaxStaticSizeMask];
-  }
-};
-
 } // end of namespace Ice
 
 #endif // SUBZERO_SRC_ICEUTILS_H
diff --git a/src/PNaClTranslator.cpp b/src/PNaClTranslator.cpp
index 48f47a9..a0ce417 100644
--- a/src/PNaClTranslator.cpp
+++ b/src/PNaClTranslator.cpp
@@ -1150,7 +1150,8 @@
     }
 
     if (!isIRGenerationDisabled())
-      Func = Ice::Cfg::create(getTranslator().getContext());
+      Func = Ice::Cfg::create(getTranslator().getContext(),
+                              getTranslator().getNextSequenceNumber());
     Ice::Cfg::setCurrentCfg(Func.get());
 
     // TODO(kschimpf) Clean up API to add a function signature to
@@ -1185,7 +1186,7 @@
     // translation of all remaining functions. This allows successive
     // parsing errors to be reported, without adding extra checks to
     // the translator for such parsing errors.
-    if (Context->getNumErrors() == 0) {
+    if (Context->getNumErrors() == 0 && Func) {
       getTranslator().translateFcn(std::move(Func));
       // The translator now has ownership of Func.
     } else {
@@ -2863,10 +2864,7 @@
     if (!GlobalDeclarationNamesAndInitializersInstalled) {
       Context->installGlobalNames();
       Context->createValueIDs();
-      std::unique_ptr<Ice::VariableDeclarationList> DeclsPtr =
-          Context->getGlobalVariables();
-      const Ice::VariableDeclarationList &Decls = *DeclsPtr;
-      getTranslator().lowerGlobals(Decls);
+      getTranslator().lowerGlobals(Context->getGlobalVariables());
       GlobalDeclarationNamesAndInitializersInstalled = true;
     }
   }
diff --git a/src/PNaClTranslator.h b/src/PNaClTranslator.h
index e818aaf..1bd4d5b 100644
--- a/src/PNaClTranslator.h
+++ b/src/PNaClTranslator.h
@@ -30,8 +30,7 @@
   PNaClTranslator &operator=(const PNaClTranslator &) = delete;
 
 public:
-  PNaClTranslator(GlobalContext *Ctx, const ClFlags &Flags)
-      : Translator(Ctx, Flags) {}
+  explicit PNaClTranslator(GlobalContext *Ctx) : Translator(Ctx) {}
 
   // Reads the PNaCl bitcode file and translates to ICE, which is then
   // converted to machine code. Sets ErrorStatus to 1 if any errors
diff --git a/src/assembler.h b/src/assembler.h
index 40f50fd..dfd8cd1 100644
--- a/src/assembler.h
+++ b/src/assembler.h
@@ -149,7 +149,7 @@
   Assembler &operator=(const Assembler &) = delete;
 
 public:
-  Assembler() : buffer_(*this) {}
+  Assembler() : FunctionName(""), IsInternal(false), buffer_(*this) {}
   virtual ~Assembler() {}
 
   // Allocate a chunk of bytes using the per-Assembler allocator.
@@ -190,9 +190,18 @@
   }
 
   void emitIASBytes(GlobalContext *Ctx) const;
+  bool getInternal() const { return IsInternal; }
+  void setInternal(bool Internal) { IsInternal = Internal; }
+  const IceString &getFunctionName() const { return FunctionName; }
+  void setFunctionName(const IceString &NewName) { FunctionName = NewName; }
 
 private:
   ArenaAllocator<32 * 1024> Allocator;
+  // FunctionName and IsInternal are transferred from the original Cfg
+  // object, since the Cfg object may be deleted by the time the
+  // assembler buffer is emitted.
+  IceString FunctionName;
+  bool IsInternal;
 
 protected:
   AssemblerBuffer buffer_;
diff --git a/src/llvm2ice.cpp b/src/llvm2ice.cpp
index f83b253..935cb64 100644
--- a/src/llvm2ice.cpp
+++ b/src/llvm2ice.cpp
@@ -374,7 +374,7 @@
   std::unique_ptr<Ice::Translator> Translator;
   if (BuildOnRead) {
     std::unique_ptr<Ice::PNaClTranslator> PTranslator(
-        new Ice::PNaClTranslator(&Ctx, Flags));
+        new Ice::PNaClTranslator(&Ctx));
     PTranslator->translate(IRFilename);
     Translator.reset(PTranslator.release());
   } else if (ALLOW_LLVM_IR) {
@@ -390,8 +390,7 @@
       return GetReturnValue(Ice::EC_Bitcode);
     }
 
-    std::unique_ptr<Ice::Converter> Converter(
-        new Ice::Converter(Mod, &Ctx, Flags));
+    std::unique_ptr<Ice::Converter> Converter(new Ice::Converter(Mod, &Ctx));
     Converter->convertToIce();
     Translator.reset(Converter.release());
   } else {