Use atomic integer for compiler driver work balancing.

Previously, work was divided statically: the total number of items was
split evenly across the threads, each of which walked its own fixed
stripe of the range. This balanced work poorly, since some threads
could finish much earlier than others. The new method hands out work
through a shared atomic integer: each thread claims the next index with
an atomic fetch-and-add until the range is exhausted. As a result, a
thread can process at most one more item after the other worker threads
are finished (see the sketch below).
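
For illustration, a minimal standalone sketch of the claiming loop,
using std::atomic<size_t> in place of ART's AtomicInteger (the
next_index/RunWorker/callback names are hypothetical, modeled on the
ForAllClosure change in the diff below):

    #include <atomic>
    #include <cstddef>

    // Shared counter from which every worker claims its next item.
    std::atomic<size_t> next_index{0};

    void RunWorker(size_t end, void (*callback)(size_t)) {
      while (true) {
        // fetch_add returns the previous value, so each index is
        // handed out exactly once across all threads.
        const size_t i = next_index.fetch_add(1);
        if (i >= end) {
          break;  // Range exhausted; claim no further work.
        }
        callback(i);
      }
    }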

Also changed the thread count to account for the main thread doing work
as well. This means we subtract one from the requested thread count
when creating the thread pool.
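
Continuing the sketch above (again hypothetical, not the ART ThreadPool
API): spawning thread_count - 1 workers and running the same loop on
the calling thread puts thread_count threads to work in total.

    #include <thread>
    #include <vector>

    void ForAll(size_t begin, size_t end, void (*callback)(size_t),
                size_t thread_count) {
      next_index.store(begin);
      std::vector<std::thread> workers;
      // One fewer pool thread, because the main thread works too.
      for (size_t i = 0; i + 1 < thread_count; ++i) {
        workers.emplace_back(RunWorker, end, callback);
      }
      RunWorker(end, callback);  // Main thread processes items as well.
      for (std::thread& t : workers) {
        t.join();
      }
    }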

Change-Id: I0147b0403c6214800ed6bfcdac4f1e5486330996
diff --git a/compiler/driver/compiler_driver.cc b/compiler/driver/compiler_driver.cc
index 99f8fb7..634d3bc 100644
--- a/compiler/driver/compiler_driver.cc
+++ b/compiler/driver/compiler_driver.cc
@@ -473,7 +473,7 @@
                                 const std::vector<const DexFile*>& dex_files,
                                 base::TimingLogger& timings) {
   DCHECK(!Runtime::Current()->IsStarted());
-  UniquePtr<ThreadPool> thread_pool(new ThreadPool(thread_count_));
+  UniquePtr<ThreadPool> thread_pool(new ThreadPool(thread_count_ - 1));
   PreCompile(class_loader, dex_files, *thread_pool.get(), timings);
   Compile(class_loader, dex_files, *thread_pool.get(), timings);
   if (dump_stats_) {
@@ -537,7 +537,7 @@
   std::vector<const DexFile*> dex_files;
   dex_files.push_back(dex_file);
 
-  UniquePtr<ThreadPool> thread_pool(new ThreadPool(1U));
+  UniquePtr<ThreadPool> thread_pool(new ThreadPool(0U));
   PreCompile(jclass_loader, dex_files, *thread_pool.get(), timings);
 
   uint32_t method_idx = method->GetDexMethodIndex();
@@ -1322,7 +1322,8 @@
                              CompilerDriver* compiler,
                              const DexFile* dex_file,
                              ThreadPool& thread_pool)
-    : class_linker_(class_linker),
+    : index_(0),
+      class_linker_(class_linker),
       class_loader_(class_loader),
       compiler_(compiler),
       dex_file_(dex_file),
@@ -1353,8 +1354,9 @@
     CHECK_GT(work_units, 0U);
 
     std::vector<ForAllClosure*> closures(work_units);
+    index_ = begin;
     for (size_t i = 0; i < work_units; ++i) {
-      closures[i] = new ForAllClosure(this, begin + i, end, callback, work_units);
+      closures[i] = new ForAllClosure(this, end, callback);
       thread_pool_->AddTask(self, closures[i]);
     }
     thread_pool_->StartWorkers(self);
@@ -1367,20 +1369,25 @@
     thread_pool_->Wait(self, true, false);
   }
 
+  size_t NextIndex() {
+    return index_.fetch_add(1);
+  }
+
  private:
   class ForAllClosure : public Task {
    public:
-    ForAllClosure(ParallelCompilationManager* manager, size_t begin, size_t end, Callback* callback,
-                  size_t stripe)
+    ForAllClosure(ParallelCompilationManager* manager, size_t end, Callback* callback)
         : manager_(manager),
-          begin_(begin),
           end_(end),
-          callback_(callback),
-          stripe_(stripe) {}
+          callback_(callback) {}
 
     virtual void Run(Thread* self) {
-      for (size_t i = begin_; i < end_; i += stripe_) {
-        callback_(manager_, i);
+      while (true) {
+        const size_t index = manager_->NextIndex();
+        if (UNLIKELY(index >= end_)) {
+          break;
+        }
+        callback_(manager_, index);
         self->AssertNoPendingException();
       }
     }
@@ -1390,18 +1397,19 @@
     }
 
    private:
-    const ParallelCompilationManager* const manager_;
-    const size_t begin_;
+    ParallelCompilationManager* const manager_;
     const size_t end_;
     const Callback* const callback_;
-    const size_t stripe_;
   };
 
+  AtomicInteger index_;
   ClassLinker* const class_linker_;
   const jobject class_loader_;
   CompilerDriver* const compiler_;
   const DexFile* const dex_file_;
   ThreadPool* const thread_pool_;
+
+  DISALLOW_COPY_AND_ASSIGN(ParallelCompilationManager);
 };
 
 // Return true if the class should be skipped during compilation. We