trace_processor: make args table a lot more efficient

Before, we were doing a full table scan every time we joined the
counters (or any future table) with the args table. Instead, since we
have the multimap of id -> row index, we can do an O(log n) lookup to
find the rows associated with the id (if they exist).

This makes a full inner join a lot faster:
Old code on a 6MB trace: 436s
New code on a 6MB trace: 300ms

Change-Id: Ie5beebd49142759854f0489ee719c369a24de295
diff --git a/src/trace_processor/trace_storage.h b/src/trace_processor/trace_storage.h
index c69636e..9d5f384 100644
--- a/src/trace_processor/trace_storage.h
+++ b/src/trace_processor/trace_storage.h
@@ -126,6 +126,9 @@
     const std::deque<StringId>& flat_keys() const { return flat_keys_; }
     const std::deque<StringId>& keys() const { return keys_; }
     const std::deque<Varardic>& arg_values() const { return arg_values_; }
+    const std::multimap<RowId, uint32_t>& args_for_id() const {
+      return args_for_id_;  // id -> row index of each arg added for it
+    }
     size_t args_count() const { return ids_.size(); }
 
     void AddArg(RowId id, StringId flat_key, StringId key, int64_t value) {
@@ -136,7 +139,7 @@
       flat_keys_.emplace_back(flat_key);
       keys_.emplace_back(key);
       arg_values_.emplace_back(value);
-      args_for_id_.emplace(id, args_count() - 1);
+      args_for_id_.emplace(id, static_cast<uint32_t>(args_count() - 1));
     }
 
    private:
@@ -144,7 +147,7 @@
     std::deque<StringId> flat_keys_;
     std::deque<StringId> keys_;
     std::deque<Varardic> arg_values_;
-    std::multimap<RowId, size_t> args_for_id_;
+    std::multimap<RowId, uint32_t> args_for_id_;
   };
 
   class Slices {