TraceProcessor: add ninja log parsing

Adds a parser for ninja build logs (.ninja_log), the moral equivalent
of https://github.com/nico/ninjatracing. This works only as long as
ninja hasn't recompacted the log: after recompaction it becomes
impossible to reconstruct the boundaries of each build invocation.
I eyeballed the output against ninjatracing on the same trace and the
results look consistent.
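
For reference, a v5 .ninja_log line is tab-separated and, as documented
in the parser comments, looks roughly like this:

  start_ms  end_ms  restat_mtime  output_path           cmd_hash
  4         12      1579224178    ui/assets/modal.scss  832a958a9e234dfa

Each build invocation maps to a synthesized process named "Build" and
each inferred stream of parallel jobs to a thread named "Worker N", so
the imported data can be inspected with queries such as:

  select count(*) from process where name like 'build';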

Bug: 149483774
Test: TraceProcessorIntegrationTest.NinjaLog
Change-Id: Ice9ac9d9fee6a94d78fb0fec997b3b3d36d3c02b
diff --git a/Android.bp b/Android.bp
index 5c70f53..1648f1c 100644
--- a/Android.bp
+++ b/Android.bp
@@ -6298,6 +6298,7 @@
     "src/trace_processor/importers/fuchsia/fuchsia_trace_parser.cc",
     "src/trace_processor/importers/fuchsia/fuchsia_trace_tokenizer.cc",
     "src/trace_processor/importers/fuchsia/fuchsia_trace_utils.cc",
+    "src/trace_processor/importers/ninja/ninja_log_parser.cc",
     "src/trace_processor/importers/proto/args_table_utils.cc",
     "src/trace_processor/importers/proto/packet_sequence_state.cc",
     "src/trace_processor/importers/proto/proto_importer_module.cc",
diff --git a/BUILD b/BUILD
index bce6463..f80d096 100644
--- a/BUILD
+++ b/BUILD
@@ -955,6 +955,8 @@
         "src/trace_processor/importers/json/json_trace_utils.h",
         "src/trace_processor/importers/json/json_tracker.cc",
         "src/trace_processor/importers/json/json_tracker.h",
+        "src/trace_processor/importers/ninja/ninja_log_parser.cc",
+        "src/trace_processor/importers/ninja/ninja_log_parser.h",
         "src/trace_processor/importers/proto/args_table_utils.cc",
         "src/trace_processor/importers/proto/args_table_utils.h",
         "src/trace_processor/importers/proto/chrome_compositor_scheduler_state.descriptor.h",
diff --git a/src/trace_processor/BUILD.gn b/src/trace_processor/BUILD.gn
index 3e70300..db7253e 100644
--- a/src/trace_processor/BUILD.gn
+++ b/src/trace_processor/BUILD.gn
@@ -93,6 +93,8 @@
     "importers/ftrace/ftrace_module.cc",
     "importers/ftrace/ftrace_module.h",
     "importers/fuchsia/fuchsia_record.h",
+    "importers/ninja/ninja_log_parser.cc",
+    "importers/ninja/ninja_log_parser.h",
     "importers/proto/args_table_utils.cc",
     "importers/proto/args_table_utils.h",
     "importers/proto/chrome_compositor_scheduler_state.descriptor.h",
diff --git a/src/trace_processor/forwarding_trace_parser.cc b/src/trace_processor/forwarding_trace_parser.cc
index e61ce2d..0e022b3 100644
--- a/src/trace_processor/forwarding_trace_parser.cc
+++ b/src/trace_processor/forwarding_trace_parser.cc
@@ -19,6 +19,7 @@
 #include "perfetto/base/logging.h"
 #include "perfetto/ext/base/string_utils.h"
 #include "src/trace_processor/gzip_trace_parser.h"
+#include "src/trace_processor/importers/ninja/ninja_log_parser.h"
 #include "src/trace_processor/importers/proto/proto_trace_parser.h"
 #include "src/trace_processor/importers/proto/proto_trace_tokenizer.h"
 #include "src/trace_processor/trace_sorter.h"
@@ -99,6 +100,11 @@
         context_->parser.reset(new ProtoTraceParser(context_));
         break;
       }
+      case kNinjaLogTraceType: {
+        PERFETTO_DLOG("Ninja log detected");
+        reader_.reset(new NinjaLogParser(context_));
+        break;
+      }
       case kFuchsiaTraceType: {
 #if PERFETTO_BUILDFLAG(PERFETTO_TP_FUCHSIA)
         PERFETTO_DLOG("Fuchsia trace detected");
@@ -176,6 +182,10 @@
   if (start.find("TRACE:") != std::string::npos)
     return kCtraceTraceType;
 
+  // Ninja's build log (.ninja_log).
+  if (base::StartsWith(start, "# ninja log"))
+    return kNinjaLogTraceType;
+
   // Systrace with no header or leading HTML.
   if (base::StartsWith(start, " "))
     return kSystraceTraceType;
diff --git a/src/trace_processor/forwarding_trace_parser.h b/src/trace_processor/forwarding_trace_parser.h
index 7d25e19..14e9712 100644
--- a/src/trace_processor/forwarding_trace_parser.h
+++ b/src/trace_processor/forwarding_trace_parser.h
@@ -32,6 +32,7 @@
   kSystraceTraceType,
   kGzipTraceType,
   kCtraceTraceType,
+  kNinjaLogTraceType,
 };
 
 TraceType GuessTraceType(const uint8_t* data, size_t size);
diff --git a/src/trace_processor/forwarding_trace_parser_unittest.cc b/src/trace_processor/forwarding_trace_parser_unittest.cc
index c812f90..78eb404 100644
--- a/src/trace_processor/forwarding_trace_parser_unittest.cc
+++ b/src/trace_processor/forwarding_trace_parser_unittest.cc
@@ -32,6 +32,11 @@
   EXPECT_EQ(kJsonTraceType, GuessTraceType(prefix, sizeof(prefix)));
 }
 
+TEST(TraceProcessorImplTest, GuessTraceType_Ninja) {
+  const uint8_t prefix[] = "# ninja log v5\n";
+  EXPECT_EQ(kNinjaLogTraceType, GuessTraceType(prefix, sizeof(prefix)));
+}
+
 TEST(TraceProcessorImplTest, GuessTraceType_JsonWithSpaces) {
   const uint8_t prefix[] = "\n{ \"traceEvents\": [";
   EXPECT_EQ(kJsonTraceType, GuessTraceType(prefix, sizeof(prefix)));
diff --git a/src/trace_processor/importers/ninja/ninja_log_parser.cc b/src/trace_processor/importers/ninja/ninja_log_parser.cc
new file mode 100644
index 0000000..3c1f7ec
--- /dev/null
+++ b/src/trace_processor/importers/ninja/ninja_log_parser.cc
@@ -0,0 +1,189 @@
+/*
+ * Copyright (C) 2020 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/trace_processor/importers/ninja/ninja_log_parser.h"
+#include "perfetto/ext/base/string_splitter.h"
+#include "perfetto/ext/base/string_utils.h"
+#include "src/trace_processor/process_tracker.h"
+#include "src/trace_processor/slice_tracker.h"
+#include "src/trace_processor/storage/trace_storage.h"
+#include "src/trace_processor/trace_sorter.h"
+#include "src/trace_processor/track_tracker.h"
+
+namespace perfetto {
+namespace trace_processor {
+
+using base::StringSplitter;
+
+NinjaLogParser::NinjaLogParser(TraceProcessorContext* ctx) : ctx_(ctx) {}
+NinjaLogParser::~NinjaLogParser() = default;
+
+util::Status NinjaLogParser::Parse(std::unique_ptr<uint8_t[]> buf, size_t len) {
+  // A trace is read in chunks of arbitrary size (for http fetch() pipelining),
+  // not necessarily aligned on a line boundary.
+  // Here we push everything into a vector and, on each call, consume only
+  // the leading part until the last \n, keeping the rest for the next call.
+  const char* src = reinterpret_cast<const char*>(&buf[0]);
+  log_.insert(log_.end(), src, src + len);
+
+  // Find the last \n.
+  size_t valid_size = log_.size();
+  for (; valid_size > 0 && log_[valid_size - 1] != '\n'; --valid_size) {
+  }
+
+  for (StringSplitter line(log_.data(), valid_size, '\n'); line.Next();) {
+    static const char kHeader[] = "# ninja log v";
+    if (!header_parsed_) {
+      if (!base::StartsWith(line.cur_token(), kHeader))
+        return util::ErrStatus("Failed to parse ninja log header");
+      header_parsed_ = true;
+      auto version = base::CStringToUInt32(line.cur_token() + strlen(kHeader));
+      if (!version || *version != 5)
+        return util::ErrStatus("Unsupported ninja log version");
+      continue;
+    }
+
+    // Each line in the ninja log looks like this:
+    // 4 12  1579224178  ui/assets/modal.scss  832a958a9e234dfa
+    // Where:
+    // - [4, 12] are the timestamps in ms of [start, end] of the job, measured
+    //     from the beginning of the build.
+    // - 1579224178 is the "restat" mtime (ignored).
+    // - ui/assets/modal.scss is the name of the output file being built.
+    // - 832a958a9e234dfa is a hash of the compiler invocation.
+    // In most cases, each hash should be unique per ninja invocation (because
+    // two rules shouldn't generate the same output). However, in rare
+    // circumstances the same hash can show up more than once. Examples:
+    // - A GN action generates > 1 output per invocation (e.g., protos). In this
+    //   case all items share the same [start, end] timestamps and we want to
+    //   merge all the output names into one build step, because from the build
+    //   system's viewpoint it was a single compiler/tool invocation.
+    // - A subtle script that generates different outputs without taking a
+    //   --output=filename argument (e.g. via env vars or similar). Note that
+    //   this happens in the perfetto codebase itself (goto.google.com/nigew).
+    //   In this case we want to treat the two entries as two distinct jobs.
+    //
+    // In summary, the deduping logic is: if both the hash and the timestamps
+    // match, merge the entries; otherwise keep them distinct.
+    StringSplitter tok(&line, '\t');
+    auto t_start = base::CStringToInt64(tok.Next() ? tok.cur_token() : "");
+    auto t_end = base::CStringToInt64(tok.Next() ? tok.cur_token() : "");
+    tok.Next();  // Ignore restat.
+    const char* name = tok.Next() ? tok.cur_token() : nullptr;
+    auto cmdhash = base::CStringToUInt64(tok.Next() ? tok.cur_token() : "", 16);
+
+    if (!t_start || !t_end || !name || !cmdhash) {
+      ctx_->storage->IncrementStats(stats::ninja_parse_errors);
+      continue;
+    }
+
+    // The same log file can contain timestamps for different builds. The only
+    // way we can tell when a new build starts is by detecting the end timestamp
+    // breaking monotonicity.
+    if (last_end_seen_ == 0 || *t_end < last_end_seen_) {
+      // Create a new "process" for each build. In the UI this causes each build
+      // to be nested under a track group. |cur_build_id_| is the fake pid
+      // of the synthesized process.
+      ++cur_build_id_;
+      StringId name_id = ctx_->storage->InternString("Build");
+      ctx_->process_tracker->SetProcessNameIfUnset(
+          ctx_->process_tracker->GetOrCreateProcess(cur_build_id_), name_id);
+    }
+    last_end_seen_ = *t_end;
+
+    // If the same hash shows up in consecutive entries with the same timestamps,
+    // merge them, as they identify multiple outputs of the same build rule.
+    if (!jobs_.empty() && *cmdhash == jobs_.back().hash &&
+        *t_start == jobs_.back().start_ms && *t_end == jobs_.back().end_ms) {
+      jobs_.back().names.append(" ");
+      jobs_.back().names.append(name);
+      continue;
+    }
+
+    jobs_.emplace_back(cur_build_id_, *t_start, *t_end, *cmdhash, name);
+  }
+  log_.erase(log_.begin(), log_.begin() + static_cast<ssize_t>(valid_size));
+  return util::OkStatus();
+}
+
+// This is called after the last Parse() call. At this point all |jobs_| have
+// been populated.
+void NinjaLogParser::NotifyEndOfFile() {
+  std::sort(jobs_.begin(), jobs_.end(),
+            [](const Job& x, const Job& y) { return x.start_ms < y.start_ms; });
+
+  // Now we need to work out the job parallelism. There's no direct indication
+  // of that in the ninja logs, so it must be inferred from overlapping
+  // timestamps. In this context a "Worker" is an inferred sequence of jobs
+  // that happened concurrently with other sequences.
+  // Here we pack jobs according to the following heuristic, to make the graph
+  // nicer for humans to read. Consider the initial situation:
+  // 1: [  job 1 ]
+  // 2:   [   job 2   ]
+  // 3: [   job 3   ]
+  //    T=0              | T=6
+  // Assume that a new job (job 4) starts at T=6. It's very likely that job 4
+  // was started as a consequence of job 2's completion (otherwise it could have
+  // started earlier, soon after job 1 or job 3). It seems to make more sense to
+  // draw it next in the 2nd worker, i.e. next to job 2.
+  struct Worker {
+    int64_t busy_until;
+    TrackId track_id;
+  };
+  std::map<uint32_t /*build_id*/, std::vector<Worker>> workers_by_build;
+
+  // Assign thread ids to workers without conflicting with builds' process ids
+  // (to avoid main-thread auto-mapping).
+  uint32_t last_worker_id = cur_build_id_;
+
+  for (const auto& job : jobs_) {
+    Worker* worker = nullptr;
+    auto& workers = workers_by_build[job.build_id];
+    for (Worker& cur : workers) {
+      // Pick the worker which has the greatest end time (busy_until) <= the
+      // job's start time.
+      if (cur.busy_until <= job.start_ms) {
+        if (!worker || cur.busy_until > worker->busy_until)
+          worker = &cur;
+      }
+    }
+    if (worker) {
+      // Update the worker's end time with the newly assigned job.
+      worker->busy_until = job.end_ms;
+    } else {
+      // All workers are busy, allocate a new one.
+      uint32_t worker_id = ++last_worker_id;
+      char name[32];
+      snprintf(name, sizeof(name), "Worker %zu", workers.size() + 1);
+      StringId name_id = ctx_->storage->InternString(name);
+      auto utid = ctx_->process_tracker->UpdateThread(worker_id, job.build_id);
+      ctx_->process_tracker->SetThreadNameIfUnset(utid, name_id);
+      TrackId track_id = ctx_->track_tracker->InternThreadTrack(utid);
+      workers.emplace_back(Worker{/*busy_until=*/job.end_ms, track_id});
+      worker = &workers.back();
+    }
+
+    static constexpr int64_t kMsToNs = 1000 * 1000;
+    const int64_t start_ns = job.start_ms * kMsToNs;
+    const int64_t dur_ns = (job.end_ms - job.start_ms) * kMsToNs;
+    StringId name_id = ctx_->storage->InternString(base::StringView(job.names));
+    ctx_->slice_tracker->Scoped(start_ns, worker->track_id, StringId::Null(),
+                                name_id, dur_ns);
+  }
+}
+
+}  // namespace trace_processor
+}  // namespace perfetto
diff --git a/src/trace_processor/importers/ninja/ninja_log_parser.h b/src/trace_processor/importers/ninja/ninja_log_parser.h
new file mode 100644
index 0000000..280fa5a
--- /dev/null
+++ b/src/trace_processor/importers/ninja/ninja_log_parser.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (C) 2020 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SRC_TRACE_PROCESSOR_IMPORTERS_NINJA_NINJA_LOG_PARSER_H_
+#define SRC_TRACE_PROCESSOR_IMPORTERS_NINJA_NINJA_LOG_PARSER_H_
+
+#include <stdint.h>
+
+#include <map>
+#include <string>
+
+#include "src/trace_processor/chunked_trace_reader.h"
+#include "src/trace_processor/trace_parser.h"
+
+namespace perfetto {
+namespace trace_processor {
+
+class TraceProcessorContext;
+
+// This class parses Ninja's (the build system, ninja-build.org) build logs and
+// turns them into traces. A ninja log typically contains the logs of >1 ninja
+// invocation. We map those as follows:
+// - For each ninja invocation we create one process in the trace (from the UI
+//   perspective a process is a group of tracks).
+// - Within each invocation we work out the parallelism from the timestamps and
+//   create one thread for each concurrent stream of jobs.
+// Caveat: this works only if ninja didn't recompact the logs. Once recompaction
+// happens (can be forced via ninja -t recompact) there is no way to identify
+// the boundaries of each build (recompaction deletes, for each hash, all but
+// the most recent timestamp and rewrites the log).
+class NinjaLogParser : public ChunkedTraceReader {
+ public:
+  explicit NinjaLogParser(TraceProcessorContext*);
+  ~NinjaLogParser() override;
+  NinjaLogParser(const NinjaLogParser&) = delete;
+  NinjaLogParser& operator=(const NinjaLogParser&) = delete;
+
+  // ChunkedTraceReader implementation
+  util::Status Parse(std::unique_ptr<uint8_t[]>, size_t) override;
+  void NotifyEndOfFile() override;
+
+ private:
+  struct Job {
+    Job(uint32_t b, int64_t s, int64_t e, uint64_t h, const std::string& n)
+        : build_id(b), start_ms(s), end_ms(e), hash(h), names(n) {}
+
+    uint32_t build_id;  // The synthesized PID of the build "process".
+    int64_t start_ms;
+    int64_t end_ms;
+    uint64_t hash;  // Hash of the compiler invocation cmdline.
+
+    // Typically the single output of the compiler invocation. For actions that
+    // generate multiple outputs this contains all output names, space-joined.
+    std::string names;
+  };
+
+  TraceProcessorContext* const ctx_;
+  bool header_parsed_ = false;
+  int64_t last_end_seen_ = 0;
+  uint32_t cur_build_id_ = 0;
+  std::vector<Job> jobs_;
+  std::vector<char> log_;
+};
+
+}  // namespace trace_processor
+}  // namespace perfetto
+
+#endif  // SRC_TRACE_PROCESSOR_IMPORTERS_NINJA_NINJA_LOG_PARSER_H_
diff --git a/src/trace_processor/storage/stats.h b/src/trace_processor/storage/stats.h
index 96ed568..0f82707 100644
--- a/src/trace_processor/storage/stats.h
+++ b/src/trace_processor/storage/stats.h
@@ -130,7 +130,8 @@
   F(compact_sched_switch_skipped,             kSingle,  kInfo,     kAnalysis), \
   F(compact_sched_waking_skipped,             kSingle,  kInfo,     kAnalysis), \
   F(empty_chrome_metadata,                    kSingle,  kError,    kTrace),    \
-  F(perf_cpu_lost_records,                    kIndexed, kDataLoss, kTrace)
+  F(perf_cpu_lost_records,                    kIndexed, kDataLoss, kTrace),    \
+  F(ninja_parse_errors,                       kSingle,  kError,    kTrace)
 // clang-format on
 
 enum Type {
diff --git a/src/trace_processor/trace_database_integrationtest.cc b/src/trace_processor/trace_database_integrationtest.cc
index 6d587df..3bce734 100644
--- a/src/trace_processor/trace_database_integrationtest.cc
+++ b/src/trace_processor/trace_database_integrationtest.cc
@@ -242,6 +242,42 @@
   }
 }
 
+// This test checks that a ninja trace is tokenized properly even if read in
+// small chunks of 1KB each. The values used in the test have been cross-checked
+// by opening the same trace with ninjatracing + chrome://tracing.
+TEST_F(TraceProcessorIntegrationTest, NinjaLog) {
+  ASSERT_TRUE(LoadTrace("ninja_log", 1024).ok());
+  auto it = Query("select count(*) from process where name like 'build';");
+  ASSERT_TRUE(it.Next());
+  ASSERT_EQ(it.Get(0).long_value, 2);
+
+  it = Query(
+      "select count(*) from thread left join process using(upid) where "
+      "thread.name like 'worker%' and process.pid=1");
+  ASSERT_TRUE(it.Next());
+  ASSERT_EQ(it.Get(0).long_value, 14);
+
+  it = Query(
+      "create view slices_1st_build as select slices.* from slices left "
+      "join thread_track on(slices.track_id == thread_track.id) left join "
+      "thread using(utid) left join process using(upid) where pid=2");
+  it.Next();
+  ASSERT_TRUE(it.Status().ok());
+
+  it = Query("select (max(ts) - min(ts)) / 1000000 from slices_1st_build");
+  ASSERT_TRUE(it.Next());
+  ASSERT_EQ(it.Get(0).long_value, 12612);
+
+  it = Query("select name from slices_1st_build order by ts desc limit 1");
+  ASSERT_TRUE(it.Next());
+  ASSERT_STREQ(it.Get(0).string_value,
+               "obj/src/trace_processor/unittests.trace_sorter_unittest.o");
+
+  it = Query("select sum(dur) / 1000000 from slices_1st_build");
+  ASSERT_TRUE(it.Next());
+  ASSERT_EQ(it.Get(0).long_value, 276174);
+}
+
 }  // namespace
 }  // namespace trace_processor
 }  // namespace perfetto
diff --git a/tools/install-build-deps b/tools/install-build-deps
index ff99949..70ca67d 100755
--- a/tools/install-build-deps
+++ b/tools/install-build-deps
@@ -146,8 +146,8 @@
     # Example traces for regression tests.
     (
         'buildtools/test_data.zip',
-        'https://storage.googleapis.com/perfetto/test-data-20200122-100845.zip',
-        '56ac45b5239fda50d33cf05bfd329dcb3efb0b2a',
+        'https://storage.googleapis.com/perfetto/test-data-20200213-221940.zip',
+        '347f825269aaa9a61c9f746febee14a451391389',
         'all',
     ),