Collect and report kernel warnings.

A flex-based daemon collects kernel warnings by parsing /var/log/messages
as lines are added to it.  For every warning the daemon sends a sample
to a sparse UMA histogram using a hash of the warning as the bucket.
Then, if the warning hasn't been seen before, the daemon invokes the
crash collector to upload the warning stack trace.

BUG=chromium:217382
TEST=manually tested (for now), automated test on its way
CQ-DEPEND=Ic8d5773d05d717a275c4a4b5616e0e4c307337b8
CQ-DEPEND=I6a4010acad0ffe20c702bb0fc455e3da7cdf3ac1

Change-Id: I89090e5c2b61ec46b4e740f0895c591728d70e77
Reviewed-on: https://gerrit.chromium.org/gerrit/48277
Commit-Queue: Luigi Semenzato <semenzato@chromium.org>
Reviewed-by: Luigi Semenzato <semenzato@chromium.org>
Tested-by: Luigi Semenzato <semenzato@chromium.org>
diff --git a/crash_reporter/warn_collector.l b/crash_reporter/warn_collector.l
new file mode 100644
index 0000000..d0977dc
--- /dev/null
+++ b/crash_reporter/warn_collector.l
@@ -0,0 +1,300 @@
+/* Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ *
+ * This flex program reads /var/log/messages as it grows and saves kernel
+ * warnings to files.  It keeps track of warnings it has seen (based on
+ * file/line only, ignoring differences in the stack trace), and reports only
+ * the first warning of each kind, but maintains a count of all warnings by
+ * using their hashes as buckets in a UMA sparse histogram.  It also invokes
+ * the crash collector, which collects the warnings and prepares them for later
+ * shipment to the crash server.
+ */
+
+%{
+#include <fcntl.h>
+#include <inttypes.h>
+#include <pwd.h>
+#include <stdarg.h>
+#include <sys/inotify.h>
+#include <sys/select.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "metrics/c_metrics_library.h"
+
+int WarnStart(void);
+void WarnEnd(void);
+void WarnInput(char *buf, int *result, size_t max_size);
+
+#define YY_INPUT(buf, result, max_size) WarnInput(buf, &result, max_size)
+
+%}
+
+/* Define a few useful regular expressions. */
+
+D               [0-9]
+PREFIX          .*" kernel: [ "*{D}+"."{D}+"]"
+CUT_HERE        {PREFIX}" ------------[ cut here".*
+WARNING         {PREFIX}" WARNING: at "
+END_TRACE       {PREFIX}" ---[ end trace".*
+
+/* Use exclusive start conditions. */
+%x PRE_WARN WARN
+
+%%
+ /* The scanner itself. */
+
+^{CUT_HERE}\n{WARNING}          BEGIN(PRE_WARN);
+.|\n                            /* ignore all other input in state 0 */
+<PRE_WARN>.*\n                  if (WarnStart()) {
+                                  BEGIN(WARN); ECHO;
+                                } else {
+                                  BEGIN(0);
+                                }
+
+ /* Assume the warning ends at the "end trace" line */
+<WARN>^{END_TRACE}\n            ECHO; BEGIN(0); WarnEnd();
+<WARN>^.*\n                     ECHO;
+
+%%
+
+#define HASH_BITMAP_SIZE        (1 << 15)  /* size in bits */
+#define HASH_BITMAP_MASK        (HASH_BITMAP_SIZE - 1)
+
+const char warn_hist_name[] = "Platform.KernelWarningHashes";
+uint32_t hash_bitmap[HASH_BITMAP_SIZE / 32];
+CMetricsLibrary metrics_library;
+
+const char *prog_name;          /* the name of this program */
+int yyin_fd;                    /* instead of FILE *yyin to avoid buffering */
+int i_fd;                       /* for inotify, to detect file changes */
+int testing;                    /* 1 if running test */
+int filter;                     /* 1 when using as filter (for development) */
+int fifo;                       /* 1 when reading from fifo (for devel) */
+int draining;                   /* 1 when draining renamed log file */
+
+const char *msg_path = "/var/log/messages";
+const char warn_dump_dir[]  = "/var/run/kwarn";
+const char *warn_dump_path = "/var/run/kwarn/warning";
+const char *crash_reporter_command;
+
+static void Die(const char *format, ...) {
+  va_list ap;
+  va_start(ap, format);
+  fprintf(stderr, "%s: ", prog_name);
+  vfprintf(stderr, format, ap);
+  exit(1);
+}
+
+static void RunCrashReporter(void) {
+  int status = system(crash_reporter_command);
+  if (status != 0)
+    Die("%s exited with status %d\n", crash_reporter_command, status);
+}
+
+static uint32_t StringHash(const char *string) {
+  uint32_t hash = 0;
+  while (*string != '\0') {
+    hash = (hash << 5) + hash + *string++;
+  }
+  return hash;
+}
+
+/* We expect only a handful of different warnings per boot session, so the
+ * probability of a collision is very low, and statistically it won't matter
+ * (unless warnings with the same hash also happens in tandem, which is even
+ * rarer).
+ */
+static int HashSeen(uint32_t hash) {
+  int word_index = (hash & HASH_BITMAP_MASK) / 32;
+  int bit_index = (hash & HASH_BITMAP_MASK) % 32;
+  return hash_bitmap[word_index] & 1 << bit_index;
+}
+
+static void SetHashSeen(uint32_t hash) {
+  int word_index = (hash & HASH_BITMAP_MASK) / 32;
+  int bit_index = (hash & HASH_BITMAP_MASK) % 32;
+  hash_bitmap[word_index] |= 1 << bit_index;
+}
+
+int WarnStart(void) {
+  uint32_t hash;
+
+  if (filter)
+    return 1;
+
+  hash = StringHash(yytext);
+  if (!(testing || fifo || filter)) {
+    CMetricsLibrarySendSparseToUMA(metrics_library, warn_hist_name, (int) hash);
+  }
+  if (HashSeen(hash))
+    return 0;
+  SetHashSeen(hash);
+
+  yyout = fopen(warn_dump_path, "w");
+  if (yyout == NULL)
+    Die("fopen %s failed: %s\n", warn_dump_path, strerror(errno));
+  fprintf(yyout, "%08x\n", hash);
+  return 1;
+}
+
+void WarnEnd(void) {
+  if (filter)
+    return;
+  fclose(yyout);
+  yyout = stdout;               /* for debugging */
+  RunCrashReporter();
+}
+
+static void WarnOpenInput(const char *path) {
+  yyin_fd = open(path, O_RDONLY);
+  if (yyin_fd < 0)
+    Die("could not open %s: %s\n", path, strerror(errno));
+  if (!fifo) {
+    /* Set up notification of file growth and rename. */
+    i_fd = inotify_init();
+    if (i_fd < 0)
+      Die("inotify_init: %s\n", strerror(errno));
+    if (inotify_add_watch(i_fd, path, IN_MODIFY | IN_MOVE_SELF) < 0)
+      Die("inotify_add_watch: %s\n", strerror(errno));
+  }
+}
+
+/* We replace the default YY_INPUT() for the following reasons:
+ *
+ * 1.  We want to read data as soon as it becomes available, but the default
+ * YY_INPUT() uses buffered I/O.
+ *
+ * 2.  We want to block on end of input and wait for the file to grow.
+ *
+ * 3.  We want to detect log rotation, and reopen the input file as needed.
+ */
+void WarnInput(char *buf, int *result, size_t max_size) {
+  while (1) {
+    *result = read(yyin_fd, buf, max_size);
+    if (*result < 0)
+      Die("read: %s", strerror(errno));
+    if (*result > 0 || fifo || filter)
+      return;
+    if (draining) {
+      /* Assume we're done with this log, and move to next
+       * log.  Rsyslogd may keep writing to the old log file
+       * for a while, but we don't care since we don't have
+       * to be exact.
+       */
+      close(yyin_fd);
+      if (YYSTATE == WARN) {
+        /* Be conservative in case we lose the warn
+         * terminator during the switch---or we may
+         * collect personally identifiable information.
+         */
+        WarnEnd();
+      }
+      BEGIN(0);        /* see above comment */
+      sleep(1);        /* avoid race with log rotator */
+      WarnOpenInput(msg_path);
+      draining = 0;
+      continue;
+    }
+    /* Nothing left to read, so we must wait. */
+    struct inotify_event event;
+    int n = read(i_fd, &event, sizeof(event));
+    if (n <= 0)
+      Die("inotify: %s\n", strerror(errno));
+    if (event.mask & IN_MOVE_SELF) {
+      /* The file has been renamed.  Before switching
+       * to the new one, we process any remaining
+       * content of this file.
+       */
+      draining = 1;
+    }
+  }
+}
+
+int main(int argc, char **argv) {
+  int result;
+  struct passwd *user;
+  prog_name = argv[0];
+
+  if (argc == 2 && strcmp(argv[1], "--test") == 0)
+    testing = 1;
+  else if (argc == 2 && strcmp(argv[1], "--filter") == 0)
+    filter = 1;
+  else if (argc == 2 && strcmp(argv[1], "--fifo") == 0) {
+    fifo = 1;
+  } else if (argc != 1) {
+    fprintf(stderr,
+            "usage: %s [single-flag]\n"
+            "flags (for testing only):\n"
+            "--fifo\tinput is fifo \"fifo\", output is stdout\n"
+            "--filter\tinput is stdin, output is stdout\n"
+            "--test\trun self-test\n",
+            prog_name);
+    exit(1);
+  }
+
+  metrics_library = CMetricsLibraryNew();
+  CMetricsLibraryInit(metrics_library);
+
+  crash_reporter_command = testing ?
+    "./warn_collector_test_reporter.sh" :
+    "/sbin/crash_reporter --kernel_warning";
+
+  /* When filtering with --filter (for development) use stdin for input.
+   * Otherwise read input from a file or a fifo.
+   */
+  yyin_fd = fileno(stdin);
+  if (testing) {
+    msg_path = "messages";
+    warn_dump_path = "warning";
+  }
+  if (fifo) {
+    msg_path = "fifo";
+  }
+  if (!filter) {
+    WarnOpenInput(msg_path);
+  }
+
+  /* Create directory for dump file.  Still need to be root here. */
+  unlink(warn_dump_path);
+  if (!testing && !fifo && !filter) {
+    rmdir(warn_dump_dir);
+    result = mkdir(warn_dump_dir, 0755);
+    if (result < 0)
+      Die("could not create %s: %s\n",
+          warn_dump_dir, strerror(errno));
+  }
+
+  if (0) {
+    /* TODO(semenzato): put this back in once we decide it's safe
+     * to make /var/spool/crash rwxrwxrwx root, or use a different
+     * owner and setuid for the crash reporter as well.
+     */
+
+    /* Get low privilege uid, gid. */
+    user = getpwnam("chronos");
+    if (user == NULL)
+      Die("getpwnam failed\n");
+
+    /* Change dump directory ownership. */
+    if (chown(warn_dump_dir, user->pw_uid, user->pw_gid) < 0)
+      Die("chown: %s\n", strerror(errno));
+
+    /* Drop privileges. */
+    if (setuid(user->pw_uid) < 0) {
+      Die("setuid: %s\n", strerror(errno));
+    }
+  }
+
+  /* Go! */
+  return yylex();
+}
+
+/* Flex should really know not to generate these functions.
+ */
+void UnusedFunctionWarningSuppressor(void) {
+  yyunput(0, 0);
+  (void) input();
+}