Custom wrappers for DFSanitizing sprintf & snprintf.

Differential Revision: http://reviews.llvm.org/D5561

llvm-svn: 219293
diff --git a/compiler-rt/lib/dfsan/dfsan_custom.cc b/compiler-rt/lib/dfsan/dfsan_custom.cc
index d06a003..ffd8b00 100644
--- a/compiler-rt/lib/dfsan/dfsan_custom.cc
+++ b/compiler-rt/lib/dfsan/dfsan_custom.cc
@@ -12,12 +12,14 @@
 // This file defines the custom functions listed in done_abilist.txt.
 //===----------------------------------------------------------------------===//
 
+#include "sanitizer_common/sanitizer_common.h"
 #include "sanitizer_common/sanitizer_internal_defs.h"
 #include "sanitizer_common/sanitizer_linux.h"
 
 #include "dfsan/dfsan.h"
 
 #include <arpa/inet.h>
+#include <assert.h>
 #include <ctype.h>
 #include <dlfcn.h>
 #include <link.h>
@@ -26,6 +28,8 @@
 #include <pwd.h>
 #include <sched.h>
 #include <signal.h>
+#include <stdarg.h>
+#include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -839,4 +843,281 @@
   *ret_label = 0;
   return write(fd, buf, count);
 }
+
+// int-typed alias handed to va_arg() whenever a dfsan_label argument is read.
+typedef int dfsan_label_va;
+
+// One piece of the formatted output: the bytes produced either by a run of
+// literal characters or by a single conversion directive of the format string.
+struct Chunk {
+  // Where this chunk starts inside the destination buffer.
+  const char *ptr;
+  // Number of output bytes attributed to this chunk (used when labeling).
+  size_t size;
+
+  // How the chunk's label is derived; set from the conversion specifier.
+  enum {
+    // Literal text or '%%': no argument, so the bytes are labeled 0.
+    NONE = 0,
+    // '%n': the argument's label is consumed but not propagated.
+    IGNORED,
+    // '%s': labels are copied from the shadow of the source string.
+    STRING,
+    // Any other conversion: the argument's own label covers the chunk.
+    NUMERIC,
+  } label_type;
+
+  // The string argument; only meaningful when label_type == STRING.
+  const char *arg;
+};
+
+// Formats one chunk into 'str' starting at offset 'off'. The chunk's format
+// is the first 'format_size' bytes of 'format' (copied so that it can be
+// NUL-terminated); its argument, if any, is the variadic tail. If 'has_size'
+// is true, at most 'size' - 'off' bytes are written (none if 'off' >= 'size').
+// Returns the v{s,sn}printf result, or -1 if the copy cannot be allocated.
+static int format_chunk(char *str, size_t off, bool has_size, size_t size,
+                        const char *format, size_t format_size, ...) {
+  char *chunk_format = (char *) malloc(format_size + 1);
+  if (!chunk_format) return -1;  // Out of memory: report a formatting error.
+  internal_memcpy(chunk_format, format, format_size);
+  chunk_format[format_size] = '\0';
+
+  va_list ap;
+  va_start(ap, format_size);
+  int r = 0;
+  if (has_size) {
+    r = vsnprintf(str + off, off < size ? size - off : 0, chunk_format, ap);
+  } else {
+    r = vsprintf(str + off, chunk_format, ap);
+  }
+  va_end(ap);
+
+  free(chunk_format);
+  return r;
+}
+
+// Formats the input and propagates the input labels to the output. The output
+// is stored in 'str'. If 'has_size' is true, 'size' bounds the number of
+// output bytes. 'format' and 'ap' are the format string and the list of
+// arguments: the values to format, then the labels, then the pointer to the
+// return label. Returns the return value vsnprintf would return.
+//
+// The format string is split into chunks, each either a run of literal text
+// or a single directive (e.g., '%.3f'), and every chunk is formatted on its
+// own so that each output byte can be labeled after the argument it depends
+// on. '*' widths/precisions are not supported: their int is never consumed.
+static int format_buffer(char *str, bool has_size, size_t size,
+                         const char *format, va_list ap) {
+  InternalMmapVector<Chunk> chunks(8);
+  size_t off = 0;
+
+  while (*format) {
+    chunks.push_back(Chunk());
+    Chunk& chunk = chunks.back();
+    chunk.ptr = str + off;
+    chunk.arg = nullptr;
+
+    int status = 0;
+
+    if (*format != '%') {
+      // Ordinary character. Consume all the characters until a '%' or the end
+      // of the string.
+      size_t format_size = 0;
+      for (; *format && *format != '%'; ++format, ++format_size) {}
+      status = format_chunk(str, off, has_size, size, format - format_size,
+                            format_size);
+      chunk.label_type = Chunk::NONE;
+    } else {
+      // Conversion directive. Consume all the characters until a conversion
+      // specifier or the end of the string.
+      bool end_format = false;
+#define FORMAT_CHUNK(t)                                                  \
+      format_chunk(str, off, has_size, size, format - format_size,  \
+                   format_size + 1, va_arg(ap, t))
+
+      for (size_t format_size = 1; *++format && !end_format; ++format_size) {
+        switch (*format) {
+          case 'd':
+          case 'i':
+          case 'o':
+          case 'u':
+          case 'x':
+          case 'X':
+            switch (*(format - 1)) {
+              case 'h':
+                // Also covers 'hh': the argument is still passed as an int.
+                status = FORMAT_CHUNK(int);
+                break;
+              case 'l':
+                if (format_size >= 2 && *(format - 2) == 'l') {
+                  status = FORMAT_CHUNK(long long int);
+                } else {
+                  status = FORMAT_CHUNK(long int);
+                }
+                break;
+              case 'q':
+                status = FORMAT_CHUNK(long long int);
+                break;
+              case 'j':
+                status = FORMAT_CHUNK(intmax_t);
+                break;
+              case 'z':
+                status = FORMAT_CHUNK(size_t);
+                break;
+              case 't':
+                status = FORMAT_CHUNK(size_t);
+                break;
+              default:
+                status = FORMAT_CHUNK(int);
+            }
+            chunk.label_type = Chunk::NUMERIC;
+            end_format = true;
+            break;
+
+          case 'a':
+          case 'A':
+          case 'e':
+          case 'E':
+          case 'f':
+          case 'F':
+          case 'g':
+          case 'G':
+            if (*(format - 1) == 'L') {
+              status = FORMAT_CHUNK(long double);
+            } else {
+              status = FORMAT_CHUNK(double);
+            }
+            chunk.label_type = Chunk::NUMERIC;
+            end_format = true;
+            break;
+
+          case 'c':
+            status = FORMAT_CHUNK(int);
+            chunk.label_type = Chunk::NUMERIC;
+            end_format = true;
+            break;
+
+          case 's':
+            chunk.arg = va_arg(ap, char *);
+            status = format_chunk(str, off, has_size, size,
+                                  format - format_size, format_size + 1,
+                                  chunk.arg);
+            chunk.label_type = Chunk::STRING;
+            end_format = true;
+            break;
+
+          case 'p':
+            status = FORMAT_CHUNK(void *);
+            chunk.label_type = Chunk::NUMERIC;
+            end_format = true;
+            break;
+
+          case 'n':
+            *(va_arg(ap, int *)) = (int)off;
+            chunk.label_type = Chunk::IGNORED;
+            end_format = true;
+            break;
+
+          case '%':
+            status = format_chunk(str, off, has_size, size,
+                                  format - format_size, format_size + 1);
+            chunk.label_type = Chunk::NONE;
+            end_format = true;
+            break;
+
+          default:
+            break;
+        }
+      }
+#undef FORMAT_CHUNK
+    }
+
+    if (status < 0) {
+      return status;
+    }
+
+    // A {v,}snprintf return value >= the space left means truncation: only
+    // 'size - off - 1' bytes (plus the NUL) of this chunk were really stored.
+    if (has_size) {
+      if (off < size) {
+        size_t ustatus = (size_t) status;
+        size_t avail = size - off;
+        chunk.size = ustatus >= avail ? avail - 1 : ustatus;
+      } else {
+        chunk.size = 0;
+      }
+    } else {
+      chunk.size = status;
+    }
+    off += status;
+  }
+
+  // Consume the labels of the output buffer, (optional) size, and format
+  // string. TODO(martignlo): Decide how to combine labels (e.g., whether to
+  // ignore or not the label of the format string).
+  va_arg(ap, dfsan_label_va);
+  if (has_size) {
+    va_arg(ap, dfsan_label_va);
+  }
+  va_arg(ap, dfsan_label_va);
+
+  // Label every chunk from the label arguments. All chunks and arguments are
+  // visited even if the string was only partially printed ({v,}snprintf).
+  for (size_t i = 0; i < chunks.size(); ++i) {
+    const Chunk& chunk = chunks[i];
+
+    switch (chunk.label_type) {
+      case Chunk::NONE:
+        dfsan_set_label(0, (void*) chunk.ptr, chunk.size);
+        break;
+      case Chunk::IGNORED:
+        va_arg(ap, dfsan_label_va);
+        dfsan_set_label(0, (void*) chunk.ptr, chunk.size);
+        break;
+      case Chunk::NUMERIC: {
+        dfsan_label label = va_arg(ap, dfsan_label_va);
+        dfsan_set_label(label, (void*) chunk.ptr, chunk.size);
+        break;
+      }
+      case Chunk::STRING: {
+        // Consume the label of the pointer to the string
+        va_arg(ap, dfsan_label_va);
+        // Copy the per-byte labels of the source, bounded by the bytes really
+        // stored for this chunk (precision/truncation shorten the output; a
+        // NULL argument has no shadow to copy). Left padding is not skipped.
+        size_t len = chunk.arg ? strnlen(chunk.arg, chunk.size) : 0;
+        dfsan_set_label(0, (void*) chunk.ptr, chunk.size);
+        internal_memcpy(shadow_for((void *) chunk.ptr),
+                        shadow_for((void *) chunk.arg),
+                        sizeof(dfsan_label) * len);
+        break;
+      }
+    }
+  }
+
+  dfsan_label *ret_label_ptr = va_arg(ap, dfsan_label *);
+  *ret_label_ptr = 0;
+
+  // Number of bytes written in total.
+  return off;
+}
+
+SANITIZER_INTERFACE_ATTRIBUTE
+int __dfsw_sprintf(char *str, const char *format, ...) {
+  va_list args;  // Values to format, then the labels read by format_buffer.
+  va_start(args, format);
+  int written = format_buffer(str, /*has_size=*/false, 0, format, args);
+  va_end(args);
+  return written;
+}
+
+SANITIZER_INTERFACE_ATTRIBUTE
+int __dfsw_snprintf(char *str, size_t size, const char *format, ...) {
+  va_list args;  // Values to format, then the labels read by format_buffer.
+  va_start(args, format);
+  int written = format_buffer(str, /*has_size=*/true, size, format, args);
+  va_end(args);
+  return written;
+}
 }
diff --git a/compiler-rt/lib/dfsan/done_abilist.txt b/compiler-rt/lib/dfsan/done_abilist.txt
index 44507bf..8d966b5 100644
--- a/compiler-rt/lib/dfsan/done_abilist.txt
+++ b/compiler-rt/lib/dfsan/done_abilist.txt
@@ -208,9 +208,11 @@
 fun:sigaction=custom
 fun:gettimeofday=custom
 
+# sprintf-like
+fun:sprintf=custom
+fun:snprintf=custom
+
 # TODO: custom
-fun:snprintf=discard
-fun:vsnprintf=discard
 fun:asprintf=discard
 fun:qsort=discard
 
diff --git a/compiler-rt/lib/dfsan/scripts/check_custom_wrappers.sh b/compiler-rt/lib/dfsan/scripts/check_custom_wrappers.sh
index 87e8a09..7acf005 100755
--- a/compiler-rt/lib/dfsan/scripts/check_custom_wrappers.sh
+++ b/compiler-rt/lib/dfsan/scripts/check_custom_wrappers.sh
@@ -1,7 +1,7 @@
 #!/usr/bin/env bash
 
 DFSAN_DIR=$(dirname "$0")/../
-DFSAN_CUSTOM_TESTS=${DFSAN_DIR}/../../test/dfsan/custom.c
+DFSAN_CUSTOM_TESTS=${DFSAN_DIR}/../../test/dfsan/custom.cc
 DFSAN_CUSTOM_WRAPPERS=${DFSAN_DIR}/dfsan_custom.cc
 DFSAN_ABI_LIST=${DFSAN_DIR}/done_abilist.txt
 
diff --git a/compiler-rt/lib/sanitizer_common/scripts/check_lint.sh b/compiler-rt/lib/sanitizer_common/scripts/check_lint.sh
index 4612fad..267273d 100755
--- a/compiler-rt/lib/sanitizer_common/scripts/check_lint.sh
+++ b/compiler-rt/lib/sanitizer_common/scripts/check_lint.sh
@@ -29,7 +29,7 @@
 MSAN_RTL_LINT_FILTER=${COMMON_LINT_FILTER}
 LSAN_RTL_LINT_FILTER=${COMMON_LINT_FILTER}
 LSAN_LIT_TEST_LINT_FILTER=${LSAN_RTL_LINT_FILTER},-whitespace/line_length
-DFSAN_RTL_LINT_FILTER=${COMMON_LINT_FILTER},-runtime/int,-runtime/printf,-runtime/references
+DFSAN_RTL_LINT_FILTER=${COMMON_LINT_FILTER},-runtime/int,-runtime/printf,-runtime/references,-readability/function
 COMMON_RTL_INC_LINT_FILTER=${COMMON_LINT_FILTER},-runtime/int,-runtime/sizeof,-runtime/printf,-readability/fn_size
 SANITIZER_INCLUDES_LINT_FILTER=${COMMON_LINT_FILTER},-runtime/int
 MKTEMP="mktemp -q /tmp/tmp.XXXXXXXXXX"
diff --git a/compiler-rt/test/dfsan/custom.c b/compiler-rt/test/dfsan/custom.cc
similarity index 80%
rename from compiler-rt/test/dfsan/custom.c
rename to compiler-rt/test/dfsan/custom.cc
index 8a7a548..8a49c32 100644
--- a/compiler-rt/test/dfsan/custom.c
+++ b/compiler-rt/test/dfsan/custom.cc
@@ -5,7 +5,6 @@
 
 // Tests custom implementations of various glibc functions.
 
-#define _GNU_SOURCE
 #include <sanitizer/dfsan_interface.h>
 
 #include <arpa/inet.h>
@@ -18,6 +17,7 @@
 #include <sched.h>
 #include <signal.h>
 #include <stdio.h>
+#include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/select.h>
@@ -256,12 +256,12 @@
   // With any luck this sequence of calls will cause calloc to return the same
   // pointer both times.  This is probably the best we can do to test this
   // function.
-  char *crv = calloc(4096, 1);
+  char *crv = (char *) calloc(4096, 1);
   ASSERT_ZERO_LABEL(crv[0]);
   dfsan_set_label(i_label, crv, 100);
   free(crv);
 
-  crv = calloc(4096, 1);
+  crv = (char *) calloc(4096, 1);
   ASSERT_ZERO_LABEL(crv[0]);
   free(crv);
 }
@@ -342,14 +342,14 @@
 
 static int write_callback_count = 0;
 static int last_fd;
-static const void *last_buf;
+static const unsigned char *last_buf;
 static size_t last_count;
 
 void write_callback(int fd, const void *buf, size_t count) {
   write_callback_count++;
 
   last_fd = fd;
-  last_buf = buf;
+  last_buf = (const unsigned char*) buf;
   last_count = count;
 }
 
@@ -376,7 +376,7 @@
   dfsan_set_label(i_label, &fd, sizeof(fd));
   dfsan_set_label(j_label, &(buf[3]), 1);
   dfsan_set_label(k_label, &buf_len, sizeof(buf_len));
-  
+
   res = write(fd, buf, buf_len);
   assert(write_callback_count == 2);
   ASSERT_READ_ZERO_LABEL(&res, sizeof(res));
@@ -694,11 +694,11 @@
   dfsan_set_label(i_label, &str1[3], 1);
   dfsan_set_label(j_label, &str1[4], 1);
 
-  char *crv = memchr(str1, 'r', sizeof(str1));
+  char *crv = (char *) memchr(str1, 'r', sizeof(str1));
   assert(crv == &str1[2]);
   ASSERT_ZERO_LABEL(crv);
 
-  crv = memchr(str1, '1', sizeof(str1));
+  crv = (char *) memchr(str1, '1', sizeof(str1));
   assert(crv == &str1[3]);
 #ifdef STRICT_DATA_DEPENDENCIES
   ASSERT_ZERO_LABEL(crv);
@@ -706,7 +706,7 @@
   ASSERT_LABEL(crv, i_label);
 #endif
 
-  crv = memchr(str1, 'x', sizeof(str1));
+  crv = (char *) memchr(str1, 'x', sizeof(str1));
   assert(!crv);
 #ifdef STRICT_DATA_DEPENDENCIES
   ASSERT_ZERO_LABEL(crv);
@@ -774,6 +774,124 @@
   close(fd);
 }
 
+template <class T>
+void test_sprintf_chunk(const char* expected, const char* format, T arg) {
+  char buf[512];
+  memset(buf, 'a', sizeof(buf));
+  // Wrap 'expected' and 'format' in "foo "/" bar" to expose label boundaries.
+  char padded_expected[512];
+  strcpy(padded_expected, "foo ");
+  strcat(padded_expected, expected);
+  strcat(padded_expected, " bar");
+
+  char padded_format[512];
+  strcpy(padded_format, "foo ");
+  strcat(padded_format, format);
+  strcat(padded_format, " bar");
+
+  // Unlabeled argument: every output byte must carry label 0.
+  assert(sprintf(buf, padded_format,  arg) == strlen(padded_expected));
+  assert(strcmp(buf, padded_expected) == 0);
+  ASSERT_READ_LABEL(buf, strlen(padded_expected), 0);
+  memset(buf, 'a', sizeof(buf));
+
+  // Labeled argument: only the bytes between "foo " and " bar" get i_label.
+  dfsan_set_label(i_label, &arg, sizeof(arg));
+  assert(sprintf(buf, padded_format,  arg) == strlen(padded_expected));
+  assert(strcmp(buf, padded_expected) == 0);
+  ASSERT_READ_LABEL(buf, 4, 0);
+  ASSERT_READ_LABEL(buf + 4, strlen(padded_expected) - 8, i_label);
+  ASSERT_READ_LABEL(buf + (strlen(padded_expected) - 4), 4, 0);
+}
+
+void test_sprintf() {
+  char buf[2048];
+  memset(buf, 'a', sizeof(buf));
+
+  // Literal-only format string: label 0 is expected over the whole buffer.
+  assert(sprintf(buf, "Hello world!") == 12);
+  assert(strcmp(buf, "Hello world!") == 0);
+  ASSERT_READ_LABEL(buf, sizeof(buf), 0);
+
+  // Several directives in one call (%s, %d, %n, %f and %%): each labeled
+  // argument must taint exactly the output bytes it produced.
+  const char* s = "world";
+  int m = 8;
+  int d = 27;
+  dfsan_set_label(k_label, (void *) (s + 1), 2);
+  dfsan_set_label(i_label, &m, sizeof(m));
+  dfsan_set_label(j_label, &d, sizeof(d));
+  int n;
+  int r = sprintf(buf, "hello %s, %-d/%d/%d %f %% %n%d", s, 2014, m, d,
+                  12345.6781234, &n, 1000);
+  assert(r == 42);
+  assert(strcmp(buf, "hello world, 2014/8/27 12345.678123 % 1000") == 0);
+  ASSERT_READ_LABEL(buf, 7, 0);
+  ASSERT_READ_LABEL(buf + 7, 2, k_label);
+  ASSERT_READ_LABEL(buf + 9, 9, 0);
+  ASSERT_READ_LABEL(buf + 18, 1, i_label);
+  ASSERT_READ_LABEL(buf + 19, 1, 0);
+  ASSERT_READ_LABEL(buf + 20, 2, j_label);
+  ASSERT_READ_LABEL(buf + 22, 15, 0);
+  ASSERT_LABEL(r, 0);
+  assert(n == 38);
+
+  // Single directives with length/precision modifiers, one per call. The
+  // expected strings for %lx, %p, %ju, %zu and %tu assume 64-bit types.
+  test_sprintf_chunk("-559038737", "%d", 0xdeadbeef);
+  test_sprintf_chunk("3735928559", "%u", 0xdeadbeef);
+  test_sprintf_chunk("12345", "%i", 12345);
+  test_sprintf_chunk("751", "%o", 0751);
+  test_sprintf_chunk("babe", "%x", 0xbabe);
+  test_sprintf_chunk("0000BABE", "%.8X", 0xbabe);
+  test_sprintf_chunk("-17", "%hhd", 0xdeadbeef);
+  test_sprintf_chunk("-16657", "%hd", 0xdeadbeef);
+  test_sprintf_chunk("deadbeefdeadbeef", "%lx", 0xdeadbeefdeadbeef);
+  test_sprintf_chunk("0xdeadbeefdeadbeef", "%p",
+                 (void *)  0xdeadbeefdeadbeef);
+  test_sprintf_chunk("18446744073709551615", "%ju", (intmax_t) -1);
+  test_sprintf_chunk("18446744073709551615", "%zu", (size_t) -1);
+  test_sprintf_chunk("18446744073709551615", "%tu", (size_t) -1);
+
+  test_sprintf_chunk("0x1.f9acffa7eb6bfp-4", "%a", 0.123456);
+  test_sprintf_chunk("0X1.F9ACFFA7EB6BFP-4", "%A", 0.123456);
+  test_sprintf_chunk("0.12346", "%.5f", 0.123456);
+  test_sprintf_chunk("0.123456", "%g", 0.123456);
+  test_sprintf_chunk("1.234560e-01", "%e", 0.123456);
+  test_sprintf_chunk("1.234560E-01", "%E", 0.123456);
+  test_sprintf_chunk("0.1234567891234560", "%.16Lf",
+                     (long double) 0.123456789123456);
+
+  test_sprintf_chunk("z", "%c", 'z');
+
+  // %n, %s, %d, %f, and %% are covered by the multi-directive call above.
+}
+
+void test_snprintf() {
+  char buf[2048];
+  memset(buf, 'a', sizeof(buf));
+  dfsan_set_label(0, buf, sizeof(buf));
+  const char* s = "world";
+  int y = 2014;
+  int m = 8;
+  int d = 27;
+  dfsan_set_label(k_label, (void *) (s + 1), 2);
+  dfsan_set_label(i_label, &y, sizeof(y));
+  dfsan_set_label(j_label, &m, sizeof(m));
+  int r = snprintf(buf, 19, "hello %s, %-d/%d/%d %f", s, y, m, d,
+                   12345.6781234);
+  // Only 18 characters plus the NUL fit in the 19 bytes allowed, yet the
+  // return value is the length the complete string would have had.
+  assert(r == 35);
+  assert(memcmp(buf, "hello world, 2014/", 19) == 0);
+  ASSERT_READ_LABEL(buf, 7, 0);
+  ASSERT_READ_LABEL(buf + 7, 2, k_label);
+  ASSERT_READ_LABEL(buf + 9, 4, 0);
+  ASSERT_READ_LABEL(buf + 13, 4, i_label);
+  ASSERT_READ_LABEL(buf + 17, 2, 0);
+  ASSERT_LABEL(r, 0);
+}
+
 int main(void) {
   i_label = dfsan_create_label("i", 0);
   j_label = dfsan_create_label("j", 0);
@@ -810,7 +928,9 @@
   test_select();
   test_sigaction();
   test_sigemptyset();
+  test_snprintf();
   test_socketpair();
+  test_sprintf();
   test_stat();
   test_strcasecmp();
   test_strchr();
diff --git a/compiler-rt/test/sanitizer_common/lit.common.cfg b/compiler-rt/test/sanitizer_common/lit.common.cfg
index 6a3ce2a..3b44e00 100644
--- a/compiler-rt/test/sanitizer_common/lit.common.cfg
+++ b/compiler-rt/test/sanitizer_common/lit.common.cfg
@@ -31,4 +31,3 @@
 
 if config.host_os not in ['Linux', 'Darwin']:
   config.unsupported = True
-