Merge branch 'upstream' into merge_2012_03_18

Change-Id: I45a9f74b42eae6a1fe5320a46a3a562eee0a8357
diff --git a/lib/asan/CMakeLists.txt b/lib/asan/CMakeLists.txt
index e451e23..3e3505e 100644
--- a/lib/asan/CMakeLists.txt
+++ b/lib/asan/CMakeLists.txt
@@ -30,7 +30,9 @@
 
 include_directories(..)
 
-set(ASAN_CFLAGS ${SANITIZER_COMMON_CFLAGS})
+set(ASAN_CFLAGS
+  ${SANITIZER_COMMON_CFLAGS}
+  -fno-rtti)
 
 set(ASAN_COMMON_DEFINITIONS
   ASAN_HAS_EXCEPTIONS=1)
diff --git a/lib/asan/asan_allocator.h b/lib/asan/asan_allocator.h
index 9ba2542..38477c0 100644
--- a/lib/asan/asan_allocator.h
+++ b/lib/asan/asan_allocator.h
@@ -24,11 +24,7 @@
 // will co-exist in the source base for a while. The actual allocator is chosen
 // at build time by redefining this macro.
 #ifndef ASAN_ALLOCATOR_VERSION
-# if (ASAN_LINUX && !ASAN_ANDROID) || ASAN_MAC || ASAN_WINDOWS
-#  define ASAN_ALLOCATOR_VERSION 2
-# else
-#  define ASAN_ALLOCATOR_VERSION 1
-# endif
+#define ASAN_ALLOCATOR_VERSION 2
 #endif  // ASAN_ALLOCATOR_VERSION
 
 namespace __asan {
diff --git a/lib/asan/asan_allocator2.cc b/lib/asan/asan_allocator2.cc
index 546cf24..3288f28 100644
--- a/lib/asan/asan_allocator2.cc
+++ b/lib/asan/asan_allocator2.cc
@@ -61,7 +61,7 @@
 #else
 const uptr kAllocatorSpace = 0x600000000000ULL;
 #endif
-const uptr kAllocatorSize  =  0x10000000000ULL;  // 1T.
+const uptr kAllocatorSize  =  0x40000000000ULL;  // 4T.
 typedef DefaultSizeClassMap SizeClassMap;
 typedef SizeClassAllocator64<kAllocatorSpace, kAllocatorSize, 0 /*metadata*/,
     SizeClassMap, AsanMapUnmapCallback> PrimaryAllocator;
@@ -604,7 +604,9 @@
 void *asan_calloc(uptr nmemb, uptr size, StackTrace *stack) {
   if (CallocShouldReturnNullDueToOverflow(size, nmemb)) return 0;
   void *ptr = Allocate(nmemb * size, 8, stack, FROM_MALLOC);
-  if (ptr)
+  // If the memory comes from the secondary allocator, there is no need to
+  // clear it, as it comes directly from mmap.
+  if (ptr && allocator.FromPrimary(ptr))
     REAL(memset)(ptr, 0, nmemb * size);
   return ptr;
 }
diff --git a/lib/asan/asan_flags.h b/lib/asan/asan_flags.h
index 47c14aa..377354a 100644
--- a/lib/asan/asan_flags.h
+++ b/lib/asan/asan_flags.h
@@ -110,6 +110,9 @@
   bool alloc_dealloc_mismatch;
   // Use stack depot instead of storing stacks in the redzones.
   bool use_stack_depot;
+  // If true, assume that memcmp(p1, p2, n) always reads n bytes before
+  // comparing p1 and p2.
+  bool strict_memcmp;
 };
 
 Flags *flags();
diff --git a/lib/asan/asan_intercepted_functions.h b/lib/asan/asan_intercepted_functions.h
index d529560..1f872c9 100644
--- a/lib/asan/asan_intercepted_functions.h
+++ b/lib/asan/asan_intercepted_functions.h
@@ -79,9 +79,9 @@
 # define ASAN_INTERCEPT___CXA_THROW 0
 #endif
 
-// Windows threads.
 # if defined(_WIN32)
 extern "C" {
+// Windows threads.
 __declspec(dllimport)
 void* __stdcall CreateThread(void *sec, uptr st, void* start,
                              void *arg, DWORD fl, DWORD *id);
@@ -103,6 +103,7 @@
 long atol(const char *nptr);  // NOLINT
 long strtol(const char *nptr, char **endptr, int base);  // NOLINT
 void longjmp(void *env, int value);
+double frexp(double x, int *expptr);
 }
 # endif
 
diff --git a/lib/asan/asan_interceptors.cc b/lib/asan/asan_interceptors.cc
index e10a3b2..b1efe74 100644
--- a/lib/asan/asan_interceptors.cc
+++ b/lib/asan/asan_interceptors.cc
@@ -260,10 +260,26 @@
   if (!asan_inited) return internal_memcmp(a1, a2, size);
   ENSURE_ASAN_INITED();
   if (flags()->replace_intrin) {
-    // We check the entire regions even if the first bytes of the buffers
-    // are different.
-    ASAN_READ_RANGE(a1, size);
-    ASAN_READ_RANGE(a2, size);
+    if (flags()->strict_memcmp) {
+      // Check the entire regions even if the first bytes of the buffers are
+      // different.
+      ASAN_READ_RANGE(a1, size);
+      ASAN_READ_RANGE(a2, size);
+      // Fallthrough to REAL(memcmp) below.
+    } else {
+      unsigned char c1 = 0, c2 = 0;
+      const unsigned char *s1 = (const unsigned char*)a1;
+      const unsigned char *s2 = (const unsigned char*)a2;
+      uptr i;
+      for (i = 0; i < size; i++) {
+        c1 = s1[i];
+        c2 = s2[i];
+        if (c1 != c2) break;
+      }
+      ASAN_READ_RANGE(s1, Min(i + 1, size));
+      ASAN_READ_RANGE(s2, Min(i + 1, size));
+      return CharCmp(c1, c2);
+    }
   }
   return REAL(memcmp(a1, a2, size));
 }
diff --git a/lib/asan/asan_interface_internal.h b/lib/asan/asan_interface_internal.h
index 48220e7..8288d0c 100644
--- a/lib/asan/asan_interface_internal.h
+++ b/lib/asan/asan_interface_internal.h
@@ -25,8 +25,8 @@
   // Everytime the asan ABI changes we also change the version number in this
   // name. Objects build with incompatible asan ABI version
   // will not link with run-time.
-  void __asan_init_v1() SANITIZER_INTERFACE_ATTRIBUTE;
-  #define __asan_init __asan_init_v1
+  void __asan_init_v2() SANITIZER_INTERFACE_ATTRIBUTE;
+  #define __asan_init __asan_init_v2
 
   // This structure describes an instrumented global variable.
   struct __asan_global {
@@ -34,6 +34,7 @@
     uptr size;               // The original size of the global.
     uptr size_with_redzone;  // The size with the redzone.
     const char *name;        // Name as a C string.
+    const char *module_name; // Module name as a C string.
     uptr has_dynamic_init;   // Non-zero if the global has dynamic initializer.
   };
 
diff --git a/lib/asan/asan_mapping.h b/lib/asan/asan_mapping.h
index feaf353..161ab65 100644
--- a/lib/asan/asan_mapping.h
+++ b/lib/asan/asan_mapping.h
@@ -32,13 +32,13 @@
 // || `[0x000000040000, 0x01ffffffffff]` || ShadowGap  ||
 //
 // Special case when something is already mapped between
-// 0x003000000000 and 0x004000000000 (e.g. when prelink is installed):
+// 0x003000000000 and 0x005000000000 (e.g. when prelink is installed):
 // || `[0x10007fff8000, 0x7fffffffffff]` || HighMem    ||
 // || `[0x02008fff7000, 0x10007fff7fff]` || HighShadow ||
-// || `[0x004000000000, 0x02008fff6fff]` || ShadowGap3 ||
-// || `[0x003000000000, 0x003fffffffff]` || MidMem     ||
-// || `[0x00087fff8000, 0x002fffffffff]` || ShadowGap2 ||
-// || `[0x00067fff8000, 0x00087fff7fff]` || MidShadow  ||
+// || `[0x005000000000, 0x02008fff6fff]` || ShadowGap3 ||
+// || `[0x003000000000, 0x004fffffffff]` || MidMem     ||
+// || `[0x000a7fff8000, 0x002fffffffff]` || ShadowGap2 ||
+// || `[0x00067fff8000, 0x000a7fff7fff]` || MidShadow  ||
 // || `[0x00008fff7000, 0x00067fff7fff]` || ShadowGap  ||
 // || `[0x00007fff8000, 0x00008fff6fff]` || LowShadow  ||
 // || `[0x000000000000, 0x00007fff7fff]` || LowMem     ||
@@ -131,7 +131,7 @@
 // difference between fixed and non-fixed mapping is below the noise level.
 static uptr kHighMemEnd = 0x7fffffffffffULL;
 static uptr kMidMemBeg =    0x3000000000ULL;
-static uptr kMidMemEnd =    0x3fffffffffULL;
+static uptr kMidMemEnd =    0x4fffffffffULL;
 #else
 SANITIZER_INTERFACE_ATTRIBUTE
 extern uptr kHighMemEnd, kMidMemBeg, kMidMemEnd;  // Initialized in __asan_init.
diff --git a/lib/asan/asan_report.cc b/lib/asan/asan_report.cc
index 8fa42f7..6359b26 100644
--- a/lib/asan/asan_report.cc
+++ b/lib/asan/asan_report.cc
@@ -208,8 +208,8 @@
     // Can it happen?
     Printf("%p is located %zd bytes inside", (void*)addr, addr - g.beg);
   }
-  Printf(" of global variable '%s' (0x%zx) of size %zu\n",
-             g.name, g.beg, g.size);
+  Printf(" of global variable '%s' from '%s' (0x%zx) of size %zu\n",
+             g.name, g.module_name, g.beg, g.size);
   Printf("%s", d.EndLocation());
   PrintGlobalNameIfASCII(g);
   return true;
diff --git a/lib/asan/asan_rtl.cc b/lib/asan/asan_rtl.cc
index 7985903..2902339 100644
--- a/lib/asan/asan_rtl.cc
+++ b/lib/asan/asan_rtl.cc
@@ -127,18 +127,21 @@
   ParseFlag(str, &f->poison_heap, "poison_heap");
   ParseFlag(str, &f->alloc_dealloc_mismatch, "alloc_dealloc_mismatch");
   ParseFlag(str, &f->use_stack_depot, "use_stack_depot");
+  ParseFlag(str, &f->strict_memcmp, "strict_memcmp");
 }
 
+static const char *asan_external_symbolizer;
+
 void InitializeFlags(Flags *f, const char *env) {
   internal_memset(f, 0, sizeof(*f));
 
   f->quarantine_size = (ASAN_LOW_MEMORY) ? 1UL << 26 : 1UL << 28;
-  f->symbolize = false;
+  f->symbolize = (asan_external_symbolizer != 0);
   f->verbosity = 0;
   f->redzone = ASAN_ALLOCATOR_VERSION == 2 ? 16 : (ASAN_LOW_MEMORY) ? 64 : 128;
   f->debug = false;
   f->report_globals = 1;
-  f->check_initialization_order = true;
+  f->check_initialization_order = false;
   f->malloc_context_size = kDeafultMallocContextSize;
   f->replace_str = true;
   f->replace_intrin = true;
@@ -168,6 +171,7 @@
   // TODO(glider): Fix known issues and enable this back.
   f->alloc_dealloc_mismatch = (ASAN_MAC == 0);;
   f->use_stack_depot = true;  // Only affects allocator2.
+  f->strict_memcmp = true;
 
   // Override from compile definition.
   ParseFlagsFromString(f, MaybeUseAsanDefaultOptionsCompileDefiniton());
@@ -424,6 +428,8 @@
   SetCheckFailedCallback(AsanCheckFailed);
   SetPrintfAndReportCallback(AppendToErrorMessageBuffer);
 
+  // Check if external symbolizer is defined before parsing the flags.
+  asan_external_symbolizer = GetEnv("ASAN_SYMBOLIZER_PATH");
   // Initialize flags. This must be done early, because most of the
   // initialization steps look at flags().
   const char *options = GetEnv("ASAN_OPTIONS");
@@ -459,7 +465,7 @@
 #if ASAN_LINUX && defined(__x86_64__) && !ASAN_FIXED_MAPPING
   if (!full_shadow_is_available) {
     kMidMemBeg = kLowMemEnd < 0x3000000000ULL ? 0x3000000000ULL : 0;
-    kMidMemEnd = kLowMemEnd < 0x3000000000ULL ? 0x3fffffffffULL : 0;
+    kMidMemEnd = kLowMemEnd < 0x3000000000ULL ? 0x4fffffffffULL : 0;
   }
 #endif
 
@@ -501,11 +507,9 @@
 
   InstallSignalHandlers();
   // Start symbolizer process if necessary.
-  if (flags()->symbolize) {
-    const char *external_symbolizer = GetEnv("ASAN_SYMBOLIZER_PATH");
-    if (external_symbolizer) {
-      InitializeExternalSymbolizer(external_symbolizer);
-    }
+  if (flags()->symbolize && asan_external_symbolizer &&
+      asan_external_symbolizer[0]) {
+    InitializeExternalSymbolizer(asan_external_symbolizer);
   }
 
   // On Linux AsanThread::ThreadStart() calls malloc() that's why asan_inited
diff --git a/lib/asan/lit_tests/Linux/asan_prelink_test.cc b/lib/asan/lit_tests/Linux/asan_prelink_test.cc
index 114ffa5..522c191 100644
--- a/lib/asan/lit_tests/Linux/asan_prelink_test.cc
+++ b/lib/asan/lit_tests/Linux/asan_prelink_test.cc
@@ -22,5 +22,5 @@
   *getG() = 0;
 }
 #endif
-// CHECK: 0x003000000000, 0x003fffffffff{{.*}} MidMem
+// CHECK: 0x003000000000, 0x004fffffffff{{.*}} MidMem
 // CHECK: SO mapped at 3600000000
diff --git a/lib/asan/lit_tests/Linux/initialization-bug-any-order.cc b/lib/asan/lit_tests/Linux/initialization-bug-any-order.cc
index 645fe1c..f054a81 100644
--- a/lib/asan/lit_tests/Linux/initialization-bug-any-order.cc
+++ b/lib/asan/lit_tests/Linux/initialization-bug-any-order.cc
@@ -2,11 +2,11 @@
 // Check that on Linux initialization order bugs are caught
 // independently on order in which we list source files.
 
-// RUN: %clangxx_asan -m64 -O0 %s %p/../Helpers/initialization-bug-extra.cc\
-// RUN:   -fsanitize=init-order -o %t && %t 2>&1 \
+// RUN: %clangxx_asan -m64 -O0 %s %p/../Helpers/initialization-bug-extra.cc -o %t
+// RUN: ASAN_OPTIONS=check_initialization_order=true %t 2>&1 \
 // RUN:    | %symbolize | FileCheck %s
-// RUN: %clangxx_asan -m64 -O0 %p/../Helpers/initialization-bug-extra.cc %s\
-// RUN:   -fsanitize=init-order -o %t && %t 2>&1 \
+// RUN: %clangxx_asan -m64 -O0 %p/../Helpers/initialization-bug-extra.cc %s -o %t
+// RUN: ASAN_OPTIONS=check_initialization_order=true %t 2>&1 \
 // RUN:    | %symbolize | FileCheck %s
 
 // Do not test with optimization -- the error may be optimized away.
diff --git a/lib/asan/lit_tests/initialization-blacklist.cc b/lib/asan/lit_tests/initialization-blacklist.cc
index f8df24c..d0f86a7 100644
--- a/lib/asan/lit_tests/initialization-blacklist.cc
+++ b/lib/asan/lit_tests/initialization-blacklist.cc
@@ -2,22 +2,28 @@
 
 // RUN: %clangxx_asan -m64 -O0 %s %p/Helpers/initialization-blacklist-extra.cc\
 // RUN:   -fsanitize-blacklist=%p/Helpers/initialization-blacklist.txt \
-// RUN:   -fsanitize=init-order -o %t && %t 2>&1
+// RUN:   -fsanitize=init-order -o %t
+// RUN: ASAN_OPTIONS=check_initialization_order=true %t 2>&1
 // RUN: %clangxx_asan -m64 -O1 %s %p/Helpers/initialization-blacklist-extra.cc\
 // RUN:   -fsanitize-blacklist=%p/Helpers/initialization-blacklist.txt \
-// RUN:   -fsanitize=init-order -o %t && %t 2>&1
+// RUN:   -fsanitize=init-order -o %t
+// RUN: ASAN_OPTIONS=check_initialization_order=true %t 2>&1
 // RUN: %clangxx_asan -m64 -O2 %s %p/Helpers/initialization-blacklist-extra.cc\
 // RUN:   -fsanitize-blacklist=%p/Helpers/initialization-blacklist.txt \
-// RUN:   -fsanitize=init-order -o %t && %t 2>&1
+// RUN:   -fsanitize=init-order -o %t
+// RUN: ASAN_OPTIONS=check_initialization_order=true %t 2>&1
 // RUN: %clangxx_asan -m32 -O0 %s %p/Helpers/initialization-blacklist-extra.cc\
 // RUN:   -fsanitize-blacklist=%p/Helpers/initialization-blacklist.txt \
-// RUN:   -fsanitize=init-order -o %t && %t 2>&1
+// RUN:   -fsanitize=init-order -o %t
+// RUN: ASAN_OPTIONS=check_initialization_order=true %t 2>&1
 // RUN: %clangxx_asan -m32 -O1 %s %p/Helpers/initialization-blacklist-extra.cc\
 // RUN:   -fsanitize-blacklist=%p/Helpers/initialization-blacklist.txt \
-// RUN:   -fsanitize=init-order -o %t && %t 2>&1
+// RUN:   -fsanitize=init-order -o %t
+// RUN: ASAN_OPTIONS=check_initialization_order=true %t 2>&1
 // RUN: %clangxx_asan -m32 -O2 %s %p/Helpers/initialization-blacklist-extra.cc\
 // RUN:   -fsanitize-blacklist=%p/Helpers/initialization-blacklist.txt \
-// RUN:   -fsanitize=init-order -o %t && %t 2>&1
+// RUN:   -fsanitize=init-order -o %t
+// RUN: ASAN_OPTIONS=check_initialization_order=true %t 2>&1
 
 // Function is defined in another TU.
 int readBadGlobal();
diff --git a/lib/asan/lit_tests/initialization-bug.cc b/lib/asan/lit_tests/initialization-bug.cc
index 8f4e33e..624afb0 100644
--- a/lib/asan/lit_tests/initialization-bug.cc
+++ b/lib/asan/lit_tests/initialization-bug.cc
@@ -1,10 +1,10 @@
 // Test to make sure basic initialization order errors are caught.
 
-// RUN: %clangxx_asan -m64 -O0 %s %p/Helpers/initialization-bug-extra2.cc\
-// RUN:   -fsanitize=init-order -o %t && %t 2>&1 \
+// RUN: %clangxx_asan -m64 -O0 %s %p/Helpers/initialization-bug-extra2.cc -o %t
+// RUN: ASAN_OPTIONS=check_initialization_order=true %t 2>&1 \
 // RUN:    | %symbolize | FileCheck %s
-// RUN: %clangxx_asan -m32 -O0 %s %p/Helpers/initialization-bug-extra2.cc\
-// RUN:   -fsanitize=init-order -o %t && %t 2>&1 \
+// RUN: %clangxx_asan -m32 -O0 %s %p/Helpers/initialization-bug-extra2.cc -o %t
+// RUN: ASAN_OPTIONS=check_initialization_order=true %t 2>&1 \
 // RUN:     | %symbolize | FileCheck %s
 
 // Do not test with optimization -- the error may be optimized away.
diff --git a/lib/asan/lit_tests/initialization-nobug.cc b/lib/asan/lit_tests/initialization-nobug.cc
index 1b89616..93df993 100644
--- a/lib/asan/lit_tests/initialization-nobug.cc
+++ b/lib/asan/lit_tests/initialization-nobug.cc
@@ -2,23 +2,32 @@
 // order checking.  If successful, this will just return 0.
 
 // RUN: %clangxx_asan -m64 -O0 %s %p/Helpers/initialization-nobug-extra.cc\
-// RUN:   --std=c++11 -fsanitize=init-order -o %t && %t 2>&1
+// RUN:   --std=c++11 -fsanitize=init-order -o %t
+// RUN: ASAN_OPTIONS=check_initialization_order=true %t 2>&1
 // RUN: %clangxx_asan -m64 -O1 %s %p/Helpers/initialization-nobug-extra.cc\
-// RUN:   --std=c++11 -fsanitize=init-order -o %t && %t 2>&1
+// RUN:   --std=c++11 -fsanitize=init-order -o %t
+// RUN: ASAN_OPTIONS=check_initialization_order=true %t 2>&1
 // RUN: %clangxx_asan -m64 -O2 %s %p/Helpers/initialization-nobug-extra.cc\
-// RUN:   --std=c++11 -fsanitize=init-order -o %t && %t 2>&1
+// RUN:   --std=c++11 -fsanitize=init-order -o %t
+// RUN: ASAN_OPTIONS=check_initialization_order=true %t 2>&1
 // RUN: %clangxx_asan -m64 -O3 %s %p/Helpers/initialization-nobug-extra.cc\
-// RUN:   --std=c++11 -fsanitize=init-order -o %t && %t 2>&1
+// RUN:   --std=c++11 -fsanitize=init-order -o %t
+// RUN: ASAN_OPTIONS=check_initialization_order=true %t 2>&1
 // RUN: %clangxx_asan -m32 -O0 %s %p/Helpers/initialization-nobug-extra.cc\
-// RUN:   --std=c++11 -fsanitize=init-order -o %t && %t 2>&1
+// RUN:   --std=c++11 -fsanitize=init-order -o %t
+// RUN: ASAN_OPTIONS=check_initialization_order=true %t 2>&1
 // RUN: %clangxx_asan -m32 -O0 %s %p/Helpers/initialization-nobug-extra.cc\
-// RUN:   --std=c++11 -fsanitize=init-order -o %t && %t 2>&1
+// RUN:   --std=c++11 -fsanitize=init-order -o %t
+// RUN: ASAN_OPTIONS=check_initialization_order=true %t 2>&1
 // RUN: %clangxx_asan -m32 -O1 %s %p/Helpers/initialization-nobug-extra.cc\
-// RUN:   --std=c++11 -fsanitize=init-order -o %t && %t 2>&1
+// RUN:   --std=c++11 -fsanitize=init-order -o %t
+// RUN: ASAN_OPTIONS=check_initialization_order=true %t 2>&1
 // RUN: %clangxx_asan -m32 -O2 %s %p/Helpers/initialization-nobug-extra.cc\
-// RUN:   --std=c++11 -fsanitize=init-order -o %t && %t 2>&1
+// RUN:   --std=c++11 -fsanitize=init-order -o %t
+// RUN: ASAN_OPTIONS=check_initialization_order=true %t 2>&1
 // RUN: %clangxx_asan -m32 -O3 %s %p/Helpers/initialization-nobug-extra.cc\
-// RUN:   --std=c++11 -fsanitize=init-order -o %t && %t 2>&1
+// RUN:   --std=c++11 -fsanitize=init-order -o %t
+// RUN: ASAN_OPTIONS=check_initialization_order=true %t 2>&1
 
 // Simple access:
 // Make sure that accessing a global in the same TU is safe
diff --git a/lib/asan/lit_tests/memcmp_strict_test.cc b/lib/asan/lit_tests/memcmp_strict_test.cc
new file mode 100644
index 0000000..00bf921
--- /dev/null
+++ b/lib/asan/lit_tests/memcmp_strict_test.cc
@@ -0,0 +1,16 @@
+// RUN: %clangxx_asan -m64 -O0 %s -o %t && ASAN_OPTIONS=strict_memcmp=0 %t 2>&1 | %symbolize | FileCheck %s --check-prefix=CHECK-nonstrict
+// RUN: %clangxx_asan -m64 -O0 %s -o %t && ASAN_OPTIONS=strict_memcmp=1 %t 2>&1 | %symbolize | FileCheck %s --check-prefix=CHECK-strict
+// Default to strict_memcmp=1.
+// RUN: %clangxx_asan -m64 -O0 %s -o %t && %t 2>&1 | %symbolize | FileCheck %s --check-prefix=CHECK-strict
+
+#include <stdio.h>
+#include <string.h>
+int main() {
+  char kFoo[] = "foo";
+  char kFubar[] = "fubar";
+  int res = memcmp(kFoo, kFubar, strlen(kFubar));
+  printf("res: %d\n", res);
+  // CHECK-nonstrict: {{res: -1}}
+  // CHECK-strict: AddressSanitizer: stack-buffer-overflow
+  return 0;
+}
diff --git a/lib/asan/tests/CMakeLists.txt b/lib/asan/tests/CMakeLists.txt
index ca18084..80d6f5d 100644
--- a/lib/asan/tests/CMakeLists.txt
+++ b/lib/asan/tests/CMakeLists.txt
@@ -55,12 +55,10 @@
 if(ANDROID)
   list(APPEND ASAN_UNITTEST_COMMON_CFLAGS
     -DASAN_FLEXIBLE_MAPPING_AND_OFFSET=0
-    -DASAN_LOW_MEMORY=1
     -DASAN_NEEDS_SEGV=0)
 else()
   list(APPEND ASAN_UNITTEST_COMMON_CFLAGS
     -DASAN_FLEXIBLE_MAPPING_AND_OFFSET=1
-    -DASAN_LOW_MEMORY=0
     -DASAN_NEEDS_SEGV=1)
 endif()
 
diff --git a/lib/asan/tests/asan_noinst_test.cc b/lib/asan/tests/asan_noinst_test.cc
index 0b8063e..1681842 100644
--- a/lib/asan/tests/asan_noinst_test.cc
+++ b/lib/asan/tests/asan_noinst_test.cc
@@ -79,11 +79,20 @@
 
 
 TEST(AddressSanitizer, NoInstMallocTest) {
-#ifdef __arm__
-  MallocStress(300000);
-#else
-  MallocStress(1000000);
-#endif
+  MallocStress(ASAN_LOW_MEMORY ? 300000 : 1000000);
+}
+
+TEST(AddressSanitizer, ThreadedMallocStressTest) {
+  const int kNumThreads = 4;
+  const int kNumIterations = (ASAN_LOW_MEMORY) ? 10000 : 100000;
+  pthread_t t[kNumThreads];
+  for (int i = 0; i < kNumThreads; i++) {
+    PTHREAD_CREATE(&t[i], 0, (void* (*)(void *x))MallocStress,
+        (void*)kNumIterations);
+  }
+  for (int i = 0; i < kNumThreads; i++) {
+    PTHREAD_JOIN(t[i], 0);
+  }
 }
 
 static void PrintShadow(const char *tag, uptr ptr, size_t size) {
@@ -253,7 +262,7 @@
   stack.trace[0] = 0x890;
   stack.size = 1;
 
-  const int size = 32;
+  const int size = 1024;
   void *p = __asan::asan_malloc(size, &stack);
   __asan::asan_free(p, &stack, __asan::FROM_MALLOC);
   size_t i;
@@ -263,8 +272,7 @@
     __asan::asan_free(p1, &stack, __asan::FROM_MALLOC);
     if (p1 == p) break;
   }
-  // fprintf(stderr, "i=%ld\n", i);
-  EXPECT_GE(i, 100000U);
+  EXPECT_GE(i, 10000U);
   EXPECT_LT(i, max_i);
 }
 
@@ -455,7 +463,8 @@
   // asan_allocator2 does not keep huge chunks in free list, but unmaps them.
   // The chunk should be greater than the quarantine size,
   // otherwise it will be stuck in quarantine instead of being unmaped.
-  static const size_t kLargeMallocSize = 1 << 29;  // 512M
+  static const size_t kLargeMallocSize = (1 << 28) + 1;  // 256M
+  free(Ident(malloc(kLargeMallocSize)));  // Drain quarantine.
   uptr old_heap_size = __asan_get_heap_size();
   for (int i = 0; i < 3; i++) {
     // fprintf(stderr, "allocating %zu bytes:\n", kLargeMallocSize);
@@ -857,3 +866,21 @@
   EXPECT_EQ(0L, Ident(p));
 #endif
 }
+
+TEST(AddressSanitizerInterface, CallocReturnsZeroMem) {
+  size_t sizes[] = {16, 1000, 10000, 100000, 2100000};
+  for (size_t s = 0; s < ARRAY_SIZE(sizes); s++) {
+    size_t size = sizes[s];
+    for (size_t iter = 0; iter < 5; iter++) {
+      char *x = Ident((char*)calloc(1, size));
+      EXPECT_EQ(x[0], 0);
+      EXPECT_EQ(x[size - 1], 0);
+      EXPECT_EQ(x[size / 2], 0);
+      EXPECT_EQ(x[size / 3], 0);
+      EXPECT_EQ(x[size / 4], 0);
+      memset(x, 0x42, size);
+      free(Ident(x));
+      free(Ident(malloc(Ident(1 << 27))));  // Try to drain the quarantine.
+    }
+  }
+}
diff --git a/lib/asan/tests/asan_test.cc b/lib/asan/tests/asan_test.cc
index 1096c2e..96df29e 100644
--- a/lib/asan/tests/asan_test.cc
+++ b/lib/asan/tests/asan_test.cc
@@ -277,41 +277,6 @@
 }  // namespace
 #endif
 
-static void MallocStress(size_t n) {
-  uint32_t seed = my_rand();
-  for (size_t iter = 0; iter < 10; iter++) {
-    vector<void *> vec;
-    for (size_t i = 0; i < n; i++) {
-      if ((i % 3) == 0) {
-        if (vec.empty()) continue;
-        size_t idx = my_rand_r(&seed) % vec.size();
-        void *ptr = vec[idx];
-        vec[idx] = vec.back();
-        vec.pop_back();
-        free_aaa(ptr);
-      } else {
-        size_t size = my_rand_r(&seed) % 1000 + 1;
-#ifndef __APPLE__
-        size_t alignment = 1 << (my_rand_r(&seed) % 7 + 3);
-        char *ptr = (char*)memalign_aaa(alignment, size);
-#else
-        char *ptr = (char*) malloc_aaa(size);
-#endif
-        vec.push_back(ptr);
-        ptr[0] = 0;
-        ptr[size-1] = 0;
-        ptr[size/2] = 0;
-      }
-    }
-    for (size_t i = 0; i < vec.size(); i++)
-      free_aaa(vec[i]);
-  }
-}
-
-TEST(AddressSanitizer, MallocStressTest) {
-  MallocStress((ASAN_LOW_MEMORY) ? 20000 : 200000);
-}
-
 static void TestLargeMalloc(size_t size) {
   char buff[1024];
   sprintf(buff, "is located 1 bytes to the left of %lu-byte", (long)size);
@@ -319,26 +284,17 @@
 }
 
 TEST(AddressSanitizer, LargeMallocTest) {
-  for (int i = 113; i < (1 << 28); i = i * 2 + 13) {
+  const int max_size = (SANITIZER_WORDSIZE == 32) ? 1 << 26 : 1 << 28;
+  for (int i = 113; i < max_size; i = i * 2 + 13) {
     TestLargeMalloc(i);
   }
 }
 
-#if ASAN_LOW_MEMORY != 1
 TEST(AddressSanitizer, HugeMallocTest) {
-#ifdef __APPLE__
-  // It was empirically found out that 1215 megabytes is the maximum amount of
-  // memory available to the process under AddressSanitizer on 32-bit Mac 10.6.
-  // 32-bit Mac 10.7 gives even less (< 1G).
-  // (the libSystem malloc() allows allocating up to 2300 megabytes without
-  // ASan).
-  size_t n_megs = SANITIZER_WORDSIZE == 32 ? 500 : 4100;
-#else
-  size_t n_megs = SANITIZER_WORDSIZE == 32 ? 2600 : 4100;
-#endif
+  if (SANITIZER_WORDSIZE != 64) return;
+  size_t n_megs = 4100;
   TestLargeMalloc(n_megs << 20);
 }
-#endif
 
 #ifndef __APPLE__
 void MemalignRun(size_t align, size_t size, int idx) {
@@ -358,19 +314,6 @@
 }
 #endif
 
-TEST(AddressSanitizer, ThreadedMallocStressTest) {
-  const int kNumThreads = 4;
-  const int kNumIterations = (ASAN_LOW_MEMORY) ? 10000 : 100000;
-  pthread_t t[kNumThreads];
-  for (int i = 0; i < kNumThreads; i++) {
-    PTHREAD_CREATE(&t[i], 0, (void* (*)(void *x))MallocStress,
-        (void*)kNumIterations);
-  }
-  for (int i = 0; i < kNumThreads; i++) {
-    PTHREAD_JOIN(t[i], 0);
-  }
-}
-
 void *ManyThreadsWorker(void *a) {
   for (int iter = 0; iter < 100; iter++) {
     for (size_t size = 100; size < 2000; size *= 2) {
diff --git a/lib/asan/tests/asan_test_config.h b/lib/asan/tests/asan_test_config.h
index 1d28e99..6eb33ce 100644
--- a/lib/asan/tests/asan_test_config.h
+++ b/lib/asan/tests/asan_test_config.h
@@ -47,10 +47,6 @@
 # error "please define ASAN_NEEDS_SEGV"
 #endif
 
-#ifndef ASAN_LOW_MEMORY
-# define ASAN_LOW_MEMORY 0
-#endif
-
 #ifndef ASAN_AVOID_EXPENSIVE_TESTS
 # define ASAN_AVOID_EXPENSIVE_TESTS 0
 #endif
diff --git a/lib/interception/interception.h b/lib/interception/interception.h
index b6be72c..2ccc903 100644
--- a/lib/interception/interception.h
+++ b/lib/interception/interception.h
@@ -23,20 +23,12 @@
 
 // These typedefs should be used only in the interceptor definitions to replace
 // the standard system types (e.g. SSIZE_T instead of ssize_t)
-typedef __sanitizer::uptr SIZE_T;
-typedef __sanitizer::sptr SSIZE_T;
-typedef __sanitizer::sptr PTRDIFF_T;
-typedef __sanitizer::s64  INTMAX_T;
-// WARNING: OFF_T may be different from OS type off_t, depending on the value of
-// _FILE_OFFSET_BITS. This definition of OFF_T matches the ABI of system calls
-// like pread and mmap, as opposed to pread64 and mmap64.
-// Mac and Linux/x86-64 are special.
-#if defined(__APPLE__) || (defined(__linux__) && defined(__x86_64__))
-typedef __sanitizer::u64 OFF_T;
-#else
-typedef __sanitizer::uptr OFF_T;
-#endif
-typedef __sanitizer::u64  OFF64_T;
+typedef __sanitizer::uptr    SIZE_T;
+typedef __sanitizer::sptr    SSIZE_T;
+typedef __sanitizer::sptr    PTRDIFF_T;
+typedef __sanitizer::s64     INTMAX_T;
+typedef __sanitizer::OFF_T   OFF_T;
+typedef __sanitizer::OFF64_T OFF64_T;
 
 // How to add an interceptor:
 // Suppose you need to wrap/replace system function (generally, from libc):
diff --git a/lib/msan/CMakeLists.txt b/lib/msan/CMakeLists.txt
index fa057a6..6f10942 100644
--- a/lib/msan/CMakeLists.txt
+++ b/lib/msan/CMakeLists.txt
@@ -11,6 +11,7 @@
   )
 set(MSAN_RTL_CFLAGS
   ${SANITIZER_COMMON_CFLAGS}
+  -fno-rtti
   -fPIE
   # Prevent clang from generating libc calls.
   -ffreestanding)
diff --git a/lib/msan/lit_tests/c-strdup.c b/lib/msan/lit_tests/c-strdup.c
new file mode 100644
index 0000000..7772f0f
--- /dev/null
+++ b/lib/msan/lit_tests/c-strdup.c
@@ -0,0 +1,17 @@
+// RUN: %clang_msan -m64 -O0 %s -o %t && %t >%t.out 2>&1
+// RUN: %clang_msan -m64 -O1 %s -o %t && %t >%t.out 2>&1
+// RUN: %clang_msan -m64 -O2 %s -o %t && %t >%t.out 2>&1
+// RUN: %clang_msan -m64 -O3 %s -o %t && %t >%t.out 2>&1
+
+// Test that strdup in C programs is intercepted.
+// GLibC headers translate strdup to __strdup at -O1 and higher.
+
+#include <stdlib.h>
+#include <string.h>
+int main(int argc, char **argv) {
+  char buf[] = "abc";
+  char *p = strdup(buf);
+  if (*p)
+    exit(0);
+  return 0;
+}
diff --git a/lib/msan/lit_tests/lit.cfg b/lib/msan/lit_tests/lit.cfg
index 9429050..07b5cbe 100644
--- a/lib/msan/lit_tests/lit.cfg
+++ b/lib/msan/lit_tests/lit.cfg
@@ -57,14 +57,17 @@
 lit.load_config(config, compiler_rt_lit_cfg)
 
 # Setup default compiler flags used with -fsanitize=memory option.
-clang_msan_cxxflags = ["-ccc-cxx ",
-                       "-fsanitize=memory",
-                       "-mno-omit-leaf-frame-pointer",
-                       "-fno-omit-frame-pointer",
-                       "-fno-optimize-sibling-calls",
-                       "-g",
-                       "-fPIE",
-                       "-pie"]
+clang_msan_cflags = ["-fsanitize=memory",
+                     "-mno-omit-leaf-frame-pointer",
+                     "-fno-omit-frame-pointer",
+                     "-fno-optimize-sibling-calls",
+                     "-g",
+                     "-fPIE",
+                     "-pie"]
+clang_msan_cxxflags = ["-ccc-cxx "] + clang_msan_cflags
+config.substitutions.append( ("%clang_msan ",
+                              " ".join([config.clang] + clang_msan_cflags) + 
+                              " ") )
 config.substitutions.append( ("%clangxx_msan ",
                               " ".join([config.clang] + clang_msan_cxxflags) + 
                               " ") )
diff --git a/lib/msan/lit_tests/no_sanitize_memory.cc b/lib/msan/lit_tests/no_sanitize_memory.cc
new file mode 100644
index 0000000..48afc17
--- /dev/null
+++ b/lib/msan/lit_tests/no_sanitize_memory.cc
@@ -0,0 +1,34 @@
+// RUN: %clangxx_msan -m64 -O0 %s -o %t && %t >%t.out 2>&1
+// RUN: %clangxx_msan -m64 -O1 %s -o %t && %t >%t.out 2>&1
+// RUN: %clangxx_msan -m64 -O2 %s -o %t && %t >%t.out 2>&1
+// RUN: %clangxx_msan -m64 -O3 %s -o %t && %t >%t.out 2>&1
+
+// RUN: %clangxx_msan -m64 -O0 %s -o %t -DCHECK_IN_F && %t >%t.out 2>&1
+// RUN: %clangxx_msan -m64 -O1 %s -o %t -DCHECK_IN_F && %t >%t.out 2>&1
+// RUN: %clangxx_msan -m64 -O2 %s -o %t -DCHECK_IN_F && %t >%t.out 2>&1
+// RUN: %clangxx_msan -m64 -O3 %s -o %t -DCHECK_IN_F && %t >%t.out 2>&1
+
+// Test that (no_sanitize_memory) functions
+// * don't check shadow values (-DCHECK_IN_F)
+// * treat all values loaded from memory as fully initialized (-UCHECK_IN_F)
+
+#include <stdlib.h>
+#include <stdio.h>
+
+__attribute__((noinline))
+__attribute__((no_sanitize_memory))
+int f(void) {
+  int x;
+  int * volatile p = &x;
+#ifdef CHECK_IN_F
+  if (*p)
+    exit(0);
+#endif
+  return *p;
+}
+
+int main(void) {
+  if (f())
+    exit(0);
+  return 0;
+}
diff --git a/lib/msan/lit_tests/no_sanitize_memory_prop.cc b/lib/msan/lit_tests/no_sanitize_memory_prop.cc
new file mode 100644
index 0000000..c74ca6b
--- /dev/null
+++ b/lib/msan/lit_tests/no_sanitize_memory_prop.cc
@@ -0,0 +1,33 @@
+// RUN: %clangxx_msan -m64 -O0 %s -o %t && %t >%t.out 2>&1
+// RUN: %clangxx_msan -m64 -O1 %s -o %t && not %t >%t.out 2>&1
+// RUN: FileCheck %s < %t.out
+// RUN: %clangxx_msan -m64 -O2 %s -o %t && not %t >%t.out 2>&1
+// RUN: FileCheck %s < %t.out
+// RUN: %clangxx_msan -m64 -O3 %s -o %t && not %t >%t.out 2>&1
+// RUN: FileCheck %s < %t.out
+
+// Test that (no_sanitize_memory) functions propagate shadow.
+
+// Note that at -O0 there is no report, because 'x' in 'f' is spilled to the
+// stack, and then loaded back as a fully initialiazed value (due to
+// no_sanitize_memory attribute).
+
+#include <stdlib.h>
+#include <stdio.h>
+
+__attribute__((noinline))
+__attribute__((no_sanitize_memory))
+int f(int x) {
+  return x;
+}
+
+int main(void) {
+  int x;
+  int * volatile p = &x;
+  int y = f(*p);
+  // CHECK: WARNING: Use of uninitialized value
+  // CHECK: {{#0 0x.* in main .*no_sanitize_memory_prop.cc:}}[[@LINE+1]]
+  if (y)
+    exit(0);
+  return 0;
+}
diff --git a/lib/msan/lit_tests/readdir64.cc b/lib/msan/lit_tests/readdir64.cc
new file mode 100644
index 0000000..0ec106c
--- /dev/null
+++ b/lib/msan/lit_tests/readdir64.cc
@@ -0,0 +1,27 @@
+// RUN: %clangxx_msan -m64 -O0 %s -o %t && %t
+// RUN: %clangxx_msan -m64 -O1 %s -o %t && %t
+// RUN: %clangxx_msan -m64 -O2 %s -o %t && %t
+// RUN: %clangxx_msan -m64 -O3 %s -o %t && %t
+
+// RUN: %clangxx_msan -m64 -O0 -D_FILE_OFFSET_BITS=64 %s -o %t && %t
+// RUN: %clangxx_msan -m64 -O1 -D_FILE_OFFSET_BITS=64 %s -o %t && %t
+// RUN: %clangxx_msan -m64 -O2 -D_FILE_OFFSET_BITS=64 %s -o %t && %t
+// RUN: %clangxx_msan -m64 -O3 -D_FILE_OFFSET_BITS=64 %s -o %t && %t
+
+// Test that readdir64 is intercepted as well as readdir.
+
+#include <sys/types.h>
+#include <dirent.h>
+#include <stdlib.h>
+
+
+int main(void) {
+  DIR *dir = opendir(".");
+  struct dirent *d = readdir(dir);
+  if (d->d_name[0]) {
+    closedir(dir);
+    exit(0);
+  }
+  closedir(dir);
+  return 0;
+}
diff --git a/lib/msan/msan.cc b/lib/msan/msan.cc
index ba27234..a1f8d0f 100644
--- a/lib/msan/msan.cc
+++ b/lib/msan/msan.cc
@@ -59,6 +59,7 @@
 } __msan_stack_bounds;
 
 static THREADLOCAL bool is_in_symbolizer;
+static THREADLOCAL bool is_in_loader;
 
 extern "C" const int __msan_track_origins;
 int __msan_get_track_origins() {
@@ -73,7 +74,7 @@
   const sptr kBufSize = 4095;
   char *filename = (char*)MmapOrDie(kBufSize, __FUNCTION__);
   while (proc_maps.Next(/* start */0, /* end */0, /* file_offset */0,
-                        filename, kBufSize)) {
+                        filename, kBufSize, /* protection */0)) {
     if (internal_strstr(filename, "libdynamorio") != 0) {
       result = true;
       break;
@@ -87,6 +88,14 @@
 void ExitSymbolizer()  { is_in_symbolizer = false; }
 bool IsInSymbolizer() { return is_in_symbolizer; }
 
+void EnterLoader() { is_in_loader = true; }
+void ExitLoader()  { is_in_loader = false; }
+
+extern "C" {
+SANITIZER_INTERFACE_ATTRIBUTE
+bool __msan_is_in_loader() { return is_in_loader; }
+}
+
 static Flags msan_flags;
 
 Flags *flags() {
@@ -118,6 +127,8 @@
   ParseFlag(str, &f->report_umrs, "report_umrs");
   ParseFlag(str, &f->verbosity, "verbosity");
   ParseFlag(str, &f->strip_path_prefix, "strip_path_prefix");
+  ParseFlag(str, &f->fast_unwind_on_fatal, "fast_unwind_on_fatal");
+  ParseFlag(str, &f->fast_unwind_on_malloc, "fast_unwind_on_malloc");
 }
 
 static void InitializeFlags(Flags *f, const char *options) {
@@ -131,6 +142,8 @@
   f->report_umrs = true;
   f->verbosity = 0;
   f->strip_path_prefix = "";
+  f->fast_unwind_on_fatal = false;
+  f->fast_unwind_on_malloc = true;
 
   // Override from user-specified string.
   if (__msan_default_options)
@@ -153,8 +166,11 @@
 
 void GetStackTrace(StackTrace *stack, uptr max_s, uptr pc, uptr bp,
                    bool fast) {
-  if (!fast)
+  if (!fast) {
+    // Block reports from our interceptors during _Unwind_Backtrace.
+    SymbolizerScope sym_scope;
     return stack->SlowUnwindStack(pc, max_s);
+  }
 
   uptr stack_top, stack_bottom;
   GetCurrentStackBounds(&stack_top, &stack_bottom);
@@ -183,7 +199,7 @@
   ++msan_report_count;
 
   StackTrace stack;
-  GetStackTrace(&stack, kStackTraceMax, pc, bp, /*fast*/false);
+  GetStackTrace(&stack, kStackTraceMax, pc, bp, flags()->fast_unwind_on_fatal);
 
   u32 report_origin =
     (__msan_track_origins && OriginIsValid(origin)) ? origin : 0;
@@ -223,6 +239,7 @@
 
   InstallAtExitHandler();
   SetDieCallback(MsanDie);
+  InitTlsSize();
   InitializeInterceptors();
 
   ReplaceOperatorsNewAndDelete();
@@ -281,7 +298,8 @@
     GET_CALLER_PC_BP_SP;
     (void)sp;
     StackTrace stack;
-    GetStackTrace(&stack, kStackTraceMax, pc, bp, /*fast*/false);
+    GetStackTrace(&stack, kStackTraceMax, pc, bp,
+                  flags()->fast_unwind_on_fatal);
     ReportExpectedUMRNotFound(&stack);
     Die();
   }
diff --git a/lib/msan/msan.h b/lib/msan/msan.h
index 2150c83..1880b8e 100644
--- a/lib/msan/msan.h
+++ b/lib/msan/msan.h
@@ -26,6 +26,8 @@
 #define MEM_IS_SHADOW(mem) ((uptr)mem >=         0x200000000000ULL && \
                             (uptr)mem <=         0x400000000000ULL)
 
+struct link_map;  // Opaque type returned by dlopen().
+
 const int kMsanParamTlsSizeInWords = 100;
 const int kMsanRetvalTlsSizeInWords = 100;
 
@@ -50,6 +52,14 @@
 void ExitSymbolizer();
 bool IsInSymbolizer();
 
+struct SymbolizerScope {
+  SymbolizerScope() { EnterSymbolizer(); }
+  ~SymbolizerScope() { ExitSymbolizer(); }
+};
+
+void EnterLoader();
+void ExitLoader();
+
 void MsanDie();
 void PrintWarning(uptr pc, uptr bp);
 void PrintWarningWithOrigin(uptr pc, uptr bp, u32 origin);
@@ -61,13 +71,15 @@
 void ReportExpectedUMRNotFound(StackTrace *stack);
 void ReportAtExitStatistics();
 
+void UnpoisonMappedDSO(struct link_map *map);
+
 #define GET_MALLOC_STACK_TRACE                                     \
   StackTrace stack;                                                \
   stack.size = 0;                                                  \
   if (__msan_get_track_origins() && msan_inited)                   \
     GetStackTrace(&stack, flags()->num_callers,                    \
         StackTrace::GetCurrentPc(), GET_CURRENT_FRAME(),           \
-        /* fast */ true)
+        flags()->fast_unwind_on_malloc)
 
 }  // namespace __msan
 
diff --git a/lib/msan/msan_flags.h b/lib/msan/msan_flags.h
index 0c41c2e..cfaf963 100644
--- a/lib/msan/msan_flags.h
+++ b/lib/msan/msan_flags.h
@@ -26,6 +26,10 @@
   bool poison_in_malloc;  // default: true
   bool report_umrs;
   const char *strip_path_prefix;
+  // Use fast (frame-pointer-based) unwinder on fatal errors (if available).
+  bool fast_unwind_on_fatal;
+  // Use fast (frame-pointer-based) unwinder on malloc/free (if available).
+  bool fast_unwind_on_malloc;
 };
 
 Flags *flags();
diff --git a/lib/msan/msan_interceptors.cc b/lib/msan/msan_interceptors.cc
index a6f25e8..287d2d2 100644
--- a/lib/msan/msan_interceptors.cc
+++ b/lib/msan/msan_interceptors.cc
@@ -91,6 +91,13 @@
   return res;
 }
 
+INTERCEPTOR(void *, readdir64, void *a) {
+  ENSURE_MSAN_INITED();
+  void *res = REAL(readdir)(a);
+  __msan_unpoison(res, __sanitizer::struct_dirent64_sz);
+  return res;
+}
+
 INTERCEPTOR(void *, memcpy, void *dest, const void *src, SIZE_T n) {
   return __msan_memcpy(dest, src, n);
 }
@@ -161,6 +168,32 @@
   return res;
 }
 
+INTERCEPTOR(char *, __strdup, char *src) {
+  ENSURE_MSAN_INITED();
+  SIZE_T n = REAL(strlen)(src);
+  char *res = REAL(__strdup)(src);
+  __msan_copy_poison(res, src, n + 1);
+  return res;
+}
+
+INTERCEPTOR(char *, strndup, char *src, SIZE_T n) {
+  ENSURE_MSAN_INITED();
+  SIZE_T copy_size = REAL(strnlen)(src, n);
+  char *res = REAL(strndup)(src, n);
+  __msan_copy_poison(res, src, copy_size);
+  __msan_unpoison(res + copy_size, 1); // \0
+  return res;
+}
+
+INTERCEPTOR(char *, __strndup, char *src, SIZE_T n) {
+  ENSURE_MSAN_INITED();
+  SIZE_T copy_size = REAL(strnlen)(src, n);
+  char *res = REAL(__strndup)(src, n);
+  __msan_copy_poison(res, src, copy_size);
+  __msan_unpoison(res + copy_size, 1); // \0
+  return res;
+}
+
 INTERCEPTOR(char *, gcvt, double number, SIZE_T ndigit, char *buf) {
   ENSURE_MSAN_INITED();
   char *res = REAL(gcvt)(number, ndigit, buf);
@@ -762,6 +795,25 @@
   return res;
 }
 
+// dlopen() ultimately calls mmap() down inside the loader, which generally
+// doesn't participate in dynamic symbol resolution.  Therefore we won't
+// intercept its calls to mmap, and we have to hook it here.  The loader
+// initializes the module before returning, so without the dynamic component, we
+// won't be able to clear the shadow before the initializers.  Fixing this would
+// require putting our own initializer first to clear the shadow.
+INTERCEPTOR(void *, dlopen, const char *filename, int flag) {
+  ENSURE_MSAN_INITED();
+  EnterLoader();
+  link_map *map = (link_map *)REAL(dlopen)(filename, flag);
+  ExitLoader();
+  if (!__msan_has_dynamic_component()) {
+    // If msandr didn't clear the shadow before the initializers ran, we do it
+    // ourselves afterwards.
+    UnpoisonMappedDSO(map);
+  }
+  return (void *)map;
+}
+
 INTERCEPTOR(int, getrusage, int who, void *usage) {
   ENSURE_MSAN_INITED();
   int res = REAL(getrusage)(who, usage);
@@ -771,6 +823,36 @@
   return res;
 }
 
+extern "C" int pthread_attr_init(void *attr);
+extern "C" int pthread_attr_destroy(void *attr);
+extern "C" int pthread_attr_setstacksize(void *attr, uptr stacksize);
+extern "C" int pthread_attr_getstacksize(void *attr, uptr *stacksize);
+
+INTERCEPTOR(int, pthread_create, void *th, void *attr, void *(*callback)(void*),
+            void * param) {
+  ENSURE_MSAN_INITED(); // for GetTlsSize()
+  __sanitizer_pthread_attr_t myattr;
+  if (attr == 0) {
+    pthread_attr_init(&myattr);
+    attr = &myattr;
+  }
+  uptr stacksize = 0;
+  pthread_attr_getstacksize(attr, &stacksize);
+  // We place the huge ThreadState object into TLS, account for that.
+  const uptr minstacksize = GetTlsSize() + 128*1024;
+  if (stacksize < minstacksize) {
+    if (flags()->verbosity)
+      Printf("MemorySanitizer: increasing stacksize %zu->%zu\n", stacksize,
+             minstacksize);
+    pthread_attr_setstacksize(attr, minstacksize);
+  }
+
+  int res = REAL(pthread_create)(th, attr, callback, param);
+  if (attr == &myattr)
+    pthread_attr_destroy(&myattr);
+  return res;
+}
+
 #define COMMON_INTERCEPTOR_WRITE_RANGE(ctx, ptr, size) \
     __msan_unpoison(ptr, size)
 #define COMMON_INTERCEPTOR_READ_RANGE(ctx, ptr, size) do { } while (false)
@@ -908,6 +990,7 @@
   INTERCEPT_FUNCTION(fread_unlocked);
   INTERCEPT_FUNCTION(readlink);
   INTERCEPT_FUNCTION(readdir);
+  INTERCEPT_FUNCTION(readdir64);
   INTERCEPT_FUNCTION(memcpy);
   INTERCEPT_FUNCTION(memset);
   INTERCEPT_FUNCTION(memmove);
@@ -916,6 +999,9 @@
   INTERCEPT_FUNCTION(wmemmove);
   INTERCEPT_FUNCTION(strcpy);  // NOLINT
   INTERCEPT_FUNCTION(strdup);
+  INTERCEPT_FUNCTION(__strdup);
+  INTERCEPT_FUNCTION(strndup);
+  INTERCEPT_FUNCTION(__strndup);
   INTERCEPT_FUNCTION(strncpy);  // NOLINT
   INTERCEPT_FUNCTION(strlen);
   INTERCEPT_FUNCTION(strnlen);
@@ -973,7 +1059,9 @@
   INTERCEPT_FUNCTION(recvfrom);
   INTERCEPT_FUNCTION(recvmsg);
   INTERCEPT_FUNCTION(dladdr);
+  INTERCEPT_FUNCTION(dlopen);
   INTERCEPT_FUNCTION(getrusage);
+  INTERCEPT_FUNCTION(pthread_create);
   inited = 1;
 }
 }  // namespace __msan
diff --git a/lib/msan/msan_interface_internal.h b/lib/msan/msan_interface_internal.h
index 905c5b7..e1cd13c 100644
--- a/lib/msan/msan_interface_internal.h
+++ b/lib/msan/msan_interface_internal.h
@@ -104,6 +104,10 @@
 SANITIZER_INTERFACE_ATTRIBUTE
 int __msan_get_param_tls_offset();
 
+// For intercepting mmap from ld.so in msandr.
+SANITIZER_INTERFACE_ATTRIBUTE
+bool __msan_is_in_loader();
+
 // For testing.
 SANITIZER_INTERFACE_ATTRIBUTE
 u32 __msan_get_umr_origin();
diff --git a/lib/msan/msan_linux.cc b/lib/msan/msan_linux.cc
index 0b08b7d..cda23b1 100644
--- a/lib/msan/msan_linux.cc
+++ b/lib/msan/msan_linux.cc
@@ -16,6 +16,9 @@
 
 #include "msan.h"
 
+#include <algorithm>
+#include <elf.h>
+#include <link.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <signal.h>
@@ -87,6 +90,42 @@
 void InstallAtExitHandler() {
   atexit(MsanAtExit);
 }
+
+void UnpoisonMappedDSO(link_map *map) {
+  typedef ElfW(Phdr) Elf_Phdr;
+  typedef ElfW(Ehdr) Elf_Ehdr;
+  char *base = (char *)map->l_addr;
+  Elf_Ehdr *ehdr = (Elf_Ehdr *)base;
+  char *phdrs = base + ehdr->e_phoff;
+  char *phdrs_end = phdrs + ehdr->e_phnum * ehdr->e_phentsize;
+
+  // Find the segment with the minimum base so we can "relocate" the p_vaddr
+  // fields.  Typically ET_DYN objects (DSOs) have base of zero and ET_EXEC
+  // objects have a non-zero base.
+  uptr preferred_base = ~0ULL;
+  for (char *iter = phdrs; iter != phdrs_end; iter += ehdr->e_phentsize) {
+    Elf_Phdr *phdr = (Elf_Phdr *)iter;
+    if (phdr->p_type == PT_LOAD)
+      preferred_base = std::min(preferred_base, (uptr)phdr->p_vaddr);
+  }
+
+  // Compute the delta from the real base to get a relocation delta.
+  sptr delta = (uptr)base - preferred_base;
+  // Now we can figure out what the loader really mapped.
+  for (char *iter = phdrs; iter != phdrs_end; iter += ehdr->e_phentsize) {
+    Elf_Phdr *phdr = (Elf_Phdr *)iter;
+    if (phdr->p_type == PT_LOAD) {
+      uptr seg_start = phdr->p_vaddr + delta;
+      uptr seg_end = seg_start + phdr->p_memsz;
+      // None of these values are aligned.  We consider the ragged edges of the
+      // load command as defined, since they are mapped from the file.
+      seg_start = RoundDownTo(seg_start, GetPageSizeCached());
+      seg_end = RoundUpTo(seg_end, GetPageSizeCached());
+      __msan_unpoison((void *)seg_start, seg_end - seg_start);
+    }
+  }
+}
+
 }  // namespace __msan
 
 #endif  // __linux__
diff --git a/lib/msan/msan_report.cc b/lib/msan/msan_report.cc
index 16b13f6..df6990f 100644
--- a/lib/msan/msan_report.cc
+++ b/lib/msan/msan_report.cc
@@ -44,11 +44,6 @@
   const char *End()    { return Default(); }
 };
 
-struct SymbolizerScope {
-  SymbolizerScope() { EnterSymbolizer(); }
-  ~SymbolizerScope() { ExitSymbolizer(); }
-};
-
 static void PrintStack(const uptr *trace, uptr size) {
   SymbolizerScope sym_scope;
   StackTrace::PrintStack(trace, size, true, flags()->strip_path_prefix, 0);
diff --git a/lib/msan/tests/CMakeLists.txt b/lib/msan/tests/CMakeLists.txt
index 7067c45..813aad0 100644
--- a/lib/msan/tests/CMakeLists.txt
+++ b/lib/msan/tests/CMakeLists.txt
@@ -32,6 +32,7 @@
 
 # Unittest sources and build flags.
 set(MSAN_UNITTEST_SOURCE msan_test.cc)
+set(MSAN_LOADABLE_SOURCE msan_loadable.cc)
 set(MSAN_UNITTEST_HEADERS
   msandr_test_so.h
   ../../../include/sanitizer/msan_interface.h
@@ -65,6 +66,10 @@
   # FIXME: we build libcxx without cxxabi and need libstdc++ to provide it.
   -lstdc++
 )
+set(MSAN_LOADABLE_LINK_FLAGS
+  -fsanitize=memory
+  -shared
+)
 
 # Compile source for the given architecture, using compiler
 # options in ${ARGN}, and add it to the object list.
@@ -96,6 +101,7 @@
   add_compiler_rt_test(${test_suite} ${test_name}
                        OBJECTS ${ARGN}
                        DEPS ${MSAN_RUNTIME_LIBRARIES} ${ARGN}
+                            ${MSAN_LOADABLE_SO}
                        LINK_FLAGS ${MSAN_UNITTEST_LINK_FLAGS}
                                   ${TARGET_LINK_FLAGS}
                                   "-Wl,-rpath=${CMAKE_CURRENT_BINARY_DIR}")
@@ -130,11 +136,22 @@
   msan_compile(MSAN_INST_TEST_OBJECTS ${MSAN_UNITTEST_SOURCE} ${arch}
                ${MSAN_UNITTEST_INSTRUMENTED_CFLAGS})
 
+  # Instrumented loadable module objects.
+  set(MSAN_INST_LOADABLE_OBJECTS)
+  msan_compile(MSAN_INST_LOADABLE_OBJECTS ${MSAN_LOADABLE_SOURCE} ${arch}
+               ${MSAN_UNITTEST_INSTRUMENTED_CFLAGS})
+
   # Uninstrumented shared object for MSanDR tests.
   set(MSANDR_TEST_OBJECTS)
   msan_compile(MSANDR_TEST_OBJECTS ${MSANDR_UNITTEST_SOURCE} ${arch}
                ${MSAN_UNITTEST_COMMON_CFLAGS})
 
+  # Instrumented loadable library tests.
+  set(MSAN_LOADABLE_SO)
+  msan_link_shared(MSAN_LOADABLE_SO "libmsan_loadable" ${arch}
+                   OBJECTS ${MSAN_INST_LOADABLE_OBJECTS}
+                   DEPS ${MSAN_INST_LOADABLE_OBJECTS} ${MSAN_RUNTIME_LIBRARIES})
+
   # Uninstrumented shared library tests.
   set(MSANDR_TEST_SO)
   msan_link_shared(MSANDR_TEST_SO "libmsandr_test" ${arch}
diff --git a/lib/msan/tests/msan_loadable.cc b/lib/msan/tests/msan_loadable.cc
new file mode 100644
index 0000000..db3bf48
--- /dev/null
+++ b/lib/msan/tests/msan_loadable.cc
@@ -0,0 +1,45 @@
+//===-- msan_loadable.cc --------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of MemorySanitizer.
+//
+// MemorySanitizer unit tests.
+//===----------------------------------------------------------------------===//
+
+#include "msan/msan_interface_internal.h"
+#include <stdlib.h>
+
+static void *dso_global;
+
+// No name mangling.
+extern "C" {
+
+__attribute__((constructor))
+void loadable_module_init(void) {
+  if (!__msan_has_dynamic_component())
+    return;
+  // The real test is that this compare should not make an uninit.
+  if (dso_global == NULL)
+    dso_global = malloc(4);
+}
+
+__attribute__((destructor))
+void loadable_module_fini(void) {
+  if (!__msan_has_dynamic_component())
+    return;
+  free(dso_global);
+  // *Don't* overwrite it with NULL!  That would unpoison it, but our test
+  // relies on reloading at the same address and keeping the poison.
+}
+
+void **get_dso_global() {
+  return &dso_global;
+}
+
+}
diff --git a/lib/msan/tests/msan_test.cc b/lib/msan/tests/msan_test.cc
index b30a8df..3380c13 100644
--- a/lib/msan/tests/msan_test.cc
+++ b/lib/msan/tests/msan_test.cc
@@ -21,6 +21,7 @@
 #include <stdio.h>
 #include <assert.h>
 #include <wchar.h>
+#include <math.h>
 
 #include <dlfcn.h>
 #include <unistd.h>
@@ -34,6 +35,8 @@
 #include <sys/utsname.h>
 #include <sys/mman.h>
 #include <sys/vfs.h>
+#include <sys/types.h>
+#include <dirent.h>
 
 #if defined(__i386__) || defined(__x86_64__)
 # include <emmintrin.h>
@@ -612,6 +615,14 @@
   free(res);
 }
 
+TEST(MemorySanitizer, readdir) {
+  DIR *dir = opendir(".");
+  struct dirent *d = readdir(dir);
+  assert(d);
+  EXPECT_NOT_POISONED(d->d_name[0]);
+  closedir(dir);
+}
+
 TEST(MemorySanitizer, realpath) {
   const char* relpath = ".";
   char path[PATH_MAX + 1];
@@ -641,11 +652,39 @@
 }
 
 TEST(MemorySanitizer, strdup) {
-  char *x = strdup("zzz");
-  EXPECT_NOT_POISONED(*x);
+  char buf[4] = "abc";
+  __msan_poison(buf + 2, sizeof(*buf));
+  char *x = strdup(buf);
+  EXPECT_NOT_POISONED(x[0]);
+  EXPECT_NOT_POISONED(x[1]);
+  EXPECT_POISONED(x[2]);
+  EXPECT_NOT_POISONED(x[3]);
   free(x);
 }
 
+TEST(MemorySanitizer, strndup) {
+  char buf[4] = "abc";
+  __msan_poison(buf + 2, sizeof(*buf));
+  char *x = strndup(buf, 3);
+  EXPECT_NOT_POISONED(x[0]);
+  EXPECT_NOT_POISONED(x[1]);
+  EXPECT_POISONED(x[2]);
+  EXPECT_NOT_POISONED(x[3]);
+  free(x);
+}
+
+TEST(MemorySanitizer, strndup_short) {
+  char buf[4] = "abc";
+  __msan_poison(buf + 1, sizeof(*buf));
+  __msan_poison(buf + 2, sizeof(*buf));
+  char *x = strndup(buf, 2);
+  EXPECT_NOT_POISONED(x[0]);
+  EXPECT_POISONED(x[1]);
+  EXPECT_NOT_POISONED(x[2]);
+  free(x);
+}
+
+
 template<class T, int size>
 void TestOverlapMemmove() {
   T *x = new T[size];
@@ -860,6 +899,24 @@
   EXPECT_NOT_POISONED(b);
 }
 
+TEST(MemorySanitizer, frexp) {
+  int x;
+  x = *GetPoisoned<int>();
+  double r = frexp(1.1, &x);
+  EXPECT_NOT_POISONED(r);
+  EXPECT_NOT_POISONED(x);
+
+  x = *GetPoisoned<int>();
+  float rf = frexpf(1.1, &x);
+  EXPECT_NOT_POISONED(rf);
+  EXPECT_NOT_POISONED(x);
+
+  x = *GetPoisoned<int>();
+  double rl = frexpl(1.1, &x);
+  EXPECT_NOT_POISONED(rl);
+  EXPECT_NOT_POISONED(x);
+}
+
 struct StructWithDtor {
   ~StructWithDtor();
 };
@@ -1288,6 +1345,55 @@
   EXPECT_NOT_POISONED((unsigned long)info.dli_saddr);
 }
 
+#ifdef __GLIBC__
+extern "C" {
+  extern void *__libc_stack_end;
+}
+
+static char **GetArgv(void) {
+  uintptr_t *stack_end = (uintptr_t *)__libc_stack_end;
+  return (char**)(stack_end + 1);
+}
+
+#else  // __GLIBC__
+# error "TODO: port this"
+#endif
+
+TEST(MemorySanitizer, dlopen) {
+  // Compute the path to our loadable DSO.  We assume it's in the same
+  // directory.  Only use string routines that we intercept so far to do this.
+  char **argv = GetArgv();
+  const char *basename = "libmsan_loadable.x86_64.so";
+  size_t path_max = strlen(argv[0]) + 1 + strlen(basename) + 1;
+  char *path = new char[path_max];
+  char *last_slash = strrchr(argv[0], '/');
+  assert(last_slash);
+  snprintf(path, path_max, "%.*s/%s", int(last_slash - argv[0]),
+           argv[0], basename);
+
+  // We need to clear shadow for globals when doing dlopen.  In order to test
+  // this, we have to poison the shadow for the DSO before we load it.  In
+  // general this is difficult, but the loader tends to reload things in the
+  // same place, so we open, close, and then reopen.  The global should always
+  // start out clean after dlopen.
+  for (int i = 0; i < 2; i++) {
+    void *lib = dlopen(path, RTLD_LAZY);
+    if (lib == NULL) {
+      printf("dlerror: %s\n", dlerror());
+      assert(lib != NULL);
+    }
+    void **(*get_dso_global)() = (void **(*)())dlsym(lib, "get_dso_global");
+    assert(get_dso_global);
+    void **dso_global = get_dso_global();
+    EXPECT_NOT_POISONED(*dso_global);
+    __msan_poison(dso_global, sizeof(*dso_global));
+    EXPECT_POISONED(*dso_global);
+    dlclose(lib);
+  }
+
+  delete[] path;
+}
+
 TEST(MemorySanitizer, scanf) {
   const char *input = "42 hello";
   int* d = new int;
@@ -1323,6 +1429,27 @@
   delete (int*)p;
 }
 
+static void* SmallStackThread_threadfn(void* data) {
+  return 0;
+}
+
+TEST(MemorySanitizer, SmallStackThread) {
+  pthread_attr_t attr;
+  pthread_t t;
+  void* p;
+  int res;
+  res = pthread_attr_init(&attr);
+  ASSERT_EQ(0, res);
+  res = pthread_attr_setstacksize(&attr, 64 * 1024);
+  ASSERT_EQ(0, res);
+  res = pthread_create(&t, &attr, SimpleThread_threadfn, NULL);
+  ASSERT_EQ(0, res);
+  res = pthread_join(t, &p);
+  ASSERT_EQ(0, res);
+  res = pthread_attr_destroy(&attr);
+  ASSERT_EQ(0, res);
+}
+
 TEST(MemorySanitizer, uname) {
   struct utsname u;
   int res = uname(&u);
diff --git a/lib/msandr/msandr.cc b/lib/msandr/msandr.cc
index 235a1ed..d4c07a3 100644
--- a/lib/msandr/msandr.cc
+++ b/lib/msandr/msandr.cc
@@ -37,6 +37,7 @@
 #include <drsyscall.h>
 
 #include <sys/mman.h>
+#include <sys/syscall.h>  /* for SYS_mmap */
 
 #include <algorithm>
 #include <string>
@@ -103,6 +104,17 @@
 
 int(*__msan_get_retval_tls_offset)();
 int(*__msan_get_param_tls_offset)();
+void (*__msan_unpoison)(void *base, size_t size);
+bool (*__msan_is_in_loader)();
+
+static generic_func_t LookupCallback(module_data_t *app, const char *name) {
+  generic_func_t callback = dr_get_proc_address(app->handle, name);
+  if (callback == NULL) {
+    dr_printf("Couldn't find `%s` in %s\n", name, app->full_path);
+    CHECK(callback);
+  }
+  return callback;
+}
 
 void InitializeMSanCallbacks() {
   module_data_t *app = dr_lookup_module_by_name(dr_get_application_name());
@@ -113,25 +125,18 @@
   }
   g_app_path = app->full_path;
 
-  const char *callback_name = "__msan_get_retval_tls_offset";
-  __msan_get_retval_tls_offset =
-      (int(*)()) dr_get_proc_address(app->handle, callback_name);
-  if (__msan_get_retval_tls_offset == NULL) {
-    dr_printf("Couldn't find `%s` in %s\n", callback_name, app->full_path);
-    CHECK(__msan_get_retval_tls_offset);
-  }
+  __msan_get_retval_tls_offset = (int (*)())
+      LookupCallback(app, "__msan_get_retval_tls_offset");
+  __msan_get_param_tls_offset = (int (*)())
+      LookupCallback(app, "__msan_get_param_tls_offset");
+  __msan_unpoison = (void(*)(void *, size_t))
+      LookupCallback(app, "__msan_unpoison");
+  __msan_is_in_loader = (bool (*)())
+      LookupCallback(app, "__msan_is_in_loader");
 
-  callback_name = "__msan_get_param_tls_offset";
-  __msan_get_param_tls_offset =
-      (int(*)()) dr_get_proc_address(app->handle, callback_name);
-  if (__msan_get_param_tls_offset == NULL) {
-    dr_printf("Couldn't find `%s` in %s\n", callback_name, app->full_path);
-    CHECK(__msan_get_param_tls_offset);
-  }
+  dr_free_module_data(app);
 }
 
-#define MEM_TO_SHADOW(mem) ((mem) & ~0x400000000000ULL)
-
 // FIXME: Handle absolute addresses and PC-relative addresses.
 // FIXME: Handle TLS accesses via FS or GS.  DR assumes all other segments have
 // a zero base anyway.
@@ -506,6 +511,11 @@
 }
 
 void event_exit() {
+  // Clean up so DR doesn't tell us we're leaking memory.
+  drsys_exit();
+  drutil_exit();
+  drmgr_exit();
+
   if (VERBOSITY > 0)
     dr_printf("==DRMSAN== DONE\n");
 }
@@ -520,7 +530,7 @@
 
   if (arg->pre)
     return true;
-  if (arg->mode != DRSYS_PARAM_OUT)
+  if (!TESTANY(DRSYS_PARAM_OUT, arg->mode))
     return true;
 
   size_t sz = arg->size;
@@ -538,8 +548,19 @@
               (unsigned long long)(sz & 0xFFFFFFFF));
   }
 
-  void *p = (void *)MEM_TO_SHADOW((ptr_uint_t) arg->start_addr);
-  memset(p, 0, sz);
+  if (VERBOSITY > 0) {
+    drmf_status_t res;
+    drsys_syscall_t *syscall = (drsys_syscall_t *)user_data;
+    const char *name;
+    res = drsys_syscall_name(syscall, &name);
+    dr_printf("drsyscall: syscall '%s' arg %d wrote range [%p, %p)\n",
+              name, arg->ordinal, arg->start_addr,
+              (char *)arg->start_addr + sz);
+  }
+
+  // We don't switch to the app context because __msan_unpoison() doesn't need
+  // TLS segments.
+  __msan_unpoison(arg->start_addr, sz);
 
   return true; /* keep going */
 }
@@ -576,6 +597,19 @@
   return true;
 }
 
+static bool IsInLoader(void *drcontext) {
+  // TODO: This segment swap is inefficient.  DR should just let us query the
+  // app segment base, which it has.  Alternatively, if we disable
+  // -mangle_app_seg, then we won't need the swap.
+  bool need_swap = !dr_using_app_state(drcontext);
+  if (need_swap)
+    dr_switch_to_app_state(drcontext);
+  bool is_in_loader = __msan_is_in_loader();
+  if (need_swap)
+    dr_switch_to_dr_state(drcontext);
+  return is_in_loader;
+}
+
 void event_post_syscall(void *drcontext, int sysnum) {
   drsys_syscall_t *syscall;
   drsys_sysnum_t sysnum_full;
@@ -598,6 +632,30 @@
         drsys_iterate_memargs(drcontext, drsys_iter_memarg_cb, (void *)syscall);
     CHECK(res == DRMF_SUCCESS);
   }
+
+  // Our normal mmap interceptor can't intercept calls from the loader itself.
+  // This means we don't clear the shadow for calls to dlopen.  For now, we
+  // solve this by intercepting mmap from ld.so here, but ideally we'd have a
+  // solution that doesn't rely on msandr.
+  //
+  // Be careful not to intercept maps done by the msan rtl.  Otherwise we end up
+  // unpoisoning vast regions of memory and OOMing.
+  // TODO: __msan_unpoison() could "flush" large regions of memory like tsan
+  // does instead of doing a large memset.  However, we need the memory to be
+// zeroed, whereas tsan does not, so plain madvise is not enough.
+  if (success && (sysnum == SYS_mmap IF_NOT_X64(|| sysnum == SYS_mmap2))) {
+    if (IsInLoader(drcontext)) {
+      app_pc base = (app_pc)dr_syscall_get_result(drcontext);
+      ptr_uint_t size;
+      drmf_status_t res = drsys_pre_syscall_arg(drcontext, 1, &size);
+      CHECK(res == DRMF_SUCCESS);
+      if (VERBOSITY > 0)
+        dr_printf("unpoisoning for dlopen: [%p-%p]\n", base, base + size);
+      // We don't switch to the app context because __msan_unpoison() doesn't
+      // need TLS segments.
+      __msan_unpoison(base, size);
+    }
+  }
 }
 
 } // namespace
diff --git a/lib/profile/GCDAProfiling.c b/lib/profile/GCDAProfiling.c
index 7c52a17..ccaf01b 100644
--- a/lib/profile/GCDAProfiling.c
+++ b/lib/profile/GCDAProfiling.c
@@ -42,8 +42,24 @@
  * --- GCOV file format I/O primitives ---
  */
 
+/*
+ * The current file we're outputting.
+ */
 static FILE *output_file = NULL;
 
+/*
+ *  A list of flush functions that our __gcov_flush() function should call.
+ */
+typedef void (*flush_fn)();
+
+struct flush_fn_node {
+  flush_fn fn;
+  struct flush_fn_node *next;
+};
+
+struct flush_fn_node *flush_fn_head = NULL;
+struct flush_fn_node *flush_fn_tail = NULL;
+
 static void write_int32(uint32_t i) {
   fwrite(&i, 4, 1, output_file);
 }
@@ -145,7 +161,7 @@
  * profiling enabled will emit to a different file. Only one file may be
  * started at a time.
  */
-void llvm_gcda_start_file(const char *orig_filename) {
+void llvm_gcda_start_file(const char *orig_filename, const char version[4]) {
   char *filename = mangle_filename(orig_filename);
 
   /* Try just opening the file. */
@@ -167,13 +183,10 @@
     }
   }
 
-  /* gcda file, version 404*, stamp LLVM. */
-#ifdef __APPLE__
-  fwrite("adcg*204MVLL", 12, 1, output_file);
-#else
-  fwrite("adcg*404MVLL", 12, 1, output_file);
-#endif
-
+  /* gcda file, version, stamp LLVM. */
+  fwrite("adcg", 4, 1, output_file);
+  fwrite(version, 4, 1, output_file);
+  fwrite("MVLL", 4, 1, output_file);
   free(filename);
 
 #ifdef DEBUG_GCDAPROFILING
@@ -206,19 +219,28 @@
 #endif
 }
 
-void llvm_gcda_emit_function(uint32_t ident, const char *function_name) {
+void llvm_gcda_emit_function(uint32_t ident, const char *function_name,
+                             uint8_t use_extra_checksum) {
+  uint32_t len = 2;
+  if (use_extra_checksum)
+    len++;
 #ifdef DEBUG_GCDAPROFILING
-  fprintf(stderr, "llvmgcda: function id=0x%08x\n", ident);
+  fprintf(stderr, "llvmgcda: function id=0x%08x name=%s\n", ident,
+          function_name ? function_name : "NULL");
 #endif
   if (!output_file) return;
 
-  /* function tag */  
+  /* function tag */
   fwrite("\0\0\0\1", 4, 1, output_file);
-  write_int32(3 + 1 + length_of_string(function_name));
+  if (function_name)
+    len += 1 + length_of_string(function_name);
+  write_int32(len);
   write_int32(ident);
   write_int32(0);
-  write_int32(0);
-  write_string(function_name);
+  if (use_extra_checksum)
+    write_int32(0);
+  if (function_name)
+    write_string(function_name);
 }
 
 void llvm_gcda_emit_arcs(uint32_t num_counters, uint64_t *counters) {
@@ -282,3 +304,35 @@
   fprintf(stderr, "llvmgcda: -----\n");
 #endif
 }
+
+void llvm_register_flush_function(flush_fn fn) {
+  struct flush_fn_node *new_node = malloc(sizeof(struct flush_fn_node));
+  new_node->fn = fn;
+  new_node->next = NULL;
+
+  if (!flush_fn_head) {
+    flush_fn_head = flush_fn_tail = new_node;
+  } else {
+    flush_fn_tail->next = new_node;
+    flush_fn_tail = new_node;
+  }
+}
+
+void __gcov_flush() {
+  struct flush_fn_node *curr = flush_fn_head;
+
+  while (curr) {
+    curr->fn();
+    curr = curr->next;
+  }
+}
+
+void llvm_delete_flush_function_list() {
+  while (flush_fn_head) {
+    struct flush_fn_node *node = flush_fn_head;
+    flush_fn_head = flush_fn_head->next;
+    free(node);
+  }
+
+  flush_fn_head = flush_fn_tail = NULL;
+}
diff --git a/lib/sanitizer_common/CMakeLists.txt b/lib/sanitizer_common/CMakeLists.txt
index 56aa3f7..e89e207 100644
--- a/lib/sanitizer_common/CMakeLists.txt
+++ b/lib/sanitizer_common/CMakeLists.txt
@@ -13,11 +13,13 @@
   sanitizer_printf.cc
   sanitizer_stackdepot.cc
   sanitizer_stacktrace.cc
+  sanitizer_stoptheworld_linux.cc
   sanitizer_symbolizer.cc
   sanitizer_symbolizer_itanium.cc
   sanitizer_symbolizer_linux.cc
   sanitizer_symbolizer_mac.cc
   sanitizer_symbolizer_win.cc
+  sanitizer_thread_registry.cc
   sanitizer_win.cc
   )
 
@@ -36,6 +38,7 @@
   sanitizer_internal_defs.h
   sanitizer_lfstack.h
   sanitizer_libc.h
+  sanitizer_linux.h
   sanitizer_list.h
   sanitizer_mutex.h
   sanitizer_placement_new.h
@@ -46,9 +49,12 @@
   sanitizer_stackdepot.h
   sanitizer_stacktrace.h
   sanitizer_symbolizer.h
+  sanitizer_thread_registry.h
   )
 
-set(SANITIZER_CFLAGS ${SANITIZER_COMMON_CFLAGS})
+set(SANITIZER_CFLAGS
+  ${SANITIZER_COMMON_CFLAGS}
+  -fno-rtti)
 
 set(SANITIZER_RUNTIME_LIBRARIES)
 if(APPLE)
diff --git a/lib/sanitizer_common/sanitizer_allocator.h b/lib/sanitizer_common/sanitizer_allocator.h
index fa105c6..45c93da 100644
--- a/lib/sanitizer_common/sanitizer_allocator.h
+++ b/lib/sanitizer_common/sanitizer_allocator.h
@@ -26,15 +26,15 @@
 // SizeClassMap maps allocation sizes into size classes and back.
 // Class 0 corresponds to size 0.
 // Classes 1 - 16 correspond to sizes 16 to 256 (size = class_id * 16).
-// Next 8 classes: 256 + i * 32 (i = 1 to 8).
-// Next 8 classes: 512 + i * 64 (i = 1 to 8).
+// Next 4 classes: 256 + i * 64  (i = 1 to 4).
+// Next 4 classes: 512 + i * 128 (i = 1 to 4).
 // ...
-// Next 8 classes: 2^k + i * 2^(k-3) (i = 1 to 8).
+// Next 4 classes: 2^k + i * 2^(k-2) (i = 1 to 4).
 // Last class corresponds to kMaxSize = 1 << kMaxSizeLog.
 //
 // This structure of the size class map gives us:
 //   - Efficient table-free class-to-size and size-to-class functions.
-//   - Difference between two consequent size classes is betweed 12% and 6%
+//   - Difference between two consecutive size classes is between 14% and 25%
 //
 // This class also gives a hint to a thread-caching allocator about the amount
 // of chunks that need to be cached per-thread:
@@ -61,45 +61,50 @@
 // c15 => s: 240 diff: +16 07% l 7 cached: 256 61440; id 15
 //
 // c16 => s: 256 diff: +16 06% l 8 cached: 256 65536; id 16
-// c17 => s: 288 diff: +32 12% l 8 cached: 227 65376; id 17
-// c18 => s: 320 diff: +32 11% l 8 cached: 204 65280; id 18
-// c19 => s: 352 diff: +32 10% l 8 cached: 186 65472; id 19
-// c20 => s: 384 diff: +32 09% l 8 cached: 170 65280; id 20
-// c21 => s: 416 diff: +32 08% l 8 cached: 157 65312; id 21
-// c22 => s: 448 diff: +32 07% l 8 cached: 146 65408; id 22
-// c23 => s: 480 diff: +32 07% l 8 cached: 136 65280; id 23
+// c17 => s: 320 diff: +64 25% l 8 cached: 204 65280; id 17
+// c18 => s: 384 diff: +64 20% l 8 cached: 170 65280; id 18
+// c19 => s: 448 diff: +64 16% l 8 cached: 146 65408; id 19
 //
-// c24 => s: 512 diff: +32 06% l 9 cached: 128 65536; id 24
-// c25 => s: 576 diff: +64 12% l 9 cached: 113 65088; id 25
-// c26 => s: 640 diff: +64 11% l 9 cached: 102 65280; id 26
-// c27 => s: 704 diff: +64 10% l 9 cached: 93 65472; id 27
-// c28 => s: 768 diff: +64 09% l 9 cached: 85 65280; id 28
-// c29 => s: 832 diff: +64 08% l 9 cached: 78 64896; id 29
-// c30 => s: 896 diff: +64 07% l 9 cached: 73 65408; id 30
-// c31 => s: 960 diff: +64 07% l 9 cached: 68 65280; id 31
+// c20 => s: 512 diff: +64 14% l 9 cached: 128 65536; id 20
+// c21 => s: 640 diff: +128 25% l 9 cached: 102 65280; id 21
+// c22 => s: 768 diff: +128 20% l 9 cached: 85 65280; id 22
+// c23 => s: 896 diff: +128 16% l 9 cached: 73 65408; id 23
 //
-// c32 => s: 1024 diff: +64 06% l 10 cached: 64 65536; id 32
+// c24 => s: 1024 diff: +128 14% l 10 cached: 64 65536; id 24
+// c25 => s: 1280 diff: +256 25% l 10 cached: 51 65280; id 25
+// c26 => s: 1536 diff: +256 20% l 10 cached: 42 64512; id 26
+// c27 => s: 1792 diff: +256 16% l 10 cached: 36 64512; id 27
+//
+// ...
+//
+// c48 => s: 65536 diff: +8192 14% l 16 cached: 1 65536; id 48
+// c49 => s: 81920 diff: +16384 25% l 16 cached: 1 81920; id 49
+// c50 => s: 98304 diff: +16384 20% l 16 cached: 1 98304; id 50
+// c51 => s: 114688 diff: +16384 16% l 16 cached: 1 114688; id 51
+//
+// c52 => s: 131072 diff: +16384 14% l 17 cached: 1 131072; id 52
 
-template <uptr kMaxSizeLog, uptr kMaxNumCachedT, uptr kMaxBytesCachedLog,
-          uptr kMinBatchClassT>
+template <uptr kMaxSizeLog, uptr kMaxNumCachedT, uptr kMaxBytesCachedLog>
 class SizeClassMap {
   static const uptr kMinSizeLog = 4;
   static const uptr kMidSizeLog = kMinSizeLog + 4;
   static const uptr kMinSize = 1 << kMinSizeLog;
   static const uptr kMidSize = 1 << kMidSizeLog;
   static const uptr kMidClass = kMidSize / kMinSize;
-  static const uptr S = 3;
+  static const uptr S = 2;
   static const uptr M = (1 << S) - 1;
 
  public:
   static const uptr kMaxNumCached = kMaxNumCachedT;
+  // We transfer chunks between central and thread-local free lists in batches.
+  // For small size classes we allocate batches separately.
+  // For large size classes we use one of the chunks to store the batch.
   struct TransferBatch {
     TransferBatch *next;
     uptr count;
     void *batch[kMaxNumCached];
   };
 
-  static const uptr kMinBatchClass = kMinBatchClassT;
   static const uptr kMaxSize = 1 << kMaxSizeLog;
   static const uptr kNumClasses =
       kMidClass + ((kMaxSizeLog - kMidSizeLog) << S) + 1;
@@ -143,7 +148,7 @@
         Printf("\n");
       uptr d = s - prev_s;
       uptr p = prev_s ? (d * 100 / prev_s) : 0;
-      uptr l = MostSignificantSetBitIndex(s);
+      uptr l = s ? MostSignificantSetBitIndex(s) : 0;
       uptr cached = MaxCached(i) * s;
       Printf("c%02zd => s: %zd diff: +%zd %02zd%% l %zd "
              "cached: %zd %zd; id %zd\n",
@@ -154,6 +159,11 @@
     Printf("Total cached: %zd\n", total_cached);
   }
 
+  static bool SizeClassRequiresSeparateTransferBatch(uptr class_id) {
+    return Size(class_id) < sizeof(TransferBatch) -
+        sizeof(uptr) * (kMaxNumCached - MaxCached(class_id));
+  }
+
   static void Validate() {
     for (uptr c = 1; c < kNumClasses; c++) {
       // Printf("Validate: c%zd\n", c);
@@ -176,24 +186,11 @@
       if (c > 0)
         CHECK_LT(Size(c-1), s);
     }
-
-    // TransferBatch for kMinBatchClass must fit into the block itself.
-    const uptr batch_size = sizeof(TransferBatch)
-        - sizeof(void*)  // NOLINT
-            * (kMaxNumCached - MaxCached(kMinBatchClass));
-    CHECK_LE(batch_size, Size(kMinBatchClass));
-    // TransferBatch for kMinBatchClass-1 must not fit into the block itself.
-    const uptr batch_size1 = sizeof(TransferBatch)
-        - sizeof(void*)  // NOLINT
-            * (kMaxNumCached - MaxCached(kMinBatchClass - 1));
-    CHECK_GT(batch_size1, Size(kMinBatchClass - 1));
   }
 };
 
-typedef SizeClassMap<17, 256, 16, FIRST_32_SECOND_64(25, 28)>
-    DefaultSizeClassMap;
-typedef SizeClassMap<17, 64, 14, FIRST_32_SECOND_64(17, 20)>
-    CompactSizeClassMap;
+typedef SizeClassMap<17, 256, 16> DefaultSizeClassMap;
+typedef SizeClassMap<17, 64,  14> CompactSizeClassMap;
 template<class SizeClassAllocator> struct SizeClassAllocatorLocalCache;
 
 // Memory allocator statistics
@@ -342,6 +339,7 @@
 
   NOINLINE void DeallocateBatch(AllocatorStats *stat, uptr class_id, Batch *b) {
     RegionInfo *region = GetRegionInfo(class_id);
+    CHECK_GT(b->count, 0);
     region->free_list.Push(b);
     region->n_freed += b->count;
   }
@@ -357,10 +355,12 @@
   void *GetBlockBegin(void *p) {
     uptr class_id = GetSizeClass(p);
     uptr size = SizeClassMap::Size(class_id);
+    if (!size) return 0;
     uptr chunk_idx = GetChunkIdx((uptr)p, size);
     uptr reg_beg = (uptr)p & ~(kRegionSize - 1);
     uptr beg = chunk_idx * size;
     uptr next_beg = beg + size;
+    if (class_id >= kNumClasses) return 0;
     RegionInfo *region = GetRegionInfo(class_id);
     if (region->mapped_user >= next_beg)
       return reinterpret_cast<void*>(reg_beg + beg);
@@ -433,6 +433,24 @@
     }
   }
 
+  // Iterate over existing chunks. May include chunks that are not currently
+  // allocated to the user (e.g. freed).
+  // The caller is expected to call ForceLock() before calling this function.
+  template<typename Callable>
+  void ForEachChunk(const Callable &callback) {
+    for (uptr class_id = 1; class_id < kNumClasses; class_id++) {
+      RegionInfo *region = GetRegionInfo(class_id);
+      uptr chunk_size = SizeClassMap::Size(class_id);
+      uptr region_beg = kSpaceBeg + class_id * kRegionSize;
+      for (uptr p = region_beg;
+           p < region_beg + region->allocated_user;
+           p += chunk_size) {
+        // Too slow: CHECK_EQ((void *)p, GetBlockBegin((void *)p));
+        callback((void *)p);
+      }
+    }
+  }
+
   typedef SizeClassMap SizeClassMapT;
   static const uptr kNumClasses = SizeClassMap::kNumClasses;
   static const uptr kNumClassesRounded = SizeClassMap::kNumClassesRounded;
@@ -517,13 +535,13 @@
     }
     CHECK_LE(region->allocated_meta, region->mapped_meta);
     if (region->allocated_user + region->allocated_meta > kRegionSize) {
-      Printf("Out of memory. Dying.\n");
+      Printf("%s: Out of memory. Dying. ", SanitizerToolName);
       Printf("The process has exhausted %zuMB for size class %zu.\n",
           kRegionSize / 1024 / 1024, size);
       Die();
     }
     for (;;) {
-      if (class_id < SizeClassMap::kMinBatchClass)
+      if (SizeClassMap::SizeClassRequiresSeparateTransferBatch(class_id))
         b = (Batch*)c->Allocate(this, SizeClassMap::ClassID(sizeof(Batch)));
       else
         b = (Batch*)(region_beg + beg_idx);
@@ -535,6 +553,7 @@
       beg_idx += count * size;
       if (beg_idx + count * size + size > region->mapped_user)
         break;
+      CHECK_GT(b->count, 0);
       region->free_list.Push(b);
     }
     return b;
@@ -620,6 +639,7 @@
     CHECK_LT(class_id, kNumClasses);
     SizeClassInfo *sci = GetSizeClassInfo(class_id);
     SpinMutexLock l(&sci->mutex);
+    CHECK_GT(b->count, 0);
     sci->free_list.push_front(b);
   }
 
@@ -679,6 +699,25 @@
     }
   }
 
+  // Iterate over existing chunks. May include chunks that are not currently
+  // allocated to the user (e.g. freed).
+  // The caller is expected to call ForceLock() before calling this function.
+  template<typename Callable>
+  void ForEachChunk(const Callable &callback) {
+    for (uptr region = 0; region < kNumPossibleRegions; region++)
+      if (state_->possible_regions[region]) {
+        uptr chunk_size = SizeClassMap::Size(state_->possible_regions[region]);
+        uptr max_chunks_in_region = kRegionSize / (chunk_size + kMetadataSize);
+        uptr region_beg = region * kRegionSize;
+        for (uptr p = region_beg;
+             p < region_beg + max_chunks_in_region * chunk_size;
+             p += chunk_size) {
+          // Too slow: CHECK_EQ((void *)p, GetBlockBegin((void *)p));
+          callback((void *)p);
+        }
+      }
+  }
+
   void PrintStats() {
   }
 
@@ -733,7 +772,7 @@
     Batch *b = 0;
     for (uptr i = reg; i < reg + n_chunks * size; i += size) {
       if (b == 0) {
-        if (class_id < SizeClassMap::kMinBatchClass)
+        if (SizeClassMap::SizeClassRequiresSeparateTransferBatch(class_id))
           b = (Batch*)c->Allocate(this, SizeClassMap::ClassID(sizeof(Batch)));
         else
           b = (Batch*)i;
@@ -741,12 +780,15 @@
       }
       b->batch[b->count++] = (void*)i;
       if (b->count == max_count) {
+        CHECK_GT(b->count, 0);
         sci->free_list.push_back(b);
         b = 0;
       }
     }
-    if (b)
+    if (b) {
+      CHECK_GT(b->count, 0);
       sci->free_list.push_back(b);
+    }
   }
 
   struct State {
@@ -791,8 +833,12 @@
   void Deallocate(SizeClassAllocator *allocator, uptr class_id, void *p) {
     CHECK_NE(class_id, 0UL);
     CHECK_LT(class_id, kNumClasses);
+    // If the first allocator call on a new thread is a deallocation, then
+    // max_count will be zero, leading to check failure.
+    InitCache();
     stats_.Add(AllocatorStatFreed, SizeClassMap::Size(class_id));
     PerClass *c = &per_class_[class_id];
+    CHECK_NE(c->max_count, 0UL);
     if (UNLIKELY(c->count == c->max_count))
       Drain(allocator, class_id);
     c->batch[c->count++] = p;
@@ -818,7 +864,7 @@
   AllocatorStats stats_;
 
   void InitCache() {
-    if (per_class_[0].max_count)
+    if (per_class_[1].max_count)
       return;
     for (uptr i = 0; i < kNumClasses; i++) {
       PerClass *c = &per_class_[i];
@@ -834,7 +880,7 @@
     for (uptr i = 0; i < b->count; i++)
       c->batch[i] = b->batch[i];
     c->count = b->count;
-    if (class_id < SizeClassMap::kMinBatchClass)
+    if (SizeClassMap::SizeClassRequiresSeparateTransferBatch(class_id))
       Deallocate(allocator, SizeClassMap::ClassID(sizeof(Batch)), b);
   }
 
@@ -842,7 +888,7 @@
     InitCache();
     PerClass *c = &per_class_[class_id];
     Batch *b;
-    if (class_id < SizeClassMap::kMinBatchClass)
+    if (SizeClassMap::SizeClassRequiresSeparateTransferBatch(class_id))
       b = (Batch*)Allocate(allocator, SizeClassMap::ClassID(sizeof(Batch)));
     else
       b = (Batch*)c->batch[0];
@@ -853,6 +899,7 @@
     }
     b->count = cnt;
     c->count -= cnt;
+    CHECK_GT(b->count, 0);
     allocator->DeallocateBatch(&stats_, class_id, b);
   }
 };
@@ -995,6 +1042,15 @@
     mutex_.Unlock();
   }
 
+  // Iterate over existing chunks. May include chunks that are not currently
+  // allocated to the user (e.g. freed).
+  // The caller is expected to call ForceLock() before calling this function.
+  template<typename Callable>
+  void ForEachChunk(const Callable &callback) {
+    for (uptr i = 0; i < n_chunks_; i++)
+      callback(GetUser(chunks_[i]));
+  }
+
  private:
   static const int kMaxNumChunks = 1 << FIRST_32_SECOND_64(15, 18);
   struct Header {
@@ -1158,6 +1214,15 @@
     primary_.ForceUnlock();
   }
 
+  // Iterate over existing chunks. May include chunks that are not currently
+  // allocated to the user (e.g. freed).
+  // The caller is expected to call ForceLock() before calling this function.
+  template<typename Callable>
+  void ForEachChunk(const Callable &callback) {
+    primary_.ForEachChunk(callback);
+    secondary_.ForEachChunk(callback);
+  }
+
  private:
   PrimaryAllocator primary_;
   SecondaryAllocator secondary_;
diff --git a/lib/sanitizer_common/sanitizer_common.cc b/lib/sanitizer_common/sanitizer_common.cc
index 0518f41..39e52e8 100644
--- a/lib/sanitizer_common/sanitizer_common.cc
+++ b/lib/sanitizer_common/sanitizer_common.cc
@@ -17,6 +17,7 @@
 namespace __sanitizer {
 
 const char *SanitizerToolName = "SanitizerTool";
+uptr SanitizerVerbosity = 0;
 
 uptr GetPageSizeCached() {
   static uptr PageSize;
diff --git a/lib/sanitizer_common/sanitizer_common.h b/lib/sanitizer_common/sanitizer_common.h
index ce04919..bca65c1 100644
--- a/lib/sanitizer_common/sanitizer_common.h
+++ b/lib/sanitizer_common/sanitizer_common.h
@@ -33,6 +33,7 @@
 #endif
 
 extern const char *SanitizerToolName;  // Can be changed by the tool.
+extern uptr SanitizerVerbosity;
 
 uptr GetPageSize();
 uptr GetPageSizeCached();
@@ -133,6 +134,9 @@
 void SetStackSizeLimitInBytes(uptr limit);
 void PrepareForSandboxing();
 
+void InitTlsSize();
+uptr GetTlsSize();
+
 // Other
 void SleepForSeconds(int seconds);
 void SleepForMillis(int millis);
@@ -287,6 +291,14 @@
     }
     data_[size_++] = element;
   }
+  T &back() {
+    CHECK_GT(size_, 0);
+    return data_[size_ - 1];
+  }
+  void pop_back() {
+    CHECK_GT(size_, 0);
+    size_--;
+  }
   uptr size() {
     return size_;
   }
diff --git a/lib/sanitizer_common/sanitizer_common_interceptors.inc b/lib/sanitizer_common/sanitizer_common_interceptors.inc
index 0478262..3cb1d4a 100644
--- a/lib/sanitizer_common/sanitizer_common_interceptors.inc
+++ b/lib/sanitizer_common/sanitizer_common_interceptors.inc
@@ -28,6 +28,44 @@
 #define va_copy(dst, src) ((dst) = (src))
 #endif // _WIN32
 
+#if SANITIZER_INTERCEPT_FREXP
+INTERCEPTOR(double, frexp, double x, int *exp) {
+  void *ctx;
+  COMMON_INTERCEPTOR_ENTER(ctx, frexp, x, exp);
+  double res = REAL(frexp)(x, exp);
+  COMMON_INTERCEPTOR_WRITE_RANGE(ctx, exp, sizeof(*exp));
+  return res;
+}
+
+#define INIT_FREXP INTERCEPT_FUNCTION(frexp);
+#else
+#define INIT_FREXP
+#endif // SANITIZER_INTERCEPT_FREXP
+
+#if SANITIZER_INTERCEPT_FREXPF_FREXPL
+INTERCEPTOR(float, frexpf, float x, int *exp) {
+  void *ctx;
+  COMMON_INTERCEPTOR_ENTER(ctx, frexpf, x, exp);
+  float res = REAL(frexpf)(x, exp);
+  COMMON_INTERCEPTOR_WRITE_RANGE(ctx, exp, sizeof(*exp));
+  return res;
+}
+
+INTERCEPTOR(long double, frexpl, long double x, int *exp) {
+  void *ctx;
+  COMMON_INTERCEPTOR_ENTER(ctx, frexpl, x, exp);
+  long double res = REAL(frexpl)(x, exp);
+  COMMON_INTERCEPTOR_WRITE_RANGE(ctx, exp, sizeof(*exp));
+  return res;
+}
+
+#define INIT_FREXPF_FREXPL                       \
+  INTERCEPT_FUNCTION(frexpf);                    \
+  INTERCEPT_FUNCTION(frexpl)
+#else
+#define INIT_FREXPF_FREXPL
+#endif // SANITIZER_INTERCEPT_FREXPF_FREXPL
+
 #if SANITIZER_INTERCEPT_READ
 INTERCEPTOR(SSIZE_T, read, int fd, void *ptr, SIZE_T count) {
   void *ctx;
@@ -336,4 +374,6 @@
   INIT_PWRITE;                                                                 \
   INIT_PWRITE64;                                                               \
   INIT_LOCALTIME_AND_FRIENDS;                                                  \
-  INIT_SCANF;
+  INIT_SCANF;                                                                  \
+  INIT_FREXP;                                                                  \
+  INIT_FREXPF_FREXPL;
diff --git a/lib/sanitizer_common/sanitizer_internal_defs.h b/lib/sanitizer_common/sanitizer_internal_defs.h
index 083761b..e052cbd 100644
--- a/lib/sanitizer_common/sanitizer_internal_defs.h
+++ b/lib/sanitizer_common/sanitizer_internal_defs.h
@@ -66,6 +66,16 @@
 typedef signed   long long s64;  // NOLINT
 typedef int fd_t;
 
+// WARNING: OFF_T may be different from OS type off_t, depending on the value of
+// _FILE_OFFSET_BITS. This definition of OFF_T matches the ABI of system calls
+// like pread and mmap, as opposed to pread64 and mmap64.
+// Mac and Linux/x86-64 are special.
+#if defined(__APPLE__) || (defined(__linux__) && defined(__x86_64__))
+typedef u64 OFF_T;
+#else
+typedef uptr OFF_T;
+#endif
+typedef u64  OFF64_T;
 }  // namespace __sanitizer
 
 extern "C" {
diff --git a/lib/sanitizer_common/sanitizer_libc.h b/lib/sanitizer_common/sanitizer_libc.h
index d4e954c..7c2a1b8 100644
--- a/lib/sanitizer_common/sanitizer_libc.h
+++ b/lib/sanitizer_common/sanitizer_libc.h
@@ -80,6 +80,11 @@
 int internal_dup2(int oldfd, int newfd);
 uptr internal_readlink(const char *path, char *buf, uptr bufsize);
 void NORETURN internal__exit(int exitcode);
+OFF_T internal_lseek(fd_t fd, OFF_T offset, int whence);
+
+long internal_ptrace(int request, int pid, void *addr, void *data);
+int internal_waitpid(int pid, int *status, int options);
+int internal_getppid();
 
 // Threading
 int internal_sched_yield();
diff --git a/lib/sanitizer_common/sanitizer_linux.cc b/lib/sanitizer_common/sanitizer_linux.cc
index 11cec22..dd36ca3 100644
--- a/lib/sanitizer_common/sanitizer_linux.cc
+++ b/lib/sanitizer_common/sanitizer_linux.cc
@@ -16,15 +16,19 @@
 #include "sanitizer_common.h"
 #include "sanitizer_internal_defs.h"
 #include "sanitizer_libc.h"
+#include "sanitizer_linux.h"
 #include "sanitizer_mutex.h"
 #include "sanitizer_placement_new.h"
 #include "sanitizer_procmaps.h"
 #include "sanitizer_stacktrace.h"
 
+#include <dlfcn.h>
+#include <errno.h>
 #include <fcntl.h>
 #include <pthread.h>
 #include <sched.h>
 #include <sys/mman.h>
+#include <sys/ptrace.h>
 #include <sys/resource.h>
 #include <sys/stat.h>
 #include <sys/syscall.h>
@@ -33,7 +37,10 @@
 #include <sys/prctl.h>
 #include <unistd.h>
 #include <unwind.h>
-#include <errno.h>
+
+#if !defined(__ANDROID__) && !defined(ANDROID)
+#include <sys/signal.h>
+#endif
 
 // <linux/futex.h> is broken on some linux distributions.
 const int FUTEX_WAIT = 0;
@@ -178,7 +185,7 @@
     MemoryMappingLayout proc_maps;
     uptr start, end, offset;
     uptr prev_end = 0;
-    while (proc_maps.Next(&start, &end, &offset, 0, 0)) {
+    while (proc_maps.Next(&start, &end, &offset, 0, 0, /* protection */0)) {
       if ((uptr)&rl < end)
         break;
       prev_end = end;
@@ -375,7 +382,7 @@
   return x;
 }
 
-static bool IsOnOf(char c, char c1, char c2) {
+static bool IsOneOf(char c, char c1, char c2) {
   return c == c1 || c == c2;
 }
 
@@ -384,7 +391,8 @@
 }
 
 bool MemoryMappingLayout::Next(uptr *start, uptr *end, uptr *offset,
-                               char filename[], uptr filename_size) {
+                               char filename[], uptr filename_size,
+                               uptr *protection) {
   char *last = proc_self_maps_.data + proc_self_maps_.len;
   if (current_ >= last) return false;
   uptr dummy;
@@ -399,10 +407,22 @@
   CHECK_EQ(*current_++, '-');
   *end = ParseHex(&current_);
   CHECK_EQ(*current_++, ' ');
-  CHECK(IsOnOf(*current_++, '-', 'r'));
-  CHECK(IsOnOf(*current_++, '-', 'w'));
-  CHECK(IsOnOf(*current_++, '-', 'x'));
-  CHECK(IsOnOf(*current_++, 's', 'p'));
+  uptr local_protection = 0;
+  CHECK(IsOneOf(*current_, '-', 'r'));
+  if (*current_++ == 'r')
+    local_protection |= kProtectionRead;
+  CHECK(IsOneOf(*current_, '-', 'w'));
+  if (*current_++ == 'w')
+    local_protection |= kProtectionWrite;
+  CHECK(IsOneOf(*current_, '-', 'x'));
+  if (*current_++ == 'x')
+    local_protection |= kProtectionExecute;
+  CHECK(IsOneOf(*current_, 's', 'p'));
+  if (*current_++ == 's')
+    local_protection |= kProtectionShared;
+  if (protection) {
+    *protection = local_protection;
+  }
   CHECK_EQ(*current_++, ' ');
   *offset = ParseHex(&current_);
   CHECK_EQ(*current_++, ' ');
@@ -432,8 +452,10 @@
 // Gets the object name and the offset by walking MemoryMappingLayout.
 bool MemoryMappingLayout::GetObjectNameAndOffset(uptr addr, uptr *offset,
                                                  char filename[],
-                                                 uptr filename_size) {
-  return IterateForObjectNameAndOffset(addr, offset, filename, filename_size);
+                                                 uptr filename_size,
+                                                 uptr *protection) {
+  return IterateForObjectNameAndOffset(addr, offset, filename, filename_size,
+                                       protection);
 }
 
 bool SanitizerSetThreadName(const char *name) {
@@ -523,6 +545,10 @@
   CHECK_EQ(owner_, 0);
 }
 
+BlockingMutex::BlockingMutex() {
+  internal_memset(this, 0, sizeof(*this));
+}
+
 void BlockingMutex::Lock() {
   atomic_uint32_t *m = reinterpret_cast<atomic_uint32_t *>(&opaque_storage_);
   if (atomic_exchange(m, MtxLocked, memory_order_acquire) == MtxUnlocked)
@@ -539,6 +565,147 @@
     syscall(__NR_futex, m, FUTEX_WAKE, 1, 0, 0, 0);
 }
 
+void BlockingMutex::CheckLocked() {
+  atomic_uint32_t *m = reinterpret_cast<atomic_uint32_t *>(&opaque_storage_);
+  CHECK_NE(MtxUnlocked, atomic_load(m, memory_order_relaxed));
+}
+
+// ----------------- sanitizer_linux.h
+// The actual size of this structure is specified by d_reclen.
+// Note that getdents64 uses a different structure format. We only provide the
+// 32-bit syscall here.
+struct linux_dirent {
+  unsigned long      d_ino;
+  unsigned long      d_off;
+  unsigned short     d_reclen;
+  char               d_name[256];
+};
+
+// Syscall wrappers.
+long internal_ptrace(int request, int pid, void *addr, void *data) {
+  return syscall(__NR_ptrace, request, pid, addr, data);
+}
+
+int internal_waitpid(int pid, int *status, int options) {
+  return syscall(__NR_wait4, pid, status, options, NULL /* rusage */);
+}
+
+int internal_getppid() {
+  return syscall(__NR_getppid);
+}
+
+int internal_getdents(fd_t fd, struct linux_dirent *dirp, unsigned int count) {
+  return syscall(__NR_getdents, fd, dirp, count);
+}
+
+OFF_T internal_lseek(fd_t fd, OFF_T offset, int whence) {
+  return syscall(__NR_lseek, fd, offset, whence);
+}
+
+int internal_prctl(int option, uptr arg2, uptr arg3, uptr arg4, uptr arg5) {
+  return syscall(__NR_prctl, option, arg2, arg3, arg4, arg5);
+}
+
+int internal_sigaltstack(const struct sigaltstack *ss,
+                         struct sigaltstack *oss) {
+  return syscall(__NR_sigaltstack, ss, oss);
+}
+
+
+// ThreadLister implementation.
+ThreadLister::ThreadLister(int pid)
+  : pid_(pid),
+    descriptor_(-1),
+    error_(true),
+    entry_((linux_dirent *)buffer_),
+    bytes_read_(0) {
+  char task_directory_path[80];
+  internal_snprintf(task_directory_path, sizeof(task_directory_path),
+                    "/proc/%d/task/", pid);
+  descriptor_ = internal_open(task_directory_path, O_RDONLY | O_DIRECTORY);
+  if (descriptor_ < 0) {
+    error_ = true;
+    Report("Can't open /proc/%d/task for reading.\n", pid);
+  } else {
+    error_ = false;
+  }
+}
+
+int ThreadLister::GetNextTID() {
+  int tid = -1;
+  do {
+    if (error_)
+      return -1;
+    if ((char *)entry_ >= &buffer_[bytes_read_] && !GetDirectoryEntries())
+      return -1;
+    if (entry_->d_ino != 0 && entry_->d_name[0] >= '0' &&
+        entry_->d_name[0] <= '9') {
+      // Found a valid tid.
+      tid = (int)internal_atoll(entry_->d_name);
+    }
+    entry_ = (struct linux_dirent *)(((char *)entry_) + entry_->d_reclen);
+  } while (tid < 0);
+  return tid;
+}
+
+void ThreadLister::Reset() {
+  if (error_ || descriptor_ < 0)
+    return;
+  internal_lseek(descriptor_, 0, SEEK_SET);
+}
+
+ThreadLister::~ThreadLister() {
+  if (descriptor_ >= 0)
+    internal_close(descriptor_);
+}
+
+bool ThreadLister::error() { return error_; }
+
+bool ThreadLister::GetDirectoryEntries() {
+  CHECK_GE(descriptor_, 0);
+  CHECK_NE(error_, true);
+  bytes_read_ = internal_getdents(descriptor_,
+                                  (struct linux_dirent *)buffer_,
+                                  sizeof(buffer_));
+  if (bytes_read_ < 0) {
+    Report("Can't read directory entries from /proc/%d/task.\n", pid_);
+    error_ = true;
+    return false;
+  } else if (bytes_read_ == 0) {
+    return false;
+  }
+  entry_ = (struct linux_dirent *)buffer_;
+  return true;
+}
+
+static uptr g_tls_size;
+
+#ifdef __i386__
+# define DL_INTERNAL_FUNCTION __attribute__((regparm(3), stdcall))
+#else
+# define DL_INTERNAL_FUNCTION
+#endif
+
+void InitTlsSize() {
+#ifndef SANITIZER_GO
+  typedef void (*get_tls_func)(size_t*, size_t*) DL_INTERNAL_FUNCTION;
+  get_tls_func get_tls;
+  void *get_tls_static_info_ptr = dlsym(RTLD_NEXT, "_dl_get_tls_static_info");
+  CHECK_EQ(sizeof(get_tls), sizeof(get_tls_static_info_ptr));
+  internal_memcpy(&get_tls, &get_tls_static_info_ptr,
+                  sizeof(get_tls_static_info_ptr));
+  CHECK_NE(get_tls, 0);
+  size_t tls_size = 0;
+  size_t tls_align = 0;
+  get_tls(&tls_size, &tls_align);
+  g_tls_size = tls_size;
+#endif
+}
+
+uptr GetTlsSize() {
+  return g_tls_size;
+}
+
 }  // namespace __sanitizer
 
 #endif  // __linux__
diff --git a/lib/sanitizer_common/sanitizer_linux.h b/lib/sanitizer_common/sanitizer_linux.h
new file mode 100644
index 0000000..b4ac310
--- /dev/null
+++ b/lib/sanitizer_common/sanitizer_linux.h
@@ -0,0 +1,53 @@
+//===-- sanitizer_linux.h ---------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Linux-specific syscall wrappers and classes.
+//
+//===----------------------------------------------------------------------===//
+#ifndef SANITIZER_LINUX_H
+#define SANITIZER_LINUX_H
+
+#include "sanitizer_internal_defs.h"
+
+struct sigaltstack;
+
+namespace __sanitizer {
+// Dirent structure for getdents(). Note that this structure is different from
+// the one in <dirent.h>, which is used by readdir().
+struct linux_dirent;
+
+// Syscall wrappers.
+int internal_getdents(fd_t fd, struct linux_dirent *dirp, unsigned int count);
+int internal_prctl(int option, uptr arg2, uptr arg3, uptr arg4, uptr arg5);
+int internal_sigaltstack(const struct sigaltstack *ss, struct sigaltstack *oss);
+
+// This class reads thread IDs from /proc/<pid>/task using only syscalls.
+class ThreadLister {
+ public:
+  explicit ThreadLister(int pid);
+  ~ThreadLister();
+  // GetNextTID returns -1 if the list of threads is exhausted, or if there has
+  // been an error.
+  int GetNextTID();
+  void Reset();
+  bool error();
+
+ private:
+  bool GetDirectoryEntries();
+
+  int pid_;
+  int descriptor_;
+  char buffer_[4096];
+  bool error_;
+  struct linux_dirent* entry_;
+  int bytes_read_;
+};
+}  // namespace __sanitizer
+
+#endif  // SANITIZER_LINUX_H
diff --git a/lib/sanitizer_common/sanitizer_mac.cc b/lib/sanitizer_common/sanitizer_mac.cc
index 17bc223..3990f26 100644
--- a/lib/sanitizer_common/sanitizer_mac.cc
+++ b/lib/sanitizer_common/sanitizer_mac.cc
@@ -216,7 +216,9 @@
 template<u32 kLCSegment, typename SegmentCommand>
 bool MemoryMappingLayout::NextSegmentLoad(
     uptr *start, uptr *end, uptr *offset,
-    char filename[], uptr filename_size) {
+    char filename[], uptr filename_size, uptr *protection) {
+  if (protection)
+    UNIMPLEMENTED();
   const char* lc = current_load_cmd_addr_;
   current_load_cmd_addr_ += ((const load_command *)lc)->cmdsize;
   if (((const load_command *)lc)->cmd == kLCSegment) {
@@ -241,7 +243,8 @@
 }
 
 bool MemoryMappingLayout::Next(uptr *start, uptr *end, uptr *offset,
-                               char filename[], uptr filename_size) {
+                               char filename[], uptr filename_size,
+                               uptr *protection) {
   for (; current_image_ >= 0; current_image_--) {
     const mach_header* hdr = _dyld_get_image_header(current_image_);
     if (!hdr) continue;
@@ -273,14 +276,14 @@
 #ifdef MH_MAGIC_64
         case MH_MAGIC_64: {
           if (NextSegmentLoad<LC_SEGMENT_64, struct segment_command_64>(
-                  start, end, offset, filename, filename_size))
+                  start, end, offset, filename, filename_size, protection))
             return true;
           break;
         }
 #endif
         case MH_MAGIC: {
           if (NextSegmentLoad<LC_SEGMENT, struct segment_command>(
-                  start, end, offset, filename, filename_size))
+                  start, end, offset, filename, filename_size, protection))
             return true;
           break;
         }
@@ -294,14 +297,20 @@
 
 bool MemoryMappingLayout::GetObjectNameAndOffset(uptr addr, uptr *offset,
                                                  char filename[],
-                                                 uptr filename_size) {
-  return IterateForObjectNameAndOffset(addr, offset, filename, filename_size);
+                                                 uptr filename_size,
+                                                 uptr *protection) {
+  return IterateForObjectNameAndOffset(addr, offset, filename, filename_size,
+                                       protection);
 }
 
 BlockingMutex::BlockingMutex(LinkerInitialized) {
   // We assume that OS_SPINLOCK_INIT is zero
 }
 
+BlockingMutex::BlockingMutex() {
+  internal_memset(this, 0, sizeof(*this));
+}
+
 void BlockingMutex::Lock() {
   CHECK(sizeof(OSSpinLock) <= sizeof(opaque_storage_));
   CHECK_EQ(OS_SPINLOCK_INIT, 0);
@@ -317,6 +326,17 @@
   OSSpinLockUnlock((OSSpinLock*)&opaque_storage_);
 }
 
+void BlockingMutex::CheckLocked() {
+  CHECK_EQ((uptr)pthread_self(), owner_);
+}
+
+uptr GetTlsSize() {
+  return 0;
+}
+
+void InitTlsSize() {
+}
+
 }  // namespace __sanitizer
 
 #endif  // __APPLE__
diff --git a/lib/sanitizer_common/sanitizer_mutex.h b/lib/sanitizer_common/sanitizer_mutex.h
index 56438fc..469981c 100644
--- a/lib/sanitizer_common/sanitizer_mutex.h
+++ b/lib/sanitizer_common/sanitizer_mutex.h
@@ -70,8 +70,10 @@
 class BlockingMutex {
  public:
   explicit BlockingMutex(LinkerInitialized);
+  BlockingMutex();
   void Lock();
   void Unlock();
+  void CheckLocked();
  private:
   uptr opaque_storage_[10];
   uptr owner_;  // for debugging
diff --git a/lib/sanitizer_common/sanitizer_platform_interceptors.h b/lib/sanitizer_common/sanitizer_platform_interceptors.h
index f5cf9a7..39860fb 100644
--- a/lib/sanitizer_common/sanitizer_platform_interceptors.h
+++ b/lib/sanitizer_common/sanitizer_platform_interceptors.h
@@ -46,3 +46,6 @@
 
 # define SANITIZER_INTERCEPT_SCANF SI_NOT_WINDOWS
 # define SANITIZER_INTERCEPT_ISOC99_SCANF SI_LINUX
+
+# define SANITIZER_INTERCEPT_FREXP 1
+# define SANITIZER_INTERCEPT_FREXPF_FREXPL SI_NOT_WINDOWS
diff --git a/lib/sanitizer_common/sanitizer_platform_limits_posix.cc b/lib/sanitizer_common/sanitizer_platform_limits_posix.cc
index 0ba71a8..1046b62 100644
--- a/lib/sanitizer_common/sanitizer_platform_limits_posix.cc
+++ b/lib/sanitizer_common/sanitizer_platform_limits_posix.cc
@@ -18,6 +18,7 @@
 #include "sanitizer_platform_limits_posix.h"
 
 #include <dirent.h>
+#include <pthread.h>
 #include <sys/utsname.h>
 #include <sys/types.h>
 #include <sys/stat.h>
@@ -46,6 +47,7 @@
 #endif // __linux__
 
 #if defined(__linux__) && !defined(__ANDROID__)
+  unsigned struct_dirent64_sz = sizeof(struct dirent64);
   unsigned struct_rlimit64_sz = sizeof(struct rlimit64);
   unsigned struct_statfs64_sz = sizeof(struct statfs64);
 #endif // __linux__ && !__ANDROID__
@@ -67,4 +69,6 @@
   }
 }  // namespace __sanitizer
 
+COMPILER_CHECK(sizeof(__sanitizer_pthread_attr_t) >= sizeof(pthread_attr_t));
+
 #endif  // __linux__ || __APPLE__
diff --git a/lib/sanitizer_common/sanitizer_platform_limits_posix.h b/lib/sanitizer_common/sanitizer_platform_limits_posix.h
index dd00663..2eac016 100644
--- a/lib/sanitizer_common/sanitizer_platform_limits_posix.h
+++ b/lib/sanitizer_common/sanitizer_platform_limits_posix.h
@@ -30,6 +30,7 @@
 #endif // __linux__
 
 #if defined(__linux__) && !defined(__ANDROID__)
+  extern unsigned struct_dirent64_sz;
   extern unsigned struct_rlimit64_sz;
   extern unsigned struct_statfs64_sz;
 #endif // __linux__ && !__ANDROID__
@@ -38,6 +39,14 @@
   uptr __sanitizer_get_msghdr_iov_iov_len(void* msg, int idx);
   uptr __sanitizer_get_msghdr_iovlen(void* msg);
   uptr __sanitizer_get_socklen_t(void* socklen_ptr);
+
+  // This thing depends on the platform. We are only interested in the upper
+  // limit. Verified with a compiler assert in .cc.
+  const int pthread_attr_t_max_sz = 128;
+  union __sanitizer_pthread_attr_t {
+    char size[pthread_attr_t_max_sz]; // NOLINT
+    void *align;
+  };
 }  // namespace __sanitizer
 
 #endif
diff --git a/lib/sanitizer_common/sanitizer_posix.cc b/lib/sanitizer_common/sanitizer_posix.cc
index 001ca75..27f0977 100644
--- a/lib/sanitizer_common/sanitizer_posix.cc
+++ b/lib/sanitizer_common/sanitizer_posix.cc
@@ -66,8 +66,8 @@
       Die();
     }
     recursion_count++;
-    Report("ERROR: %s failed to allocate 0x%zx (%zd) bytes of %s: %s\n",
-           SanitizerToolName, size, size, mem_type, strerror(errno));
+    Report("ERROR: %s failed to allocate 0x%zx (%zd) bytes of %s: %d\n",
+           SanitizerToolName, size, size, mem_type, errno);
     DumpProcessMap();
     CHECK("unable to mmap" && 0);
   }
@@ -152,7 +152,8 @@
   MemoryMappingLayout procmaps;
   uptr start, end;
   while (procmaps.Next(&start, &end,
-                       /*offset*/0, /*filename*/0, /*filename_size*/0)) {
+                       /*offset*/0, /*filename*/0, /*filename_size*/0,
+                       /*protection*/0)) {
     if (!IntervalsAreSeparate(start, end, range_start, range_end))
       return false;
   }
@@ -166,7 +167,7 @@
   char *filename = (char*)MmapOrDie(kBufSize, __FUNCTION__);
   Report("Process memory map follows:\n");
   while (proc_maps.Next(&start, &end, /* file_offset */0,
-                        filename, kBufSize)) {
+                        filename, kBufSize, /* protection */0)) {
     Printf("\t%p-%p\t%s\n", (void*)start, (void*)end, filename);
   }
   Report("End of process memory map.\n");
diff --git a/lib/sanitizer_common/sanitizer_printf.cc b/lib/sanitizer_common/sanitizer_printf.cc
index 2393e8f..4b5a8b4 100644
--- a/lib/sanitizer_common/sanitizer_printf.cc
+++ b/lib/sanitizer_common/sanitizer_printf.cc
@@ -201,19 +201,52 @@
 // Like Printf, but prints the current PID before the output string.
 void Report(const char *format, ...) {
   const int kLen = 16 * 1024;
-  InternalScopedBuffer<char> buffer(kLen);
-  int needed_length = internal_snprintf(buffer.data(),
-                                        kLen, "==%d== ", GetPid());
-  RAW_CHECK_MSG(needed_length < kLen, "Buffer in Report is too short!\n");
-  va_list args;
-  va_start(args, format);
-  needed_length += VSNPrintf(buffer.data() + needed_length,
-                             kLen - needed_length, format, args);
-  va_end(args);
-  RAW_CHECK_MSG(needed_length < kLen, "Buffer in Report is too short!\n");
-  RawWrite(buffer.data());
-  if (PrintfAndReportCallback)
-    PrintfAndReportCallback(buffer.data());
+  // |local_buffer| is small enough not to overflow the stack and/or violate
+  // the stack limit enforced by TSan (-Wframe-larger-than=512). On the other
+  // hand, the bigger the buffer is, the more the chance the error report will
+  // fit into it.
+  char local_buffer[400];
+  int needed_length;
+  int pid = GetPid();
+  char *buffer = local_buffer;
+  int cur_size = sizeof(local_buffer) / sizeof(char);
+  for (int use_mmap = 0; use_mmap < 2; use_mmap++) {
+    needed_length = internal_snprintf(buffer, cur_size,
+                                      "==%d==", pid);
+    if (needed_length >= cur_size) {
+      if (use_mmap) {
+        RAW_CHECK_MSG(needed_length < kLen, "Buffer in Report is too short!\n");
+      } else {
+        // The pid doesn't fit into the local buffer.
+        continue;
+      }
+    }
+    va_list args;
+    va_start(args, format);
+    needed_length += VSNPrintf(buffer + needed_length,
+                               cur_size - needed_length, format, args);
+    va_end(args);
+    if (needed_length >= cur_size) {
+      if (use_mmap) {
+        RAW_CHECK_MSG(needed_length < kLen, "Buffer in Report is too short!\n");
+      } else {
+        // The error message doesn't fit into the local buffer - allocate a
+        // bigger one.
+        buffer = (char*)MmapOrDie(kLen, "Report");
+        cur_size = kLen;
+        continue;
+      }
+    } else {
+      RawWrite(buffer);
+      if (PrintfAndReportCallback)
+        PrintfAndReportCallback(buffer);
+      // Don't do anything for the second time if the first iteration
+      // succeeded.
+      break;
+    }
+  }
+  // If we had mapped any memory, clean up.
+  if (buffer != local_buffer) UnmapOrDie((void*)buffer, cur_size);
 }
 
 }  // namespace __sanitizer
diff --git a/lib/sanitizer_common/sanitizer_procmaps.h b/lib/sanitizer_common/sanitizer_procmaps.h
index 1b8ea7a..8df215d 100644
--- a/lib/sanitizer_common/sanitizer_procmaps.h
+++ b/lib/sanitizer_common/sanitizer_procmaps.h
@@ -24,7 +24,8 @@
  public:
   MemoryMappingLayout() {}
   bool GetObjectNameAndOffset(uptr addr, uptr *offset,
-                              char filename[], uptr filename_size) {
+                              char filename[], uptr filename_size,
+                              uptr *protection) {
     UNIMPLEMENTED();
   }
 };
@@ -42,28 +43,37 @@
  public:
   MemoryMappingLayout();
   bool Next(uptr *start, uptr *end, uptr *offset,
-            char filename[], uptr filename_size);
+            char filename[], uptr filename_size, uptr *protection);
   void Reset();
   // Gets the object file name and the offset in that object for a given
   // address 'addr'. Returns true on success.
   bool GetObjectNameAndOffset(uptr addr, uptr *offset,
-                              char filename[], uptr filename_size);
+                              char filename[], uptr filename_size,
+                              uptr *protection);
   // In some cases, e.g. when running under a sandbox on Linux, ASan is unable
   // to obtain the memory mappings. It should fall back to pre-cached data
   // instead of aborting.
   static void CacheMemoryMappings();
   ~MemoryMappingLayout();
 
+  // Memory protection masks.
+  static const uptr kProtectionRead = 1;
+  static const uptr kProtectionWrite = 2;
+  static const uptr kProtectionExecute = 4;
+  static const uptr kProtectionShared = 8;
+
  private:
   void LoadFromCache();
   // Default implementation of GetObjectNameAndOffset.
   // Quite slow, because it iterates through the whole process map for each
   // lookup.
   bool IterateForObjectNameAndOffset(uptr addr, uptr *offset,
-                                     char filename[], uptr filename_size) {
+                                     char filename[], uptr filename_size,
+                                     uptr *protection) {
     Reset();
     uptr start, end, file_offset;
-    for (int i = 0; Next(&start, &end, &file_offset, filename, filename_size);
+    for (int i = 0; Next(&start, &end, &file_offset, filename, filename_size,
+                         protection);
          i++) {
       if (addr >= start && addr < end) {
         // Don't subtract 'start' for the first entry:
@@ -96,7 +106,8 @@
 # elif defined __APPLE__
   template<u32 kLCSegment, typename SegmentCommand>
   bool NextSegmentLoad(uptr *start, uptr *end, uptr *offset,
-                       char filename[], uptr filename_size);
+                       char filename[], uptr filename_size,
+                       uptr *protection);
   int current_image_;
   u32 current_magic_;
   u32 current_filetype_;
diff --git a/lib/sanitizer_common/sanitizer_report_decorator.h b/lib/sanitizer_common/sanitizer_report_decorator.h
index 50a3ee5..49334d5 100644
--- a/lib/sanitizer_common/sanitizer_report_decorator.h
+++ b/lib/sanitizer_common/sanitizer_report_decorator.h
@@ -14,24 +14,26 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef SANITIZER_ALLOCATOR_H
-#define SANITIZER_ALLOCATOR_H
+#ifndef SANITIZER_REPORT_DECORATOR_H
+#define SANITIZER_REPORT_DECORATOR_H
 
 namespace __sanitizer {
 class AnsiColorDecorator {
  public:
   explicit AnsiColorDecorator(bool use_ansi_colors) : ansi_(use_ansi_colors) { }
-  const char *Black()        { return ansi_ ? "\033[1m\033[30m" : ""; }
-  const char *Red()          { return ansi_ ? "\033[1m\033[31m" : ""; }
-  const char *Green()        { return ansi_ ? "\033[1m\033[32m" : ""; }
-  const char *Yellow()       { return ansi_ ? "\033[1m\033[33m" : ""; }
-  const char *Blue()         { return ansi_ ? "\033[1m\033[34m" : ""; }
-  const char *Magenta()      { return ansi_ ? "\033[1m\033[35m" : ""; }
-  const char *Cyan()         { return ansi_ ? "\033[1m\033[36m" : ""; }
-  const char *White()        { return ansi_ ? "\033[1m\033[37m" : ""; }
-  const char *Default()      { return ansi_ ? "\033[1m\033[0m"  : ""; }
+  const char *Bold()    const { return ansi_ ? "\033[1m" : ""; }
+  const char *Black()   const { return ansi_ ? "\033[1m\033[30m" : ""; }
+  const char *Red()     const { return ansi_ ? "\033[1m\033[31m" : ""; }
+  const char *Green()   const { return ansi_ ? "\033[1m\033[32m" : ""; }
+  const char *Yellow()  const { return ansi_ ? "\033[1m\033[33m" : ""; }
+  const char *Blue()    const { return ansi_ ? "\033[1m\033[34m" : ""; }
+  const char *Magenta() const { return ansi_ ? "\033[1m\033[35m" : ""; }
+  const char *Cyan()    const { return ansi_ ? "\033[1m\033[36m" : ""; }
+  const char *White()   const { return ansi_ ? "\033[1m\033[37m" : ""; }
+  const char *Default() const { return ansi_ ? "\033[1m\033[0m"  : ""; }
  private:
   bool ansi_;
 };
 }  // namespace __sanitizer
-#endif  // SANITIZER_ALLOCATOR_H
+
+#endif  // SANITIZER_REPORT_DECORATOR_H
diff --git a/lib/sanitizer_common/sanitizer_stacktrace.cc b/lib/sanitizer_common/sanitizer_stacktrace.cc
index 6309b23..1b3a1f5 100644
--- a/lib/sanitizer_common/sanitizer_stacktrace.cc
+++ b/lib/sanitizer_common/sanitizer_stacktrace.cc
@@ -113,7 +113,8 @@
       PrintStackFramePrefix(frame_num, pc);
       uptr offset;
       if (proc_maps.GetObjectNameAndOffset(pc, &offset,
-                                           buff.data(), buff.size())) {
+                                           buff.data(), buff.size(),
+                                           /* protection */0)) {
         PrintModuleAndOffset(buff.data(), offset, strip_file_prefix);
       }
       Printf("\n");
diff --git a/lib/sanitizer_common/sanitizer_stoptheworld.h b/lib/sanitizer_common/sanitizer_stoptheworld.h
new file mode 100644
index 0000000..5dd3498
--- /dev/null
+++ b/lib/sanitizer_common/sanitizer_stoptheworld.h
@@ -0,0 +1,68 @@
+//===-- sanitizer_stoptheworld.h --------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Defines the StopTheWorld function which suspends the execution of the current
+// process and runs the user-supplied callback in the same address space.
+//
+//===----------------------------------------------------------------------===//
+#ifndef SANITIZER_STOPTHEWORLD_H
+#define SANITIZER_STOPTHEWORLD_H
+
+#include "sanitizer_internal_defs.h"
+#include "sanitizer_common.h"
+
+namespace __sanitizer {
+typedef int SuspendedThreadID;
+
+// Holds the list of suspended threads. Also provides register dumping
+// functionality (to be implemented).
+class SuspendedThreadsList {
+ public:
+  SuspendedThreadsList()
+    : thread_ids_(1024) {}
+  SuspendedThreadID GetThreadID(uptr index) {
+    CHECK_LT(index, thread_ids_.size());
+    return thread_ids_[index];
+  }
+  void DumpRegisters(uptr index) const {
+    UNIMPLEMENTED();
+  }
+  uptr thread_count() { return thread_ids_.size(); }
+  bool Contains(SuspendedThreadID thread_id) {
+    for (uptr i = 0; i < thread_ids_.size(); i++) {
+      if (thread_ids_[i] == thread_id)
+        return true;
+    }
+    return false;
+  }
+  void Append(SuspendedThreadID thread_id) {
+    thread_ids_.push_back(thread_id);
+  }
+
+ private:
+  InternalVector<SuspendedThreadID> thread_ids_;
+
+  // Prohibit copy and assign.
+  SuspendedThreadsList(const SuspendedThreadsList&);
+  void operator=(const SuspendedThreadsList&);
+};
+
+typedef void (*StopTheWorldCallback)(
+    const SuspendedThreadsList &suspended_threads_list,
+    void *argument);
+
+// Suspend all threads in the current process and run the callback on the list
+// of suspended threads. This function will resume the threads before returning.
+// The callback should not call any libc functions.
+// This function should NOT be called from multiple threads simultaneously.
+void StopTheWorld(StopTheWorldCallback callback, void *argument);
+
+}  // namespace __sanitizer
+
+#endif  // SANITIZER_STOPTHEWORLD_H
diff --git a/lib/sanitizer_common/sanitizer_stoptheworld_linux.cc b/lib/sanitizer_common/sanitizer_stoptheworld_linux.cc
new file mode 100644
index 0000000..e072780
--- /dev/null
+++ b/lib/sanitizer_common/sanitizer_stoptheworld_linux.cc
@@ -0,0 +1,327 @@
+//===-- sanitizer_stoptheworld_linux.cc -----------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// See sanitizer_stoptheworld.h for details.
+// This implementation was inspired by Markus Gutschke's linuxthreads.cc.
+//
+//===----------------------------------------------------------------------===//
+
+#ifdef __linux__
+
+#include "sanitizer_stoptheworld.h"
+
+#include <errno.h>
+#include <sched.h> // for clone
+#include <stddef.h>
+#include <sys/prctl.h> // for PR_* definitions
+#include <sys/ptrace.h> // for PTRACE_* definitions
+#include <sys/types.h> // for pid_t
+#include <sys/wait.h> // for signal-related stuff
+
+#include "sanitizer_common.h"
+#include "sanitizer_libc.h"
+#include "sanitizer_linux.h"
+#include "sanitizer_mutex.h"
+#include "sanitizer_placement_new.h"
+
+// This module works by spawning a Linux task which then attaches to every
+// thread in the caller process with ptrace. This suspends the threads, and
+// PTRACE_GETREGS can then be used to obtain their register state. The callback
+// supplied to StopTheWorld() is run in the tracer task while the threads are
+// suspended.
+// The tracer task must be placed in a different thread group for ptrace to
+// work, so it cannot be spawned as a pthread. Instead, we use the low-level
+// clone() interface (we want to share the address space with the caller
+// process, so we prefer clone() over fork()).
+//
+// We avoid the use of libc for two reasons:
+// 1. calling a library function while threads are suspended could cause a
+// deadlock, if one of the threads happens to be holding a libc lock;
+// 2. it's generally not safe to call libc functions from the tracer task,
+// because clone() does not set up a thread-local storage for it. Any
+// thread-local variables used by libc will be shared between the tracer task
+// and the thread which spawned it.
+//
+// We deal with this by replacing libc calls with calls to our own
+// implementations defined in sanitizer_libc.h and sanitizer_linux.h. However,
+// there are still some libc functions which are used here:
+//
+// * All of the system calls ultimately go through the libc syscall() function.
+// We're operating under the assumption that syscall()'s implementation does
+// not acquire any locks or use any thread-local data (except for the errno
+// variable, which we handle separately).
+//
+// * We lack custom implementations of sigfillset() and sigaction(), so we use
+// the libc versions instead. The same assumptions as above apply.
+//
+// * It is safe to call libc functions before the cloned thread is spawned or
+// after it has exited. The following functions are used in this manner:
+// sigdelset()
+// sigprocmask()
+// clone()
+
+COMPILER_CHECK(sizeof(SuspendedThreadID) == sizeof(pid_t));
+
+namespace __sanitizer {
+// This class handles thread suspending/unsuspending in the tracer thread.
+class ThreadSuspender {
+ public:
+  explicit ThreadSuspender(pid_t pid)
+    : pid_(pid) {
+      CHECK_GE(pid, 0);
+    }
+  bool SuspendAllThreads();
+  void ResumeAllThreads();
+  void KillAllThreads();
+  SuspendedThreadsList &suspended_threads_list() {
+    return suspended_threads_list_;
+  }
+ private:
+  SuspendedThreadsList suspended_threads_list_;
+  pid_t pid_;
+  bool SuspendThread(SuspendedThreadID thread_id);
+};
+
+bool ThreadSuspender::SuspendThread(SuspendedThreadID thread_id) {
+  // Are we already attached to this thread?
+  // Currently this check takes linear time, however the number of threads is
+  // usually small.
+  if (suspended_threads_list_.Contains(thread_id))
+    return false;
+  if (internal_ptrace(PTRACE_ATTACH, thread_id, NULL, NULL) != 0) {
+    // Either the thread is dead, or something prevented us from attaching.
+    // Log this event and move on.
+    Report("Could not attach to thread %d (errno %d).\n", thread_id, errno);
+    return false;
+  } else {
+    if (SanitizerVerbosity > 0)
+      Report("Attached to thread %d.\n", thread_id);
+    // The thread is not guaranteed to stop before ptrace returns, so we must
+    // wait on it.
+    int waitpid_status;
+    HANDLE_EINTR(waitpid_status, internal_waitpid(thread_id, NULL, __WALL));
+    if (waitpid_status < 0) {
+    // Got an ECHILD error. I don't think this situation is possible, but it
+      // doesn't hurt to report it.
+      Report("Waiting on thread %d failed, detaching (errno %d).\n", thread_id,
+             errno);
+      internal_ptrace(PTRACE_DETACH, thread_id, NULL, NULL);
+      return false;
+    }
+    suspended_threads_list_.Append(thread_id);
+    return true;
+  }
+}
+
+void ThreadSuspender::ResumeAllThreads() {
+  for (uptr i = 0; i < suspended_threads_list_.thread_count(); i++) {
+    pid_t tid = suspended_threads_list_.GetThreadID(i);
+    if (internal_ptrace(PTRACE_DETACH, tid, NULL, NULL) == 0) {
+      if (SanitizerVerbosity > 0)
+        Report("Detached from thread %d.\n", tid);
+    } else {
+      // Either the thread is dead, or we are already detached.
+      // The latter case is possible, for instance, if this function was called
+      // from a signal handler.
+      Report("Could not detach from thread %d (errno %d).\n", tid, errno);
+    }
+  }
+}
+
+void ThreadSuspender::KillAllThreads() {
+  for (uptr i = 0; i < suspended_threads_list_.thread_count(); i++)
+    internal_ptrace(PTRACE_KILL, suspended_threads_list_.GetThreadID(i),
+                    NULL, NULL);
+}
+
+bool ThreadSuspender::SuspendAllThreads() {
+  void *mem = InternalAlloc(sizeof(ThreadLister));
+  ThreadLister *thread_lister = new(mem) ThreadLister(pid_);
+  bool added_threads;
+  do {
+    // Run through the directory entries once.
+    added_threads = false;
+    pid_t tid = thread_lister->GetNextTID();
+    while (tid >= 0) {
+      if (SuspendThread(tid))
+        added_threads = true;
+      tid = thread_lister->GetNextTID();
+    }
+    if (thread_lister->error()) {
+      // Detach threads and fail.
+      ResumeAllThreads();
+      InternalFree(mem);
+      return false;
+    }
+    thread_lister->Reset();
+  } while (added_threads);
+  InternalFree(mem);
+  return true;
+}
+
+// Pointer to the ThreadSuspender instance for use in signal handler.
+static ThreadSuspender *thread_suspender_instance = NULL;
+
+// Signals that should not be blocked (this is used in the parent thread as well
+// as the tracer thread).
+static const int kUnblockedSignals[] = { SIGABRT, SIGILL, SIGFPE, SIGSEGV,
+                                         SIGBUS, SIGXCPU, SIGXFSZ };
+
+// Structure for passing arguments into the tracer thread.
+struct TracerThreadArgument {
+  StopTheWorldCallback callback;
+  void *callback_argument;
+  // The tracer thread waits on this mutex until the parent finishes its
+  // preparations.
+  BlockingMutex mutex;
+};
+
+// Signal handler to wake up suspended threads when the tracer thread dies.
+void TracerThreadSignalHandler(int signum, siginfo_t *siginfo, void *) {
+  if (thread_suspender_instance != NULL) {
+    if (signum == SIGABRT)
+      thread_suspender_instance->KillAllThreads();
+    else
+      thread_suspender_instance->ResumeAllThreads();
+  }
+  internal__exit((signum == SIGABRT) ? 1 : 2);
+}
+
+// Size of alternative stack for signal handlers in the tracer thread.
+static const int kHandlerStackSize = 4096;
+
+// This function will be run as a cloned task.
+static int TracerThread(void* argument) {
+  TracerThreadArgument *tracer_thread_argument =
+      (TracerThreadArgument *)argument;
+
+  // Wait for the parent thread to finish preparations.
+  tracer_thread_argument->mutex.Lock();
+  tracer_thread_argument->mutex.Unlock();
+
+  ThreadSuspender thread_suspender(internal_getppid());
+  // Global pointer for the signal handler.
+  thread_suspender_instance = &thread_suspender;
+
+  // Alternate stack for signal handling.
+  InternalScopedBuffer<char> handler_stack_memory(kHandlerStackSize);
+  struct sigaltstack handler_stack;
+  internal_memset(&handler_stack, 0, sizeof(handler_stack));
+  handler_stack.ss_sp = handler_stack_memory.data();
+  handler_stack.ss_size = kHandlerStackSize;
+  internal_sigaltstack(&handler_stack, NULL);
+
+  // Install our handler for fatal signals. Other signals should be blocked by
+  // the mask we inherited from the caller thread.
+  for (uptr signal_index = 0; signal_index < ARRAY_SIZE(kUnblockedSignals);
+       signal_index++) {
+    struct sigaction new_sigaction;
+    internal_memset(&new_sigaction, 0, sizeof(new_sigaction));
+    new_sigaction.sa_sigaction = TracerThreadSignalHandler;
+    new_sigaction.sa_flags = SA_ONSTACK | SA_SIGINFO;
+    sigfillset(&new_sigaction.sa_mask);
+    sigaction(kUnblockedSignals[signal_index], &new_sigaction, NULL);
+  }
+
+  int exit_code = 0;
+  if (!thread_suspender.SuspendAllThreads()) {
+    Report("Failed suspending threads.\n");
+    exit_code = 3;
+  } else {
+    tracer_thread_argument->callback(thread_suspender.suspended_threads_list(),
+                                     tracer_thread_argument->callback_argument);
+    thread_suspender.ResumeAllThreads();
+    exit_code = 0;
+  }
+  thread_suspender_instance = NULL;
+  handler_stack.ss_flags = SS_DISABLE;
+  internal_sigaltstack(&handler_stack, NULL);
+  return exit_code;
+}
+
+static sigset_t blocked_sigset;
+static sigset_t old_sigset;
+static struct sigaction old_sigactions[ARRAY_SIZE(kUnblockedSignals)];
+
+void StopTheWorld(StopTheWorldCallback callback, void *argument) {
+  // Block all signals that can be blocked safely, and install default handlers
+  // for the remaining signals.
+  // We cannot allow user-defined handlers to run while the ThreadSuspender
+  // thread is active, because they could conceivably call some libc functions
+  // which modify errno (which is shared between the two threads).
+  sigfillset(&blocked_sigset);
+  for (uptr signal_index = 0; signal_index < ARRAY_SIZE(kUnblockedSignals);
+       signal_index++) {
+    // Remove the signal from the set of blocked signals.
+    sigdelset(&blocked_sigset, kUnblockedSignals[signal_index]);
+    // Install the default handler.
+    struct sigaction new_sigaction;
+    internal_memset(&new_sigaction, 0, sizeof(new_sigaction));
+    new_sigaction.sa_handler = SIG_DFL;
+    sigfillset(&new_sigaction.sa_mask);
+    sigaction(kUnblockedSignals[signal_index], &new_sigaction,
+                    &old_sigactions[signal_index]);
+  }
+  int sigprocmask_status = sigprocmask(SIG_BLOCK, &blocked_sigset, &old_sigset);
+  CHECK_EQ(sigprocmask_status, 0); // sigprocmask should never fail
+  // Make this process dumpable. Processes that are not dumpable cannot be
+  // attached to.
+  int process_was_dumpable = internal_prctl(PR_GET_DUMPABLE, 0, 0, 0, 0);
+  if (!process_was_dumpable)
+    internal_prctl(PR_SET_DUMPABLE, 1, 0, 0, 0);
+  // Prepare the arguments for TracerThread.
+  struct TracerThreadArgument tracer_thread_argument;
+  tracer_thread_argument.callback = callback;
+  tracer_thread_argument.callback_argument = argument;
+  // Block the execution of TracerThread until after we have set ptrace
+  // permissions.
+  tracer_thread_argument.mutex.Lock();
+  // The tracer thread will run on the same stack, so we must reserve some
+  // stack space for the caller thread to run in as it waits on the tracer.
+  const uptr kReservedStackSize = 4096;
+  // Get a 16-byte aligned pointer for stack.
+  int a_local_variable __attribute__((__aligned__(16)));
+  pid_t tracer_pid = clone(TracerThread,
+                          (char *)&a_local_variable - kReservedStackSize,
+                          CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_UNTRACED,
+                          &tracer_thread_argument, 0, 0, 0);
+  if (tracer_pid < 0) {
+    Report("Failed spawning a tracer thread (errno %d).\n", errno);
+    tracer_thread_argument.mutex.Unlock();
+  } else {
+    // On some systems we have to explicitly declare that we want to be traced
+    // by the tracer thread.
+#ifdef PR_SET_PTRACER
+    internal_prctl(PR_SET_PTRACER, tracer_pid, 0, 0, 0);
+#endif
+    // Allow the tracer thread to start.
+    tracer_thread_argument.mutex.Unlock();
+    // Since errno is shared between this thread and the tracer thread, we
+    // must avoid using errno while the tracer thread is running.
+    // At this point, any signal will either be blocked or kill us, so waitpid
+    // should never return (and set errno) while the tracer thread is alive.
+    int waitpid_status = internal_waitpid(tracer_pid, NULL, __WALL);
+    if (waitpid_status < 0)
+      Report("Waiting on the tracer thread failed (errno %d).\n", errno);
+  }
+  // Restore the dumpable flag.
+  if (!process_was_dumpable)
+    internal_prctl(PR_SET_DUMPABLE, 0, 0, 0, 0);
+  // Restore the signal handlers.
+  for (uptr signal_index = 0; signal_index < ARRAY_SIZE(kUnblockedSignals);
+       signal_index++) {
+    sigaction(kUnblockedSignals[signal_index],
+              &old_sigactions[signal_index], NULL);
+  }
+  sigprocmask(SIG_SETMASK, &old_sigset, &old_sigset);
+}
+
+}  // namespace __sanitizer
+
+#endif  // __linux__
diff --git a/lib/sanitizer_common/sanitizer_thread_registry.cc b/lib/sanitizer_common/sanitizer_thread_registry.cc
new file mode 100644
index 0000000..3d246fe
--- /dev/null
+++ b/lib/sanitizer_common/sanitizer_thread_registry.cc
@@ -0,0 +1,256 @@
+//===-- sanitizer_thread_registry.cc --------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is shared between sanitizer tools.
+//
+// General thread bookkeeping functionality.
+//===----------------------------------------------------------------------===//
+
+#include "sanitizer_thread_registry.h"
+
+namespace __sanitizer {
+
+ThreadContextBase::ThreadContextBase(u32 tid)
+    : tid(tid), unique_id(0), os_id(0), user_id(0), status(ThreadStatusInvalid),
+      detached(false), reuse_count(0), parent_tid(0), next(0) {
+  name[0] = '\0';
+}
+
+#ifndef SANITIZER_GO
+ThreadContextBase::~ThreadContextBase() {
+  CHECK(0);
+}
+#endif
+
+void ThreadContextBase::SetName(const char *new_name) {
+  name[0] = '\0';
+  if (new_name) {
+    internal_strncpy(name, new_name, sizeof(name));
+    name[sizeof(name) - 1] = '\0';
+  }
+}
+
+void ThreadContextBase::SetDead() {
+  CHECK(status == ThreadStatusRunning ||
+        status == ThreadStatusFinished);
+  status = ThreadStatusDead;
+  user_id = 0;
+  OnDead();
+}
+
+void ThreadContextBase::SetJoined(void *arg) {
+  // FIXME(dvyukov): print message and continue (it's user error).
+  CHECK_EQ(false, detached);
+  CHECK_EQ(ThreadStatusFinished, status);
+  status = ThreadStatusDead;
+  user_id = 0;
+  OnJoined(arg);
+}
+
+void ThreadContextBase::SetFinished() {
+  if (!detached)
+    status = ThreadStatusFinished;
+  OnFinished();
+}
+
+void ThreadContextBase::SetStarted(uptr _os_id, void *arg) {
+  status = ThreadStatusRunning;
+  os_id = _os_id;
+  OnStarted(arg);
+}
+
+void ThreadContextBase::SetCreated(uptr _user_id, u64 _unique_id,
+                                   bool _detached, u32 _parent_tid, void *arg) {
+  status = ThreadStatusCreated;
+  user_id = _user_id;
+  unique_id = _unique_id;
+  detached = _detached;
+  // Parent tid makes no sense for the main thread.
+  if (tid != 0)
+    parent_tid = _parent_tid;
+  OnCreated(arg);
+}
+
+void ThreadContextBase::Reset(void *arg) {
+  status = ThreadStatusInvalid;
+  reuse_count++;
+  SetName(0);
+  OnReset(arg);
+}
+
+// ThreadRegistry implementation.
+
+const u32 ThreadRegistry::kUnknownTid = -1U;
+
+ThreadRegistry::ThreadRegistry(ThreadContextFactory factory, u32 max_threads,
+                               u32 thread_quarantine_size)
+    : context_factory_(factory),
+      max_threads_(max_threads),
+      thread_quarantine_size_(thread_quarantine_size),
+      mtx_(),
+      n_contexts_(0),
+      total_threads_(0),
+      alive_threads_(0),
+      max_alive_threads_(0),
+      running_threads_(0) {
+  threads_ = (ThreadContextBase **)MmapOrDie(max_threads_ * sizeof(threads_[0]),
+                                             "ThreadRegistry");
+  dead_threads_.clear();
+}
+
+void ThreadRegistry::GetNumberOfThreads(uptr *total, uptr *running,
+                                        uptr *alive) {
+  BlockingMutexLock l(&mtx_);
+  if (total) *total = n_contexts_;
+  if (running) *running = running_threads_;
+  if (alive) *alive = alive_threads_;
+}
+
+uptr ThreadRegistry::GetMaxAliveThreads() {
+  BlockingMutexLock l(&mtx_);
+  return max_alive_threads_;
+}
+
+u32 ThreadRegistry::CreateThread(uptr user_id, bool detached, u32 parent_tid,
+                                 void *arg) {
+  BlockingMutexLock l(&mtx_);
+  u32 tid = kUnknownTid;
+  ThreadContextBase *tctx = 0;
+  if (dead_threads_.size() > thread_quarantine_size_ ||
+      n_contexts_ >= max_threads_) {
+    // Reusing old thread descriptor and tid.
+    if (dead_threads_.size() == 0) {
+      Report("%s: Thread limit (%u threads) exceeded. Dying.\n",
+             SanitizerToolName, max_threads_);
+      Die();
+    }
+    tctx = dead_threads_.front();
+    dead_threads_.pop_front();
+    CHECK_EQ(ThreadStatusDead, tctx->status);
+    tctx->Reset(arg);
+    tid = tctx->tid;
+  } else {
+    // Allocate new thread context and tid.
+    tid = n_contexts_++;
+    tctx = context_factory_(tid);
+    threads_[tid] = tctx;
+  }
+  CHECK_NE(tctx, 0);
+  CHECK_NE(tid, kUnknownTid);
+  CHECK_LT(tid, max_threads_);
+  CHECK_EQ(tctx->status, ThreadStatusInvalid);
+  alive_threads_++;
+  if (max_alive_threads_ < alive_threads_) {
+    max_alive_threads_++;
+    CHECK_EQ(alive_threads_, max_alive_threads_);
+  }
+  tctx->SetCreated(user_id, total_threads_++, detached,
+                   parent_tid, arg);
+  return tid;
+}
+
+void ThreadRegistry::RunCallbackForEachThreadLocked(ThreadCallback cb,
+                                                    void *arg) {
+  CheckLocked();
+  for (u32 tid = 0; tid < n_contexts_; tid++) {
+    ThreadContextBase *tctx = threads_[tid];
+    if (tctx == 0)
+      continue;
+    cb(tctx, arg);
+  }
+}
+
+u32 ThreadRegistry::FindThread(FindThreadCallback cb, void *arg) {
+  BlockingMutexLock l(&mtx_);
+  for (u32 tid = 0; tid < n_contexts_; tid++) {
+    ThreadContextBase *tctx = threads_[tid];
+    if (tctx != 0 && cb(tctx, arg))
+      return tctx->tid;
+  }
+  return kUnknownTid;
+}
+
+ThreadContextBase *
+ThreadRegistry::FindThreadContextLocked(FindThreadCallback cb, void *arg) {
+  CheckLocked();
+  for (u32 tid = 0; tid < n_contexts_; tid++) {
+    ThreadContextBase *tctx = threads_[tid];
+    if (tctx != 0 && cb(tctx, arg))
+      return tctx;
+  }
+  return 0;
+}
+
+void ThreadRegistry::SetThreadName(u32 tid, const char *name) {
+  BlockingMutexLock l(&mtx_);
+  CHECK_LT(tid, n_contexts_);
+  ThreadContextBase *tctx = threads_[tid];
+  CHECK_NE(tctx, 0);
+  CHECK_EQ(ThreadStatusRunning, tctx->status);
+  tctx->SetName(name);
+}
+
+void ThreadRegistry::DetachThread(u32 tid) {
+  BlockingMutexLock l(&mtx_);
+  CHECK_LT(tid, n_contexts_);
+  ThreadContextBase *tctx = threads_[tid];
+  CHECK_NE(tctx, 0);
+  if (tctx->status == ThreadStatusInvalid) {
+    Report("%s: Detach of non-existent thread\n", SanitizerToolName);
+    return;
+  }
+  if (tctx->status == ThreadStatusFinished) {
+    tctx->SetDead();
+    dead_threads_.push_back(tctx);
+  } else {
+    tctx->detached = true;
+  }
+}
+
+void ThreadRegistry::JoinThread(u32 tid, void *arg) {
+  BlockingMutexLock l(&mtx_);
+  CHECK_LT(tid, n_contexts_);
+  ThreadContextBase *tctx = threads_[tid];
+  CHECK_NE(tctx, 0);
+  if (tctx->status == ThreadStatusInvalid) {
+    Report("%s: Join of non-existent thread\n", SanitizerToolName);
+    return;
+  }
+  tctx->SetJoined(arg);
+  dead_threads_.push_back(tctx);
+}
+
+void ThreadRegistry::FinishThread(u32 tid) {
+  BlockingMutexLock l(&mtx_);
+  CHECK_GT(alive_threads_, 0);
+  alive_threads_--;
+  CHECK_GT(running_threads_, 0);
+  running_threads_--;
+  CHECK_LT(tid, n_contexts_);
+  ThreadContextBase *tctx = threads_[tid];
+  CHECK_NE(tctx, 0);
+  CHECK_EQ(ThreadStatusRunning, tctx->status);
+  tctx->SetFinished();
+  if (tctx->detached) {
+    tctx->SetDead();
+    dead_threads_.push_back(tctx);
+  }
+}
+
+void ThreadRegistry::StartThread(u32 tid, uptr os_id, void *arg) {
+  BlockingMutexLock l(&mtx_);
+  running_threads_++;
+  CHECK_LT(tid, n_contexts_);
+  ThreadContextBase *tctx = threads_[tid];
+  CHECK_NE(tctx, 0);
+  CHECK_EQ(ThreadStatusCreated, tctx->status);
+  tctx->SetStarted(os_id, arg);
+}
+
+}  // namespace __sanitizer
diff --git a/lib/sanitizer_common/sanitizer_thread_registry.h b/lib/sanitizer_common/sanitizer_thread_registry.h
new file mode 100644
index 0000000..e2ee8f8
--- /dev/null
+++ b/lib/sanitizer_common/sanitizer_thread_registry.h
@@ -0,0 +1,143 @@
+//===-- sanitizer_thread_registry.h -----------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is shared between sanitizer tools.
+//
+// General thread bookkeeping functionality.
+//===----------------------------------------------------------------------===//
+
+#ifndef SANITIZER_THREAD_REGISTRY_H
+#define SANITIZER_THREAD_REGISTRY_H
+
+#include "sanitizer_common.h"
+#include "sanitizer_list.h"
+#include "sanitizer_mutex.h"
+
+namespace __sanitizer {
+
+enum ThreadStatus {
+  ThreadStatusInvalid,   // Non-existent thread, data is invalid.
+  ThreadStatusCreated,   // Created but not yet running.
+  ThreadStatusRunning,   // The thread is currently running.
+  ThreadStatusFinished,  // Joinable thread is finished but not yet joined.
+  ThreadStatusDead       // Joined, but some info is still available.
+};
+
+// Generic thread context. Specific sanitizer tools may inherit from it.
+// If the thread is dead, its context may optionally be reused for a new thread.
+class ThreadContextBase {
+ public:
+  explicit ThreadContextBase(u32 tid);
+#ifndef SANITIZER_GO  // Go does not have libstdc++
+  virtual
+#endif
+  ~ThreadContextBase();
+
+  const u32 tid;  // Thread ID. Main thread should have tid = 0.
+  u64 unique_id;  // Unique thread ID.
+  uptr os_id;     // PID (used for reporting).
+  uptr user_id;   // Some opaque user thread id (e.g. pthread_t).
+  char name[64];  // As annotated by user.
+
+  ThreadStatus status;
+  bool detached;
+  int reuse_count;
+
+  u32 parent_tid;
+  ThreadContextBase *next;  // For storing thread contexts in a list.
+
+  void SetName(const char *new_name);
+
+  void SetDead();
+  void SetJoined(void *arg);
+  void SetFinished();
+  void SetStarted(uptr _os_id, void *arg);
+  void SetCreated(uptr _user_id, u64 _unique_id, bool _detached,
+                  u32 _parent_tid, void *arg);
+  void Reset(void *arg);
+
+  // The following methods may be overridden by subclasses.
+  // Some of them take an opaque arg that may optionally be used
+  // by subclasses.
+  virtual void OnDead() {}
+  virtual void OnJoined(void *arg) {}
+  virtual void OnFinished() {}
+  virtual void OnStarted(void *arg) {}
+  virtual void OnCreated(void *arg) {}
+  virtual void OnReset(void *arg) {}
+};
+
+typedef ThreadContextBase* (*ThreadContextFactory)(u32 tid);
+
+class ThreadRegistry {
+ public:
+  static const u32 kUnknownTid;
+
+  ThreadRegistry(ThreadContextFactory factory, u32 max_threads,
+                 u32 thread_quarantine_size);
+  void GetNumberOfThreads(uptr *total = 0, uptr *running = 0, uptr *alive = 0);
+  uptr GetMaxAliveThreads();
+
+  void Lock() { mtx_.Lock(); }
+  void CheckLocked() { mtx_.CheckLocked(); }
+  void Unlock() { mtx_.Unlock(); }
+
+  // Should be guarded by ThreadRegistryLock.
+  ThreadContextBase *GetThreadLocked(u32 tid) {
+    DCHECK_LT(tid, n_contexts_);
+    return threads_[tid];
+  }
+
+  u32 CreateThread(uptr user_id, bool detached, u32 parent_tid, void *arg);
+
+  typedef void (*ThreadCallback)(ThreadContextBase *tctx, void *arg);
+  // Invokes callback with a specified arg for each thread context.
+  // Should be guarded by ThreadRegistryLock.
+  void RunCallbackForEachThreadLocked(ThreadCallback cb, void *arg);
+
+  typedef bool (*FindThreadCallback)(ThreadContextBase *tctx, void *arg);
+  // Finds a thread using the provided callback. Returns kUnknownTid if no
+  // thread is found.
+  u32 FindThread(FindThreadCallback cb, void *arg);
+  // Should be guarded by ThreadRegistryLock. Returns 0 if no thread
+  // is found.
+  ThreadContextBase *FindThreadContextLocked(FindThreadCallback cb,
+                                             void *arg);
+
+  void SetThreadName(u32 tid, const char *name);
+  void DetachThread(u32 tid);
+  void JoinThread(u32 tid, void *arg);
+  void FinishThread(u32 tid);
+  void StartThread(u32 tid, uptr os_id, void *arg);
+
+ private:
+  const ThreadContextFactory context_factory_;
+  const u32 max_threads_;
+  const u32 thread_quarantine_size_;
+
+  BlockingMutex mtx_;
+
+  u32 n_contexts_;      // Number of created thread contexts,
+                        // at most max_threads_.
+  u64 total_threads_;   // Total number of created threads. May be greater than
+                        // max_threads_ if contexts were reused.
+  uptr alive_threads_;  // Created or running.
+  uptr max_alive_threads_;
+  uptr running_threads_;
+
+  ThreadContextBase **threads_;  // Array of thread contexts is leaked.
+  IntrusiveList<ThreadContextBase> dead_threads_;
+};
+
+typedef GenericScopedLock<ThreadRegistry> ThreadRegistryLock;
+
+}  // namespace __sanitizer
+
+#endif  // SANITIZER_THREAD_REGISTRY_H
+
diff --git a/lib/sanitizer_common/sanitizer_win.cc b/lib/sanitizer_common/sanitizer_win.cc
index 40af4e3..77afa47 100644
--- a/lib/sanitizer_common/sanitizer_win.cc
+++ b/lib/sanitizer_common/sanitizer_win.cc
@@ -111,19 +111,38 @@
   UNIMPLEMENTED();
 }
 
+static const int kMaxEnvNameLength = 128;
+static const int kMaxEnvValueLength = 32767;
+
+namespace {
+
+struct EnvVariable {
+  char name[kMaxEnvNameLength];
+  char value[kMaxEnvValueLength];
+};
+
+}  // namespace
+
+static const int kEnvVariables = 5;
+static EnvVariable env_vars[kEnvVariables];
+static int num_env_vars;
+
 const char *GetEnv(const char *name) {
-  static char env_buffer[32767] = {};
-
-  // Note: this implementation stores the result in a static buffer so we only
-  // allow it to be called just once.
-  static bool called_once = false;
-  if (called_once)
-    UNIMPLEMENTED();
-  called_once = true;
-
-  DWORD rv = GetEnvironmentVariableA(name, env_buffer, sizeof(env_buffer));
-  if (rv > 0 && rv < sizeof(env_buffer))
-    return env_buffer;
+  // Note: this implementation caches the values of the environment variables
+  // and limits their quantity.
+  for (int i = 0; i < num_env_vars; i++) {
+    if (0 == internal_strcmp(name, env_vars[i].name))
+      return env_vars[i].value;
+  }
+  CHECK_LT(num_env_vars, kEnvVariables);
+  DWORD rv = GetEnvironmentVariableA(name, env_vars[num_env_vars].value,
+                                     kMaxEnvValueLength);
+  if (rv > 0 && rv < kMaxEnvValueLength) {
+    CHECK_LT(internal_strlen(name), kMaxEnvNameLength);
+    internal_strncpy(env_vars[num_env_vars].name, name, kMaxEnvNameLength);
+    num_env_vars++;
+    return env_vars[num_env_vars - 1].value;
+  }
   return 0;
 }
 
@@ -270,6 +289,12 @@
   owner_ = LOCK_READY;
 }
 
+BlockingMutex::BlockingMutex() {
+  CHECK(sizeof(CRITICAL_SECTION) <= sizeof(opaque_storage_));
+  InitializeCriticalSection((LPCRITICAL_SECTION)opaque_storage_);
+  owner_ = LOCK_READY;
+}
+
 void BlockingMutex::Lock() {
   if (owner_ == LOCK_UNINITIALIZED) {
     // FIXME: hm, global BlockingMutex objects are not initialized?!?
@@ -291,6 +316,17 @@
   LeaveCriticalSection((LPCRITICAL_SECTION)opaque_storage_);
 }
 
+void BlockingMutex::CheckLocked() {
+  CHECK_EQ(owner_, GetThreadSelf());
+}
+
+uptr GetTlsSize() {
+  return 0;
+}
+
+void InitTlsSize() {
+}
+
 }  // namespace __sanitizer
 
 #endif  // _WIN32
diff --git a/lib/sanitizer_common/tests/CMakeLists.txt b/lib/sanitizer_common/tests/CMakeLists.txt
index 111dfee..346e010 100644
--- a/lib/sanitizer_common/tests/CMakeLists.txt
+++ b/lib/sanitizer_common/tests/CMakeLists.txt
@@ -5,13 +5,16 @@
   sanitizer_common_test.cc
   sanitizer_flags_test.cc
   sanitizer_libc_test.cc
+  sanitizer_linux_test.cc
   sanitizer_list_test.cc
   sanitizer_mutex_test.cc
   sanitizer_printf_test.cc
   sanitizer_scanf_interceptor_test.cc
   sanitizer_stackdepot_test.cc
   sanitizer_stacktrace_test.cc
+  sanitizer_stoptheworld_test.cc
   sanitizer_test_main.cc
+  sanitizer_thread_registry_test.cc
   )
 
 set(SANITIZER_TEST_HEADERS)
@@ -19,6 +22,18 @@
   list(APPEND SANITIZER_TEST_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../${header})
 endforeach()
 
+set(SANITIZER_TEST_CFLAGS_COMMON
+  ${COMPILER_RT_GTEST_INCLUDE_CFLAGS}
+  -I${COMPILER_RT_SOURCE_DIR}/include
+  -I${COMPILER_RT_SOURCE_DIR}/lib
+  -I${COMPILER_RT_SOURCE_DIR}/lib/sanitizer_common
+  -DGTEST_HAS_RTTI=0
+  -O2 -g -fno-rtti
+  -Wall -Werror -Werror=sign-compare)
+
+set(SANITIZER_TEST_LINK_FLAGS_COMMON
+  -lstdc++ -ldl)
+
 include_directories(..)
 include_directories(../..)
 
@@ -50,18 +65,12 @@
   get_target_flags_for_arch(${arch} TARGET_FLAGS)
   set(SANITIZER_TEST_SOURCES ${SANITIZER_UNITTESTS}
                              ${COMPILER_RT_GTEST_SOURCE})
-  set(SANITIZER_TEST_CFLAGS ${COMPILER_RT_GTEST_INCLUDE_CFLAGS}
-                            -I${COMPILER_RT_SOURCE_DIR}/include
-                            -I${COMPILER_RT_SOURCE_DIR}/lib
-                            -I${COMPILER_RT_SOURCE_DIR}/lib/sanitizer_common
-                            -O2 -g -Wall -Werror ${TARGET_FLAGS})
-  set(SANITIZER_TEST_LINK_FLAGS -lstdc++ -lpthread ${TARGET_FLAGS})
   set(SANITIZER_TEST_OBJECTS)
   foreach(source ${SANITIZER_TEST_SOURCES})
     get_filename_component(basename ${source} NAME)
     set(output_obj "${basename}.${arch}.o")
     clang_compile(${output_obj} ${source}
-                  CFLAGS ${SANITIZER_TEST_CFLAGS}
+                  CFLAGS ${SANITIZER_TEST_CFLAGS_COMMON} ${TARGET_FLAGS}
                   DEPS gtest ${SANITIZER_RUNTIME_LIBRARIES}
                        ${SANITIZER_TEST_HEADERS})
     list(APPEND SANITIZER_TEST_OBJECTS ${output_obj})
@@ -74,7 +83,8 @@
                        OBJECTS ${SANITIZER_TEST_OBJECTS}
                                ${SANITIZER_COMMON_LIB_NAME}
                        DEPS ${SANITIZER_TEST_OBJECTS} ${SANITIZER_COMMON_LIB}
-                       LINK_FLAGS ${SANITIZER_TEST_LINK_FLAGS})
+                       LINK_FLAGS ${SANITIZER_TEST_LINK_FLAGS_COMMON}
+                                  -lpthread ${TARGET_FLAGS})
 endmacro()
 
 if(COMPILER_RT_CAN_EXECUTE_TESTS)
@@ -119,20 +129,14 @@
   add_executable(SanitizerTest
     ${SANITIZER_UNITTESTS}
     ${COMPILER_RT_GTEST_SOURCE}
-    $<TARGET_OBJECTS:RTSanitizerCommon.arm.android>
-    )
+    $<TARGET_OBJECTS:RTSanitizerCommon.arm.android>)
   set_target_compile_flags(SanitizerTest
     ${SANITIZER_COMMON_CFLAGS}
-    ${COMPILER_RT_GTEST_INCLUDE_CFLAGS}
-    -I${COMPILER_RT_SOURCE_DIR}/include
-    -I${COMPILER_RT_SOURCE_DIR}/lib
-    -I${COMPILER_RT_SOURCE_DIR}/lib/sanitizer_common
-    -O2 -g
-    )
+    ${SANITIZER_TEST_CFLAGS_COMMON})
   # Setup correct output directory and link flags.
   set_target_properties(SanitizerTest PROPERTIES
     RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-  set_target_link_flags(SanitizerTest ${SANITIZER_TEST_LINK_FLAGS})
+  set_target_link_flags(SanitizerTest ${SANITIZER_TEST_LINK_FLAGS_COMMON})
   # Add unit test to test suite.
   add_dependencies(SanitizerUnitTests SanitizerTest)
 endif()
diff --git a/lib/sanitizer_common/tests/sanitizer_allocator_test.cc b/lib/sanitizer_common/tests/sanitizer_allocator_test.cc
index 273b002..a8747a5 100644
--- a/lib/sanitizer_common/tests/sanitizer_allocator_test.cc
+++ b/lib/sanitizer_common/tests/sanitizer_allocator_test.cc
@@ -22,6 +22,7 @@
 #include <pthread.h>
 #include <algorithm>
 #include <vector>
+#include <set>
 
 // Too slow for debug build
 #if TSAN_DEBUG == 0
@@ -115,6 +116,12 @@
     CHECK_EQ(last_total_allocated, total_allocated);
   }
 
+  // Check that GetBlockBegin never crashes.
+  for (uptr x = 0, step = kAddressSpaceSize / 100000;
+       x < kAddressSpaceSize - step; x += step)
+    if (a->PointerIsMine(reinterpret_cast<void *>(x)))
+      Ident(a->GetBlockBegin(reinterpret_cast<void *>(x)));
+
   a->TestOnlyUnmap();
   delete a;
 }
@@ -197,7 +204,7 @@
   cache.Init(0);
   AllocatorStats stats;
   stats.Init();
-  a->AllocateBatch(&stats, &cache, 64);
+  a->AllocateBatch(&stats, &cache, 32);
   EXPECT_EQ(TestMapUnmapCallback::map_count, 3);  // State + alloc + metadata.
   a->TestOnlyUnmap();
   EXPECT_EQ(TestMapUnmapCallback::unmap_count, 1);  // The whole thing.
@@ -219,7 +226,7 @@
   cache.Init(0);
   AllocatorStats stats;
   stats.Init();
-  a->AllocateBatch(&stats, &cache, 64);
+  a->AllocateBatch(&stats, &cache, 32);
   EXPECT_EQ(TestMapUnmapCallback::map_count, 2);  // alloc.
   a->TestOnlyUnmap();
   EXPECT_EQ(TestMapUnmapCallback::unmap_count, 2);  // The whole thing + alloc.
@@ -252,7 +259,7 @@
   AllocatorStats stats;
   stats.Init();
   for (int i = 0; i < 1000000; i++) {
-    a.AllocateBatch(&stats, &cache, 64);
+    a.AllocateBatch(&stats, &cache, 52);
   }
 
   a.TestOnlyUnmap();
@@ -482,6 +489,42 @@
 
   a.TestOnlyUnmap();
 }
+
+// Struct which is allocated to pass info to new threads.  The new thread frees
+// it.
+struct NewThreadParams {
+  AllocatorCache *thread_cache;
+  AllocatorCache::Allocator *allocator;
+  uptr class_id;
+};
+
+// Called in a new thread.  Just frees its argument.
+static void *DeallocNewThreadWorker(void *arg) {
+  NewThreadParams *params = reinterpret_cast<NewThreadParams*>(arg);
+  params->thread_cache->Deallocate(params->allocator, params->class_id, params);
+  return NULL;
+}
+
+// The allocator cache is supposed to be POD and zero initialized.  We should be
+// able to call Deallocate on a zeroed cache, and it will self-initialize.
+TEST(Allocator, AllocatorCacheDeallocNewThread) {
+  AllocatorCache::Allocator allocator;
+  allocator.Init();
+  AllocatorCache main_cache;
+  AllocatorCache child_cache;
+  memset(&main_cache, 0, sizeof(main_cache));
+  memset(&child_cache, 0, sizeof(child_cache));
+
+  uptr class_id = DefaultSizeClassMap::ClassID(sizeof(NewThreadParams));
+  NewThreadParams *params = reinterpret_cast<NewThreadParams*>(
+      main_cache.Allocate(&allocator, class_id));
+  params->thread_cache = &child_cache;
+  params->allocator = &allocator;
+  params->class_id = class_id;
+  pthread_t t;
+  EXPECT_EQ(0, pthread_create(&t, 0, DeallocNewThreadWorker, params));
+  EXPECT_EQ(0, pthread_join(t, 0));
+}
 #endif
 
 TEST(Allocator, Basic) {
@@ -523,4 +566,94 @@
   }
 }
 
+class IterationTestCallback {
+ public:
+  explicit IterationTestCallback(std::set<void *> *chunks)
+    : chunks_(chunks) {}
+  void operator()(void *chunk) const {
+    chunks_->insert(chunk);
+  }
+ private:
+  std::set<void *> *chunks_;
+};
+
+template <class Allocator>
+void TestSizeClassAllocatorIteration() {
+  Allocator *a = new Allocator;
+  a->Init();
+  SizeClassAllocatorLocalCache<Allocator> cache;
+  memset(&cache, 0, sizeof(cache));
+  cache.Init(0);
+
+  static const uptr sizes[] = {1, 16, 30, 40, 100, 1000, 10000,
+    50000, 60000, 100000, 120000, 300000, 500000, 1000000, 2000000};
+
+  std::vector<void *> allocated;
+
+  // Allocate a bunch of chunks.
+  for (uptr s = 0; s < ARRAY_SIZE(sizes); s++) {
+    uptr size = sizes[s];
+    if (!a->CanAllocate(size, 1)) continue;
+    // printf("s = %ld\n", size);
+    uptr n_iter = std::max((uptr)6, 80000 / size);
+    // fprintf(stderr, "size: %ld iter: %ld\n", size, n_iter);
+    for (uptr j = 0; j < n_iter; j++) {
+      uptr class_id0 = Allocator::SizeClassMapT::ClassID(size);
+      void *x = cache.Allocate(a, class_id0);
+      allocated.push_back(x);
+    }
+  }
+
+  std::set<void *> reported_chunks;
+  IterationTestCallback callback(&reported_chunks);
+  a->ForceLock();
+  a->ForEachChunk(callback);
+  a->ForceUnlock();
+
+  for (uptr i = 0; i < allocated.size(); i++) {
+    // Don't use EXPECT_NE. Reporting the first mismatch is enough.
+    ASSERT_NE(reported_chunks.find(allocated[i]), reported_chunks.end());
+  }
+
+  a->TestOnlyUnmap();
+  delete a;
+}
+
+#if SANITIZER_WORDSIZE == 64
+TEST(SanitizerCommon, SizeClassAllocator64Iteration) {
+  TestSizeClassAllocatorIteration<Allocator64>();
+}
+#endif
+
+TEST(SanitizerCommon, SizeClassAllocator32Iteration) {
+  TestSizeClassAllocatorIteration<Allocator32Compact>();
+}
+
+
+TEST(SanitizerCommon, LargeMmapAllocatorIteration) {
+  LargeMmapAllocator<> a;
+  a.Init();
+  AllocatorStats stats;
+  stats.Init();
+
+  static const uptr kNumAllocs = 1000;
+  char *allocated[kNumAllocs];
+  static const uptr size = 40;
+  // Allocate some.
+  for (uptr i = 0; i < kNumAllocs; i++) {
+    allocated[i] = (char *)a.Allocate(&stats, size, 1);
+  }
+
+  std::set<void *> reported_chunks;
+  IterationTestCallback callback(&reported_chunks);
+  a.ForceLock();
+  a.ForEachChunk(callback);
+  a.ForceUnlock();
+
+  for (uptr i = 0; i < kNumAllocs; i++) {
+    // Don't use EXPECT_NE. Reporting the first mismatch is enough.
+    ASSERT_NE(reported_chunks.find(allocated[i]), reported_chunks.end());
+  }
+}
+
 #endif  // #if TSAN_DEBUG==0
diff --git a/lib/sanitizer_common/tests/sanitizer_common_test.cc b/lib/sanitizer_common/tests/sanitizer_common_test.cc
index 1ffcde2..0a777bd 100644
--- a/lib/sanitizer_common/tests/sanitizer_common_test.cc
+++ b/lib/sanitizer_common/tests/sanitizer_common_test.cc
@@ -99,11 +99,16 @@
 TEST(SanitizerCommon, InternalVector) {
   InternalVector<uptr> vector(1);
   for (uptr i = 0; i < 100; i++) {
-    EXPECT_EQ(vector.size(), i);
+    EXPECT_EQ(i, vector.size());
     vector.push_back(i);
   }
   for (uptr i = 0; i < 100; i++) {
-    EXPECT_EQ(vector[i], i);
+    EXPECT_EQ(i, vector[i]);
+  }
+  for (int i = 99; i >= 0; i--) {
+    EXPECT_EQ((uptr)i, vector.back());
+    vector.pop_back();
+    EXPECT_EQ((uptr)i, vector.size());
   }
 }
 
diff --git a/lib/sanitizer_common/tests/sanitizer_linux_test.cc b/lib/sanitizer_common/tests/sanitizer_linux_test.cc
new file mode 100644
index 0000000..035c11f
--- /dev/null
+++ b/lib/sanitizer_common/tests/sanitizer_linux_test.cc
@@ -0,0 +1,190 @@
+//===-- sanitizer_linux_test.cc -------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Tests for sanitizer_linux.h
+//
+//===----------------------------------------------------------------------===//
+
+#ifdef __linux__
+
+#include "sanitizer_common/sanitizer_linux.h"
+#include "gtest/gtest.h"
+
+#include "sanitizer_common/sanitizer_common.h"
+
+#include <pthread.h>
+#include <sched.h>
+
+#include <algorithm>
+#include <vector>
+
+namespace __sanitizer {
+
+struct TidReporterArgument {
+  TidReporterArgument() {
+    pthread_mutex_init(&terminate_thread_mutex, NULL);
+    pthread_mutex_init(&tid_reported_mutex, NULL);
+    pthread_cond_init(&terminate_thread_cond, NULL);
+    pthread_cond_init(&tid_reported_cond, NULL);
+    terminate_thread = false;
+  }
+
+  ~TidReporterArgument() {
+    pthread_mutex_destroy(&terminate_thread_mutex);
+    pthread_mutex_destroy(&tid_reported_mutex);
+    pthread_cond_destroy(&terminate_thread_cond);
+    pthread_cond_destroy(&tid_reported_cond);
+  }
+
+  pid_t reported_tid;
+  // For signaling to spawned threads that they should terminate.
+  pthread_cond_t terminate_thread_cond;
+  pthread_mutex_t terminate_thread_mutex;
+  bool terminate_thread;
+  // For signaling to main thread that a child thread has reported its tid.
+  pthread_cond_t tid_reported_cond;
+  pthread_mutex_t tid_reported_mutex;
+
+ private:
+  // Disallow evil constructors
+  TidReporterArgument(const TidReporterArgument &);
+  void operator=(const TidReporterArgument &);
+};
+
+class ThreadListerTest : public ::testing::Test {
+ protected:
+  virtual void SetUp() {
+    pthread_t pthread_id;
+    pid_t tid;
+    for (uptr i = 0; i < kThreadCount; i++) {
+      SpawnTidReporter(&pthread_id, &tid);
+      pthread_ids_.push_back(pthread_id);
+      tids_.push_back(tid);
+    }
+  }
+
+  virtual void TearDown() {
+    pthread_mutex_lock(&thread_arg.terminate_thread_mutex);
+    thread_arg.terminate_thread = true;
+    pthread_cond_broadcast(&thread_arg.terminate_thread_cond);
+    pthread_mutex_unlock(&thread_arg.terminate_thread_mutex);
+    for (uptr i = 0; i < pthread_ids_.size(); i++)
+      pthread_join(pthread_ids_[i], NULL);
+  }
+
+  void SpawnTidReporter(pthread_t *pthread_id, pid_t *tid);
+
+  static const uptr kThreadCount = 20;
+
+  std::vector<pthread_t> pthread_ids_;
+  std::vector<pid_t> tids_;
+
+  TidReporterArgument thread_arg;
+};
+
+// Writes its TID once to reported_tid and waits until signaled to terminate.
+void *TidReporterThread(void *argument) {
+  TidReporterArgument *arg = reinterpret_cast<TidReporterArgument *>(argument);
+  pthread_mutex_lock(&arg->tid_reported_mutex);
+  arg->reported_tid = GetTid();
+  pthread_cond_broadcast(&arg->tid_reported_cond);
+  pthread_mutex_unlock(&arg->tid_reported_mutex);
+
+  pthread_mutex_lock(&arg->terminate_thread_mutex);
+  while (!arg->terminate_thread)
+    pthread_cond_wait(&arg->terminate_thread_cond,
+                      &arg->terminate_thread_mutex);
+  pthread_mutex_unlock(&arg->terminate_thread_mutex);
+  return NULL;
+}
+
+void ThreadListerTest::SpawnTidReporter(pthread_t *pthread_id,
+                                        pid_t *tid) {
+  pthread_mutex_lock(&thread_arg.tid_reported_mutex);
+  thread_arg.reported_tid = -1;
+  ASSERT_EQ(0, pthread_create(pthread_id, NULL,
+                              TidReporterThread,
+                              &thread_arg));
+  while (thread_arg.reported_tid == -1)
+    pthread_cond_wait(&thread_arg.tid_reported_cond,
+                      &thread_arg.tid_reported_mutex);
+  pthread_mutex_unlock(&thread_arg.tid_reported_mutex);
+  *tid = thread_arg.reported_tid;
+}
+
+static std::vector<pid_t> ReadTidsToVector(ThreadLister *thread_lister) {
+  std::vector<pid_t> listed_tids;
+  pid_t tid;
+  while ((tid = thread_lister->GetNextTID()) >= 0)
+    listed_tids.push_back(tid);
+  EXPECT_FALSE(thread_lister->error());
+  return listed_tids;
+}
+
+static bool Includes(std::vector<pid_t> first, std::vector<pid_t> second) {
+  std::sort(first.begin(), first.end());
+  std::sort(second.begin(), second.end());
+  return std::includes(first.begin(), first.end(),
+                       second.begin(), second.end());
+}
+
+static bool HasElement(std::vector<pid_t> vector, pid_t element) {
+  return std::find(vector.begin(), vector.end(), element) != vector.end();
+}
+
+// ThreadLister's output should include the current thread's TID and the TID of
+// every thread we spawned.
+TEST_F(ThreadListerTest, ThreadListerSeesAllSpawnedThreads) {
+  pid_t self_tid = GetTid();
+  ThreadLister thread_lister(getpid());
+  std::vector<pid_t> listed_tids = ReadTidsToVector(&thread_lister);
+  ASSERT_TRUE(HasElement(listed_tids, self_tid));
+  ASSERT_TRUE(Includes(listed_tids, tids_));
+}
+
+// Calling Reset() should not cause ThreadLister to forget any threads it's
+// supposed to know about.
+TEST_F(ThreadListerTest, ResetDoesNotForgetThreads) {
+  ThreadLister thread_lister(getpid());
+
+  // Run the loop body twice, because Reset() might behave differently if called
+  // on a freshly created object.
+  for (uptr i = 0; i < 2; i++) {
+    thread_lister.Reset();
+    std::vector<pid_t> listed_tids = ReadTidsToVector(&thread_lister);
+    ASSERT_TRUE(Includes(listed_tids, tids_));
+  }
+}
+
+// If new threads have spawned during ThreadLister object's lifetime, calling
+// Reset() should cause ThreadLister to recognize their existence.
+TEST_F(ThreadListerTest, ResetMakesNewThreadsKnown) {
+  ThreadLister thread_lister(getpid());
+  std::vector<pid_t> threads_before_extra = ReadTidsToVector(&thread_lister);
+
+  pthread_t extra_pthread_id;
+  pid_t extra_tid;
+  SpawnTidReporter(&extra_pthread_id, &extra_tid);
+  // Register the new thread so it gets terminated in TearDown().
+  pthread_ids_.push_back(extra_pthread_id);
+
+  // It would be very bizarre if the new TID had been listed before we even
+  // spawned that thread, but it would also cause a false success in this test,
+  // so better check for that.
+  ASSERT_FALSE(HasElement(threads_before_extra, extra_tid));
+
+  thread_lister.Reset();
+
+  std::vector<pid_t> threads_after_extra = ReadTidsToVector(&thread_lister);
+  ASSERT_TRUE(HasElement(threads_after_extra, extra_tid));
+}
+
+}  // namespace __sanitizer
+
+#endif  // __linux__
diff --git a/lib/sanitizer_common/tests/sanitizer_mutex_test.cc b/lib/sanitizer_common/tests/sanitizer_mutex_test.cc
index 6bb2ae2..1dc9bef 100644
--- a/lib/sanitizer_common/tests/sanitizer_mutex_test.cc
+++ b/lib/sanitizer_common/tests/sanitizer_mutex_test.cc
@@ -92,6 +92,12 @@
   return 0;
 }
 
+template<typename MutexType>
+static void check_locked(MutexType *mtx) {
+  GenericScopedLock<MutexType> l(mtx);
+  mtx->CheckLocked();
+}
+
 TEST(SanitizerCommon, SpinMutex) {
   SpinMutex mtx;
   mtx.Init();
@@ -123,6 +129,7 @@
     pthread_create(&threads[i], 0, lock_thread<BlockingMutex>, &data);
   for (int i = 0; i < kThreads; i++)
     pthread_join(threads[i], 0);
+  check_locked(mtx);
 }
 
 }  // namespace __sanitizer
diff --git a/lib/sanitizer_common/tests/sanitizer_stoptheworld_test.cc b/lib/sanitizer_common/tests/sanitizer_stoptheworld_test.cc
new file mode 100644
index 0000000..29cbc9a
--- /dev/null
+++ b/lib/sanitizer_common/tests/sanitizer_stoptheworld_test.cc
@@ -0,0 +1,193 @@
+//===-- sanitizer_stoptheworld_test.cc ------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Tests for sanitizer_stoptheworld.h
+//
+//===----------------------------------------------------------------------===//
+
+#ifdef __linux__
+
+#include "sanitizer_common/sanitizer_stoptheworld.h"
+#include "gtest/gtest.h"
+
+#include "sanitizer_common/sanitizer_libc.h"
+#include "sanitizer_common/sanitizer_common.h"
+
+#include <pthread.h>
+#include <sched.h>
+
+namespace __sanitizer {
+
+static pthread_mutex_t incrementer_thread_exit_mutex;
+
+struct CallbackArgument {
+  volatile int counter;
+  volatile bool threads_stopped;
+  volatile bool callback_executed;
+  CallbackArgument()
+    : counter(0),
+      threads_stopped(false),
+      callback_executed(false) {}
+};
+
+void *IncrementerThread(void *argument) {
+  CallbackArgument *callback_argument = (CallbackArgument *)argument;
+  while (true) {
+    __sync_fetch_and_add(&callback_argument->counter, 1);
+    if (pthread_mutex_trylock(&incrementer_thread_exit_mutex) == 0) {
+      pthread_mutex_unlock(&incrementer_thread_exit_mutex);
+      return NULL;
+    } else {
+      sched_yield();
+    }
+  }
+}
+
+// This callback checks that IncrementerThread is suspended at the time of its
+// execution.
+void Callback(const SuspendedThreadsList &suspended_threads_list,
+              void *argument) {
+  CallbackArgument *callback_argument = (CallbackArgument *)argument;
+  callback_argument->callback_executed = true;
+  int counter_at_init = __sync_fetch_and_add(&callback_argument->counter, 0);
+  for (uptr i = 0; i < 1000; i++) {
+    sched_yield();
+    if (__sync_fetch_and_add(&callback_argument->counter, 0) !=
+          counter_at_init) {
+      callback_argument->threads_stopped = false;
+      return;
+    }
+  }
+  callback_argument->threads_stopped = true;
+}
+
+TEST(StopTheWorld, SuspendThreadsSimple) {
+  pthread_mutex_init(&incrementer_thread_exit_mutex, NULL);
+  CallbackArgument argument;
+  pthread_t thread_id;
+  int pthread_create_result;
+  pthread_mutex_lock(&incrementer_thread_exit_mutex);
+  pthread_create_result = pthread_create(&thread_id, NULL, IncrementerThread,
+                                         &argument);
+  ASSERT_EQ(0, pthread_create_result);
+  StopTheWorld(&Callback, &argument);
+  pthread_mutex_unlock(&incrementer_thread_exit_mutex);
+  EXPECT_TRUE(argument.callback_executed);
+  EXPECT_TRUE(argument.threads_stopped);
+  // argument is on stack, so we have to wait for the incrementer thread to
+  // terminate before we can return from this function.
+  ASSERT_EQ(0, pthread_join(thread_id, NULL));
+  pthread_mutex_destroy(&incrementer_thread_exit_mutex);
+}
+
+// A more comprehensive test where we spawn a bunch of threads while executing
+// StopTheWorld in parallel.
+static const uptr kThreadCount = 50;
+static const uptr kStopWorldAfter = 10; // let this many threads spawn first
+
+static pthread_mutex_t advanced_incrementer_thread_exit_mutex;
+
+struct AdvancedCallbackArgument {
+  volatile uptr thread_index;
+  volatile int counters[kThreadCount];
+  pthread_t thread_ids[kThreadCount];
+  volatile bool threads_stopped;
+  volatile bool callback_executed;
+  volatile bool fatal_error;
+  AdvancedCallbackArgument()
+    : thread_index(0),
+      threads_stopped(false),
+      callback_executed(false),
+      fatal_error(false) {}
+};
+
+void *AdvancedIncrementerThread(void *argument) {
+  AdvancedCallbackArgument *callback_argument =
+      (AdvancedCallbackArgument *)argument;
+  uptr this_thread_index = __sync_fetch_and_add(
+      &callback_argument->thread_index, 1);
+  // Spawn the next thread.
+  int pthread_create_result;
+  if (this_thread_index + 1 < kThreadCount) {
+    pthread_create_result =
+        pthread_create(&callback_argument->thread_ids[this_thread_index + 1],
+                       NULL, AdvancedIncrementerThread, argument);
+    // Cannot use ASSERT_EQ in non-void-returning functions. If there's a
+    // problem, defer failing to the main thread.
+    if (pthread_create_result != 0) {
+      callback_argument->fatal_error = true;
+      __sync_fetch_and_add(&callback_argument->thread_index,
+                           kThreadCount - callback_argument->thread_index);
+    }
+  }
+  // Do the actual work.
+  while (true) {
+    __sync_fetch_and_add(&callback_argument->counters[this_thread_index], 1);
+    if (pthread_mutex_trylock(&advanced_incrementer_thread_exit_mutex) == 0) {
+      pthread_mutex_unlock(&advanced_incrementer_thread_exit_mutex);
+      return NULL;
+    } else {
+      sched_yield();
+    }
+  }
+}
+
+void AdvancedCallback(const SuspendedThreadsList &suspended_threads_list,
+                             void *argument) {
+  AdvancedCallbackArgument *callback_argument =
+      (AdvancedCallbackArgument *)argument;
+  callback_argument->callback_executed = true;
+
+  int counters_at_init[kThreadCount];
+  for (uptr j = 0; j < kThreadCount; j++)
+    counters_at_init[j] = __sync_fetch_and_add(&callback_argument->counters[j],
+                                               0);
+  for (uptr i = 0; i < 10; i++) {
+    sched_yield();
+    for (uptr j = 0; j < kThreadCount; j++)
+      if (__sync_fetch_and_add(&callback_argument->counters[j], 0) !=
+            counters_at_init[j]) {
+        callback_argument->threads_stopped = false;
+        return;
+      }
+  }
+  callback_argument->threads_stopped = true;
+}
+
+TEST(StopTheWorld, SuspendThreadsAdvanced) {
+  pthread_mutex_init(&advanced_incrementer_thread_exit_mutex, NULL);
+  AdvancedCallbackArgument argument;
+
+  pthread_mutex_lock(&advanced_incrementer_thread_exit_mutex);
+  int pthread_create_result;
+  pthread_create_result = pthread_create(&argument.thread_ids[0], NULL,
+                                         AdvancedIncrementerThread,
+                                         &argument);
+  ASSERT_EQ(0, pthread_create_result);
+  // Wait for several threads to spawn before proceeding.
+  while (__sync_fetch_and_add(&argument.thread_index, 0) < kStopWorldAfter)
+    sched_yield();
+  StopTheWorld(&AdvancedCallback, &argument);
+  EXPECT_TRUE(argument.callback_executed);
+  EXPECT_TRUE(argument.threads_stopped);
+
+  // Wait for all threads to spawn before we start terminating them.
+  while (__sync_fetch_and_add(&argument.thread_index, 0) < kThreadCount)
+    sched_yield();
+  ASSERT_FALSE(argument.fatal_error); // fails if any pthread_create failed
+  // Signal the threads to terminate.
+  pthread_mutex_unlock(&advanced_incrementer_thread_exit_mutex);
+  for (uptr i = 0; i < kThreadCount; i++)
+    ASSERT_EQ(0, pthread_join(argument.thread_ids[i], NULL));
+  pthread_mutex_destroy(&advanced_incrementer_thread_exit_mutex);
+}
+
+}  // namespace __sanitizer
+
+#endif  // __linux__
diff --git a/lib/sanitizer_common/tests/sanitizer_stoptheworld_testlib.cc b/lib/sanitizer_common/tests/sanitizer_stoptheworld_testlib.cc
new file mode 100644
index 0000000..74749d4
--- /dev/null
+++ b/lib/sanitizer_common/tests/sanitizer_stoptheworld_testlib.cc
@@ -0,0 +1,52 @@
+//===-- sanitizer_stoptheworld_testlib.cc ---------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Dynamic library to test StopTheWorld functionality.
+// When loaded with LD_PRELOAD, it will periodically suspend all threads.
+//===----------------------------------------------------------------------===//
+/* Usage:
+clang++ -fno-exceptions -g -fPIC -I. \
+ sanitizer_common/tests/sanitizer_stoptheworld_testlib.cc \
+ sanitizer_common/sanitizer_*.cc -shared -lpthread -o teststoptheworld.so
+LD_PRELOAD=`pwd`/teststoptheworld.so /your/app
+*/
+
+#ifdef __linux__
+
+#include <dlfcn.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <pthread.h>
+#include <unistd.h>
+
+#include "sanitizer_common/sanitizer_stoptheworld.h"
+
+namespace {
+const uptr kSuspendDuration = 3;
+const uptr kRunDuration = 3;
+
+void Callback(const SuspendedThreadsList &suspended_threads_list,
+              void *argument) {
+  sleep(kSuspendDuration);
+}
+
+void *SuspenderThread(void *argument) {
+  while (true) {
+    sleep(kRunDuration);
+    StopTheWorld(Callback, NULL);
+  }
+  return NULL;
+}
+
+__attribute__((constructor)) void StopTheWorldTestLibConstructor(void) {
+  pthread_t thread_id;
+  pthread_create(&thread_id, NULL, SuspenderThread, NULL);
+}
+}  // namespace
+
+#endif  // __linux__
diff --git a/lib/sanitizer_common/tests/sanitizer_test_utils.h b/lib/sanitizer_common/tests/sanitizer_test_utils.h
index 4f8bed9..a770d0f 100644
--- a/lib/sanitizer_common/tests/sanitizer_test_utils.h
+++ b/lib/sanitizer_common/tests/sanitizer_test_utils.h
@@ -36,12 +36,14 @@
 #define __has_feature(x) 0
 #endif
 
-#if __has_feature(address_sanitizer) || defined(__SANITIZE_ADDRESS__)
-# define ATTRIBUTE_NO_ADDRESS_SAFETY_ANALYSIS \
+#ifndef ATTRIBUTE_NO_ADDRESS_SAFETY_ANALYSIS
+# if __has_feature(address_sanitizer) || defined(__SANITIZE_ADDRESS__)
+#  define ATTRIBUTE_NO_ADDRESS_SAFETY_ANALYSIS \
     __attribute__((no_sanitize_address))
-#else
-# define ATTRIBUTE_NO_ADDRESS_SAFETY_ANALYSIS
-#endif
+# else
+#  define ATTRIBUTE_NO_ADDRESS_SAFETY_ANALYSIS
+# endif
+#endif  // ATTRIBUTE_NO_ADDRESS_SAFETY_ANALYSIS
 
 #if __LP64__ || defined(_WIN64)
 #  define SANITIZER_WORDSIZE 64
diff --git a/lib/sanitizer_common/tests/sanitizer_thread_registry_test.cc b/lib/sanitizer_common/tests/sanitizer_thread_registry_test.cc
new file mode 100644
index 0000000..e080403
--- /dev/null
+++ b/lib/sanitizer_common/tests/sanitizer_thread_registry_test.cc
@@ -0,0 +1,230 @@
+//===-- sanitizer_thread_registry_test.cc ---------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of shared sanitizer runtime.
+//
+//===----------------------------------------------------------------------===//
+#include "sanitizer_common/sanitizer_thread_registry.h"
+#include "gtest/gtest.h"
+
+#include <vector>
+
+namespace __sanitizer {
+
+static BlockingMutex tctx_allocator_lock(LINKER_INITIALIZED);
+static LowLevelAllocator tctx_allocator;
+
+template<typename TCTX>
+static ThreadContextBase *GetThreadContext(u32 tid) {
+  BlockingMutexLock l(&tctx_allocator_lock);
+  void *mem = tctx_allocator.Allocate(sizeof(TCTX));
+  return new(mem) TCTX(tid);
+}
+
+static const u32 kMaxRegistryThreads = 1000;
+static const u32 kRegistryQuarantine = 2;
+
+static void CheckThreadQuantity(ThreadRegistry *registry, uptr exp_total,
+                                uptr exp_running, uptr exp_alive) {
+  uptr total, running, alive;
+  registry->GetNumberOfThreads(&total, &running, &alive);
+  EXPECT_EQ(exp_total, total);
+  EXPECT_EQ(exp_running, running);
+  EXPECT_EQ(exp_alive, alive);
+}
+
+static bool is_detached(u32 tid) {
+  return (tid % 2 == 0);
+}
+
+static uptr get_uid(u32 tid) {
+  return tid * 2;
+}
+
+static bool HasName(ThreadContextBase *tctx, void *arg) {
+  char *name = (char*)arg;
+  return (tctx->name && 0 == internal_strcmp(tctx->name, name));
+}
+
+static bool HasUid(ThreadContextBase *tctx, void *arg) {
+  uptr uid = (uptr)arg;
+  return (tctx->user_id == uid);
+}
+
+static void MarkUidAsPresent(ThreadContextBase *tctx, void *arg) {
+  bool *arr = (bool*)arg;
+  arr[tctx->tid] = true;
+}
+
+static void TestRegistry(ThreadRegistry *registry, bool has_quarantine) {
+  // Create and start a main thread.
+  EXPECT_EQ(0U, registry->CreateThread(get_uid(0), true, -1, 0));
+  registry->StartThread(0, 0, 0);
+  // Create a bunch of threads.
+  for (u32 i = 1; i <= 10; i++) {
+    EXPECT_EQ(i, registry->CreateThread(get_uid(i), is_detached(i), 0, 0));
+  }
+  CheckThreadQuantity(registry, 11, 1, 11);
+  // Start some of them.
+  for (u32 i = 1; i <= 5; i++) {
+    registry->StartThread(i, 0, 0);
+  }
+  CheckThreadQuantity(registry, 11, 6, 11);
+  // Finish, create and start more threads.
+  for (u32 i = 1; i <= 5; i++) {
+    registry->FinishThread(i);
+    if (!is_detached(i))
+      registry->JoinThread(i, 0);
+  }
+  for (u32 i = 6; i <= 10; i++) {
+    registry->StartThread(i, 0, 0);
+  }
+  std::vector<u32> new_tids;
+  for (u32 i = 11; i <= 15; i++) {
+    new_tids.push_back(
+        registry->CreateThread(get_uid(i), is_detached(i), 0, 0));
+  }
+  ASSERT_LE(kRegistryQuarantine, 5U);
+  u32 exp_total = 16 - (has_quarantine ? 5 - kRegistryQuarantine  : 0);
+  CheckThreadQuantity(registry, exp_total, 6, 11);
+  // Test SetThreadName and FindThread.
+  registry->SetThreadName(6, "six");
+  registry->SetThreadName(7, "seven");
+  EXPECT_EQ(7U, registry->FindThread(HasName, (void*)"seven"));
+  EXPECT_EQ(ThreadRegistry::kUnknownTid,
+            registry->FindThread(HasName, (void*)"none"));
+  EXPECT_EQ(0U, registry->FindThread(HasUid, (void*)get_uid(0)));
+  EXPECT_EQ(10U, registry->FindThread(HasUid, (void*)get_uid(10)));
+  EXPECT_EQ(ThreadRegistry::kUnknownTid,
+            registry->FindThread(HasUid, (void*)0x1234));
+  // Detach and finish and join remaining threads.
+  for (u32 i = 6; i <= 10; i++) {
+    registry->DetachThread(i);
+    registry->FinishThread(i);
+  }
+  for (u32 i = 0; i < new_tids.size(); i++) {
+    u32 tid = new_tids[i];
+    registry->StartThread(tid, 0, 0);
+    registry->DetachThread(tid);
+    registry->FinishThread(tid);
+  }
+  CheckThreadQuantity(registry, exp_total, 1, 1);
+  // Test methods that require the caller to hold a ThreadRegistryLock.
+  bool has_tid[16];
+  internal_memset(&has_tid[0], 0, sizeof(has_tid));
+  {
+    ThreadRegistryLock l(registry);
+    registry->RunCallbackForEachThreadLocked(MarkUidAsPresent, &has_tid[0]);
+  }
+  for (u32 i = 0; i < exp_total; i++) {
+    EXPECT_TRUE(has_tid[i]);
+  }
+  {
+    ThreadRegistryLock l(registry);
+    registry->CheckLocked();
+    ThreadContextBase *main_thread = registry->GetThreadLocked(0);
+    EXPECT_EQ(main_thread, registry->FindThreadContextLocked(
+        HasUid, (void*)get_uid(0)));
+  }
+  EXPECT_EQ(11U, registry->GetMaxAliveThreads());
+}
+
+TEST(SanitizerCommon, ThreadRegistryTest) {
+  ThreadRegistry quarantine_registry(GetThreadContext<ThreadContextBase>,
+                                     kMaxRegistryThreads,
+                                     kRegistryQuarantine);
+  TestRegistry(&quarantine_registry, true);
+
+  ThreadRegistry no_quarantine_registry(GetThreadContext<ThreadContextBase>,
+                                        kMaxRegistryThreads,
+                                        kMaxRegistryThreads);
+  TestRegistry(&no_quarantine_registry, false);
+}
+
+static const int kThreadsPerShard = 20;
+static const int kNumShards = 25;
+
+static int num_created[kNumShards + 1];
+static int num_started[kNumShards + 1];
+static int num_joined[kNumShards + 1];
+
+namespace {
+
+struct RunThreadArgs {
+  ThreadRegistry *registry;
+  uptr shard;  // Shard index; numbering starts from 1.
+};
+
+class TestThreadContext : public ThreadContextBase {
+ public:
+  explicit TestThreadContext(int tid) : ThreadContextBase(tid) {}
+  void OnJoined(void *arg) {
+    uptr shard = (uptr)arg;
+    num_joined[shard]++;
+  }
+  void OnStarted(void *arg) {
+    uptr shard = (uptr)arg;
+    num_started[shard]++;
+  }
+  void OnCreated(void *arg) {
+    uptr shard = (uptr)arg;
+    num_created[shard]++;
+  }
+};
+
+}  // namespace
+
+void *RunThread(void *arg) {
+  RunThreadArgs *args = static_cast<RunThreadArgs*>(arg);
+  std::vector<int> tids;
+  for (int i = 0; i < kThreadsPerShard; i++)
+    tids.push_back(
+        args->registry->CreateThread(0, false, 0, (void*)args->shard));
+  for (int i = 0; i < kThreadsPerShard; i++)
+    args->registry->StartThread(tids[i], 0, (void*)args->shard);
+  for (int i = 0; i < kThreadsPerShard; i++)
+    args->registry->FinishThread(tids[i]);
+  for (int i = 0; i < kThreadsPerShard; i++)
+    args->registry->JoinThread(tids[i], (void*)args->shard);
+  return 0;
+}
+
+static void ThreadedTestRegistry(ThreadRegistry *registry) {
+  // Create and start a main thread.
+  EXPECT_EQ(0U, registry->CreateThread(0, true, -1, 0));
+  registry->StartThread(0, 0, 0);
+  pthread_t threads[kNumShards];
+  RunThreadArgs args[kNumShards];
+  for (int i = 0; i < kNumShards; i++) {
+    args[i].registry = registry;
+    args[i].shard = i + 1;
+    pthread_create(&threads[i], 0, RunThread, &args[i]);
+  }
+  for (int i = 0; i < kNumShards; i++) {
+    pthread_join(threads[i], 0);
+  }
+  // Check that each worker thread created/started/joined the correct
+  // number of registry threads for its shard.
+  EXPECT_EQ(1, num_created[0]);
+  EXPECT_EQ(1, num_started[0]);
+  EXPECT_EQ(0, num_joined[0]);
+  for (int i = 1; i <= kNumShards; i++) {
+    EXPECT_EQ(kThreadsPerShard, num_created[i]);
+    EXPECT_EQ(kThreadsPerShard, num_started[i]);
+    EXPECT_EQ(kThreadsPerShard, num_joined[i]);
+  }
+}
+
+TEST(SanitizerCommon, ThreadRegistryThreadedTest) {
+  ThreadRegistry registry(GetThreadContext<TestThreadContext>,
+                          kThreadsPerShard * kNumShards + 1, 10);
+  ThreadedTestRegistry(&registry);
+}
+
+}  // namespace __sanitizer
diff --git a/lib/tsan/CMakeLists.txt b/lib/tsan/CMakeLists.txt
index 34e3a2e..ef4b610 100644
--- a/lib/tsan/CMakeLists.txt
+++ b/lib/tsan/CMakeLists.txt
@@ -2,7 +2,9 @@
 
 include_directories(..)
 
-set(TSAN_CFLAGS ${SANITIZER_COMMON_CFLAGS})
+set(TSAN_CFLAGS
+  ${SANITIZER_COMMON_CFLAGS}
+  -fno-rtti)
 # FIXME: Add support for compile flags:
 #   -Wframe-larger-than=512,
 #   -Wglobal-constructors,
diff --git a/lib/tsan/Makefile.old b/lib/tsan/Makefile.old
index a492eab..6329bbb 100644
--- a/lib/tsan/Makefile.old
+++ b/lib/tsan/Makefile.old
@@ -1,13 +1,15 @@
 DEBUG=0
 LDFLAGS=-ldl -lpthread -pie
-CXXFLAGS = -fPIE -g -Wall -Werror -DTSAN_DEBUG=$(DEBUG) -DSANITIZER_DEBUG=$(DEBUG)
+CXXFLAGS = -fPIE -fno-rtti -g -Wall -Werror \
+					 -DGTEST_HAS_RTTI=0 -DTSAN_DEBUG=$(DEBUG) -DSANITIZER_DEBUG=$(DEBUG)
+CLANG=clang
 # Silence warnings that Clang produces for gtest code.
 # Use -Wno-attributes so that gcc doesn't complain about unknown warning types.
 CXXFLAGS += -Wno-attributes
 ifeq ($(DEBUG), 0)
 	CXXFLAGS += -O3
 endif
-ifeq ($(CXX), clang++)
+ifeq ($(CXX), $(CLANG)++)
   CXXFLAGS+= -Wno-unused-private-field -Wno-static-in-inline -Wgnu
 endif
 
@@ -54,16 +56,16 @@
 
 run: all
 	(ulimit -s 8192; ./tsan_test)
-	./lit_tests/test_output.sh
+	CC=$(CLANG) CXX=$(CLANG)++ ./lit_tests/test_output.sh
 
 presubmit:
 	../sanitizer_common/scripts/check_lint.sh
 	# Debug build with clang.
 	$(MAKE) -f Makefile.old clean
-	$(MAKE) -f Makefile.old run DEBUG=1 -j 16 CC=clang CXX=clang++
+	$(MAKE) -f Makefile.old run DEBUG=1 -j 16 CC=$(CLANG) CXX=$(CLANG)++
 	# Release build with clang.
 	$(MAKE) -f Makefile.old clean
-	$(MAKE) -f Makefile.old run DEBUG=0 -j 16 CC=clang CXX=clang++
+	$(MAKE) -f Makefile.old run DEBUG=0 -j 16 CC=$(CLANG) CXX=$(CLANG)++
 	# Debug build with gcc
 	$(MAKE) -f Makefile.old clean
 	$(MAKE) -f Makefile.old run DEBUG=1 -j 16 CC=gcc CXX=g++
diff --git a/lib/tsan/go/buildgo.sh b/lib/tsan/go/buildgo.sh
index a153afd..aba03f9 100755
--- a/lib/tsan/go/buildgo.sh
+++ b/lib/tsan/go/buildgo.sh
@@ -20,6 +20,7 @@
 	../../sanitizer_common/sanitizer_flags.cc
 	../../sanitizer_common/sanitizer_libc.cc
 	../../sanitizer_common/sanitizer_printf.cc
+	../../sanitizer_common/sanitizer_thread_registry.cc
 "
 
 if [ "`uname -a | grep Linux`" != "" ]; then
@@ -60,7 +61,7 @@
 	cat $F >> gotsan.cc
 done
 
-FLAGS=" -I../rtl -I../.. -I../../sanitizer_common -I../../../include -m64 -Wall -Werror -fno-exceptions -DTSAN_GO -DSANITIZER_GO -DTSAN_SHADOW_COUNT=4 $OSCFLAGS"
+FLAGS=" -I../rtl -I../.. -I../../sanitizer_common -I../../../include -m64 -Wall -Werror -fno-exceptions -fno-rtti -DTSAN_GO -DSANITIZER_GO -DTSAN_SHADOW_COUNT=4 $OSCFLAGS"
 if [ "$DEBUG" == "" ]; then
 	FLAGS+=" -DTSAN_DEBUG=0 -O3 -fomit-frame-pointer"
 else
diff --git a/lib/tsan/lit_tests/test_output.sh b/lib/tsan/lit_tests/test_output.sh
index d21c9a7..f4f8a5c 100755
--- a/lib/tsan/lit_tests/test_output.sh
+++ b/lib/tsan/lit_tests/test_output.sh
@@ -6,9 +6,10 @@
 ROOTDIR=$(dirname $0)/..
 BLACKLIST=$ROOTDIR/lit_tests/Helpers/blacklist.txt
 
-# Assuming clang is in path.
-CC=clang
-CXX=clang++
+# Assume clang and clang++ are in path.
+: ${CC:=clang}
+: ${CXX:=clang++}
+: ${FILECHECK:=FileCheck}
 
 # TODO: add testing for all of -O0...-O3
 CFLAGS="-fsanitize=thread -fsanitize-blacklist=$BLACKLIST -fPIE -O1 -g -fno-builtin -Wall"
@@ -23,7 +24,7 @@
   $COMPILER $SRC $CFLAGS -c -o $OBJ
   $COMPILER $OBJ $LDFLAGS -o $EXE
   RES=$($EXE 2>&1 || true)
-  printf "%s\n" "$RES" | FileCheck $SRC
+  printf "%s\n" "$RES" | $FILECHECK $SRC
   if [ "$3" == "" ]; then
     rm -f $EXE $OBJ
   fi
diff --git a/lib/tsan/rtl/Makefile.old b/lib/tsan/rtl/Makefile.old
index f522ec6..cf4121e 100644
--- a/lib/tsan/rtl/Makefile.old
+++ b/lib/tsan/rtl/Makefile.old
@@ -10,7 +10,7 @@
 INTERCEPTION=../../interception
 COMMON=../../sanitizer_common
 INCLUDES= -I../.. -I../../../include
-EXTRA_CXXFLAGS=-fno-exceptions
+EXTRA_CXXFLAGS=-fno-exceptions -fno-rtti
 NO_SYSROOT=--sysroot=.
 CXXFLAGS+=$(EXTRA_CXXFLAGS)
 CXXFLAGS+=$(CFLAGS)
diff --git a/lib/tsan/rtl/tsan_defs.h b/lib/tsan/rtl/tsan_defs.h
index 5c5ab9e..27f1db3 100644
--- a/lib/tsan/rtl/tsan_defs.h
+++ b/lib/tsan/rtl/tsan_defs.h
@@ -156,7 +156,6 @@
 MD5Hash md5_hash(const void *data, uptr size);
 
 struct ThreadState;
-struct ThreadContext;
 struct Context;
 struct ReportStack;
 class ReportDesc;
diff --git a/lib/tsan/rtl/tsan_interceptors.cc b/lib/tsan/rtl/tsan_interceptors.cc
index 25dfe9c..8bc2762 100644
--- a/lib/tsan/rtl/tsan_interceptors.cc
+++ b/lib/tsan/rtl/tsan_interceptors.cc
@@ -15,6 +15,7 @@
 
 #include "sanitizer_common/sanitizer_atomic.h"
 #include "sanitizer_common/sanitizer_libc.h"
+#include "sanitizer_common/sanitizer_platform_limits_posix.h"
 #include "sanitizer_common/sanitizer_placement_new.h"
 #include "sanitizer_common/sanitizer_stacktrace.h"
 #include "interception/interception.h"
@@ -86,11 +87,6 @@
 
 #define errno (*__errno_location())
 
-union pthread_attr_t {
-  char size[kPthreadAttrSize];
-  void *align;
-};
-
 struct sigaction_t {
   union {
     sighandler_t sa_handler;
@@ -602,7 +598,7 @@
   if (res != MAP_FAILED) {
     if (fd > 0)
       FdAccess(thr, pc, fd);
-    MemoryResetRange(thr, pc, (uptr)res, sz);
+    MemoryRangeImitateWrite(thr, pc, (uptr)res, sz);
   }
   return res;
 }
@@ -616,13 +612,14 @@
   if (res != MAP_FAILED) {
     if (fd > 0)
       FdAccess(thr, pc, fd);
-    MemoryResetRange(thr, pc, (uptr)res, sz);
+    MemoryRangeImitateWrite(thr, pc, (uptr)res, sz);
   }
   return res;
 }
 
 TSAN_INTERCEPTOR(int, munmap, void *addr, long_t sz) {
   SCOPED_TSAN_INTERCEPTOR(munmap, addr, sz);
+  DontNeedShadowFor((uptr)addr, sz);
   int res = REAL(munmap)(addr, sz);
   return res;
 }
@@ -734,7 +731,7 @@
 TSAN_INTERCEPTOR(int, pthread_create,
     void *th, void *attr, void *(*callback)(void*), void * param) {
   SCOPED_TSAN_INTERCEPTOR(pthread_create, th, attr, callback, param);
-  pthread_attr_t myattr;
+  __sanitizer_pthread_attr_t myattr;
   if (attr == 0) {
     pthread_attr_init(&myattr);
     attr = &myattr;
@@ -1804,7 +1801,7 @@
               (uptr)sigactions[sig].sa_sigaction :
               (uptr)sigactions[sig].sa_handler;
           stack.Init(&pc, 1);
-          Lock l(&ctx->thread_mtx);
+          ThreadRegistryLock l(ctx->thread_registry);
           ScopedReport rep(ReportTypeErrnoInSignal);
           if (!IsFiredSuppression(ctx, rep, stack)) {
             rep.AddStack(&stack);
diff --git a/lib/tsan/rtl/tsan_mman.cc b/lib/tsan/rtl/tsan_mman.cc
index fb32483..35cf43d 100644
--- a/lib/tsan/rtl/tsan_mman.cc
+++ b/lib/tsan/rtl/tsan_mman.cc
@@ -29,6 +29,41 @@
 
 namespace __tsan {
 
+COMPILER_CHECK(sizeof(MBlock) == 16);
+
+void MBlock::Lock() {
+  atomic_uintptr_t *a = reinterpret_cast<atomic_uintptr_t*>(this);
+  uptr v = atomic_load(a, memory_order_relaxed);
+  for (int iter = 0;; iter++) {
+    if (v & 1) {
+      if (iter < 10)
+        proc_yield(20);
+      else
+        internal_sched_yield();
+      v = atomic_load(a, memory_order_relaxed);
+      continue;
+    }
+    if (atomic_compare_exchange_weak(a, &v, v | 1, memory_order_acquire))
+      break;
+  }
+}
+
+void MBlock::Unlock() {
+  atomic_uintptr_t *a = reinterpret_cast<atomic_uintptr_t*>(this);
+  uptr v = atomic_load(a, memory_order_relaxed);
+  DCHECK(v & 1);
+  atomic_store(a, v & ~1, memory_order_relaxed);
+}
+
+struct MapUnmapCallback {
+  void OnMap(uptr p, uptr size) const { }
+  void OnUnmap(uptr p, uptr size) const {
+    // We are about to unmap a chunk of user memory.
+    // Mark the corresponding shadow memory as not needed.
+    DontNeedShadowFor(p, size);
+  }
+};
+
 static char allocator_placeholder[sizeof(Allocator)] ALIGNED(64);
 Allocator *allocator() {
   return reinterpret_cast<Allocator*>(&allocator_placeholder);
@@ -56,7 +91,7 @@
   Context *ctx = CTX();
   StackTrace stack;
   stack.ObtainCurrent(thr, pc);
-  Lock l(&ctx->thread_mtx);
+  ThreadRegistryLock l(ctx->thread_registry);
   ScopedReport rep(ReportTypeSignalUnsafe);
   if (!IsFiredSuppression(ctx, rep, stack)) {
     rep.AddStack(&stack);
@@ -70,13 +105,9 @@
   if (p == 0)
     return 0;
   MBlock *b = new(allocator()->GetMetaData(p)) MBlock;
-  b->size = sz;
-  b->head = 0;
-  b->alloc_tid = thr->unique_id;
-  b->alloc_stack_id = CurrentStackId(thr, pc);
-  if (CTX() && CTX()->initialized) {
+  b->Init(sz, thr->tid, CurrentStackId(thr, pc));
+  if (CTX() && CTX()->initialized)
     MemoryRangeImitateWrite(thr, pc, (uptr)p, sz);
-  }
   DPrintf("#%d: alloc(%zu) = %p\n", thr->tid, sz, p);
   SignalUnsafeCall(thr, pc);
   return p;
@@ -87,9 +118,9 @@
   CHECK_NE(p, (void*)0);
   DPrintf("#%d: free(%p)\n", thr->tid, p);
   MBlock *b = (MBlock*)allocator()->GetMetaData(p);
-  if (b->head)   {
-    Lock l(&b->mtx);
-    for (SyncVar *s = b->head; s;) {
+  if (b->ListHead()) {
+    MBlock::ScopedLock l(b);
+    for (SyncVar *s = b->ListHead(); s;) {
       SyncVar *res = s;
       s = s->next;
       StatInc(thr, StatSyncDestroyed);
@@ -97,12 +128,10 @@
       res->mtx.Unlock();
       DestroyAndFree(res);
     }
-    b->head = 0;
+    b->ListReset();
   }
-  if (CTX() && CTX()->initialized && thr->in_rtl == 1) {
-    MemoryRangeFreed(thr, pc, (uptr)p, b->size);
-  }
-  b->~MBlock();
+  if (CTX() && CTX()->initialized && thr->in_rtl == 1)
+    MemoryRangeFreed(thr, pc, (uptr)p, b->Size());
   allocator()->Deallocate(&thr->alloc_cache, p);
   SignalUnsafeCall(thr, pc);
 }
@@ -118,12 +147,11 @@
       return 0;
     if (p) {
       MBlock *b = user_mblock(thr, p);
-      internal_memcpy(p2, p, min(b->size, sz));
+      internal_memcpy(p2, p, min(b->Size(), sz));
     }
   }
-  if (p) {
+  if (p)
     user_free(thr, pc, p);
-  }
   return p2;
 }
 
@@ -132,7 +160,7 @@
   if (p == 0)
     return 0;
   MBlock *b = (MBlock*)allocator()->GetMetaData(p);
-  return (b) ? b->size : 0;
+  return b ? b->Size() : 0;
 }
 
 MBlock *user_mblock(ThreadState *thr, void *p) {
@@ -223,6 +251,11 @@
   if (p == 0)
     return 0;
   MBlock *b = (MBlock*)allocator()->GetMetaData(p);
-  return b->size;
+  return b->Size();
+}
+
+void __tsan_on_thread_idle() {
+  ThreadState *thr = cur_thread();
+  allocator()->SwallowCache(&thr->alloc_cache);
 }
 }  // extern "C"
diff --git a/lib/tsan/rtl/tsan_platform.h b/lib/tsan/rtl/tsan_platform.h
index 87b41d9..d6b331a 100644
--- a/lib/tsan/rtl/tsan_platform.h
+++ b/lib/tsan/rtl/tsan_platform.h
@@ -132,8 +132,8 @@
 #endif
 }
 
-uptr GetShadowMemoryConsumption();
 void FlushShadowMemory();
+void WriteMemoryProfile(char *buf, uptr buf_size);
 
 const char *InitializePlatform();
 void FinalizePlatform();
@@ -148,7 +148,6 @@
 // Says whether the addr relates to a global var.
 // Guesses with high probability, may yield both false positives and negatives.
 bool IsGlobalVar(uptr addr);
-uptr GetTlsSize();
 void GetThreadStackAndTls(bool main, uptr *stk_addr, uptr *stk_size,
                           uptr *tls_addr, uptr *tls_size);
 int ExtractResolvFDs(void *state, int *fds, int nfd);
diff --git a/lib/tsan/rtl/tsan_platform_linux.cc b/lib/tsan/rtl/tsan_platform_linux.cc
index f756cbc..02a6648 100644
--- a/lib/tsan/rtl/tsan_platform_linux.cc
+++ b/lib/tsan/rtl/tsan_platform_linux.cc
@@ -42,8 +42,10 @@
 #include <dlfcn.h>
 #define __need_res_state
 #include <resolv.h>
+#include <malloc.h>
 
 extern "C" int arch_prctl(int code, __sanitizer::uptr *addr);
+extern "C" struct mallinfo __libc_mallinfo();
 
 namespace __tsan {
 
@@ -68,8 +70,75 @@
 }
 #endif
 
-uptr GetShadowMemoryConsumption() {
-  return 0;
+static bool ishex(char c) {
+  return (c >= '0' && c <= '9')
+      || (c >= 'a' && c <= 'f');
+}
+
+static uptr readhex(const char *p) {
+  uptr v = 0;
+  for (; ishex(p[0]); p++) {
+    if (p[0] >= '0' && p[0] <= '9')
+      v = v * 16 + p[0] - '0';
+    else
+      v = v * 16 + p[0] - 'a' + 10;
+  }
+  return v;
+}
+
+static uptr readdec(const char *p) {
+  uptr v = 0;
+  for (; p[0] >= '0' && p[0] <= '9' ; p++)
+    v = v * 10 + p[0] - '0';
+  return v;
+}
+
+void WriteMemoryProfile(char *buf, uptr buf_size) {
+  char *smaps = 0;
+  uptr smaps_cap = 0;
+  uptr smaps_len = ReadFileToBuffer("/proc/self/smaps",
+      &smaps, &smaps_cap, 64<<20);
+  uptr mem[6] = {};
+  uptr total = 0;
+  uptr start = 0;
+  bool file = false;
+  const char *pos = smaps;
+  while (pos < smaps + smaps_len) {
+    if (ishex(pos[0])) {
+      start = readhex(pos);
+      for (; *pos != '/' && *pos > '\n'; pos++) {}
+      file = *pos == '/';
+    } else if (internal_strncmp(pos, "Rss:", 4) == 0) {
+      for (; *pos < '0' || *pos > '9'; pos++) {}
+      uptr rss = readdec(pos) * 1024;
+      total += rss;
+      start >>= 40;
+      if (start < 0x10)  // shadow
+        mem[0] += rss;
+      else if (start >= 0x20 && start < 0x30)  // compat modules
+        mem[file ? 1 : 2] += rss;
+      else if (start >= 0x7e)  // modules
+        mem[file ? 1 : 2] += rss;
+      else if (start >= 0x60 && start < 0x62)  // traces
+        mem[3] += rss;
+      else if (start >= 0x7d && start < 0x7e)  // heap
+        mem[4] += rss;
+      else  // other
+        mem[5] += rss;
+    }
+    while (*pos++ != '\n') {}
+  }
+  UnmapOrDie(smaps, smaps_cap);
+  char *buf_pos = buf;
+  char *buf_end = buf + buf_size;
+  buf_pos += internal_snprintf(buf_pos, buf_end - buf_pos,
+      "RSS %zd MB: shadow:%zd file:%zd mmap:%zd trace:%zd heap:%zd other:%zd\n",
+      total >> 20, mem[0] >> 20, mem[1] >> 20, mem[2] >> 20,
+      mem[3] >> 20, mem[4] >> 20, mem[5] >> 20);
+  struct mallinfo mi = __libc_mallinfo();
+  buf_pos += internal_snprintf(buf_pos, buf_end - buf_pos,
+      "mallinfo: arena=%d mmap=%d fordblks=%d keepcost=%d\n",
+      mi.arena >> 20, mi.hblkhd >> 20, mi.fordblks >> 20, mi.keepcost >> 20);
 }
 
 void FlushShadowMemory() {
@@ -129,7 +198,8 @@
   MemoryMappingLayout proc_maps;
   uptr start, end;
   if (proc_maps.Next(&start, &end,
-                     /*offset*/0, /*filename*/0, /*filename_size*/0)) {
+                     /*offset*/0, /*filename*/0, /*filename_size*/0,
+                     /*protection*/0)) {
     if ((u64)start < kLinuxAppMemBeg) {
       Printf("FATAL: ThreadSanitizer can not mmap the shadow memory ("
              "something is mapped at 0x%zx < 0x%zx)\n",
@@ -146,7 +216,8 @@
   uptr start, end, offset;
   char name[128];
   bool prev_is_data = false;
-  while (proc_maps.Next(&start, &end, &offset, name, ARRAY_SIZE(name))) {
+  while (proc_maps.Next(&start, &end, &offset, name, ARRAY_SIZE(name),
+                        /*protection*/ 0)) {
     DPrintf("%p-%p %p %s\n", start, end, offset, name);
     bool is_data = offset != 0 && name[0] != 0;
     // BSS may get merged with [heap] in /proc/self/maps. This is not very
@@ -165,27 +236,6 @@
   CHECK_LT((uptr)&g_data_start, g_data_end);
 }
 
-static uptr g_tls_size;
-
-#ifdef __i386__
-# define INTERNAL_FUNCTION __attribute__((regparm(3), stdcall))
-#else
-# define INTERNAL_FUNCTION
-#endif
-
-static int InitTlsSize() {
-  typedef void (*get_tls_func)(size_t*, size_t*) INTERNAL_FUNCTION;
-  get_tls_func get_tls;
-  void *get_tls_static_info_ptr = dlsym(RTLD_NEXT, "_dl_get_tls_static_info");
-  CHECK_EQ(sizeof(get_tls), sizeof(get_tls_static_info_ptr));
-  internal_memcpy(&get_tls, &get_tls_static_info_ptr,
-                  sizeof(get_tls_static_info_ptr));
-  CHECK_NE(get_tls, 0);
-  size_t tls_size = 0;
-  size_t tls_align = 0;
-  get_tls(&tls_size, &tls_align);
-  return tls_size;
-}
 #endif  // #ifndef TSAN_GO
 
 static rlim_t getlim(int res) {
@@ -240,7 +290,7 @@
 
 #ifndef TSAN_GO
   CheckPIE();
-  g_tls_size = (uptr)InitTlsSize();
+  InitTlsSize();
   InitDataSeg();
 #endif
   return GetEnv(kTsanOptionsEnv);
@@ -250,20 +300,12 @@
   fflush(0);
 }
 
-uptr GetTlsSize() {
-#ifndef TSAN_GO
-  return g_tls_size;
-#else
-  return 0;
-#endif
-}
-
 void GetThreadStackAndTls(bool main, uptr *stk_addr, uptr *stk_size,
                           uptr *tls_addr, uptr *tls_size) {
 #ifndef TSAN_GO
   arch_prctl(ARCH_GET_FS, tls_addr);
-  *tls_addr -= g_tls_size;
-  *tls_size = g_tls_size;
+  *tls_size = GetTlsSize();
+  *tls_addr -= *tls_size;
 
   uptr stack_top, stack_bottom;
   GetThreadStackTopAndBottom(main, &stack_top, &stack_bottom);
diff --git a/lib/tsan/rtl/tsan_platform_mac.cc b/lib/tsan/rtl/tsan_platform_mac.cc
index fb00742..d5caea3 100644
--- a/lib/tsan/rtl/tsan_platform_mac.cc
+++ b/lib/tsan/rtl/tsan_platform_mac.cc
@@ -89,10 +89,6 @@
   fflush(0);
 }
 
-uptr GetTlsSize() {
-  return 0;
-}
-
 void GetThreadStackAndTls(bool main, uptr *stk_addr, uptr *stk_size,
                           uptr *tls_addr, uptr *tls_size) {
   *stk_addr = 0;
diff --git a/lib/tsan/rtl/tsan_platform_windows.cc b/lib/tsan/rtl/tsan_platform_windows.cc
index 60a59c7..9bd3958 100644
--- a/lib/tsan/rtl/tsan_platform_windows.cc
+++ b/lib/tsan/rtl/tsan_platform_windows.cc
@@ -41,10 +41,6 @@
   fflush(0);
 }
 
-uptr GetTlsSize() {
-  return 0;
-}
-
 void GetThreadStackAndTls(bool main, uptr *stk_addr, uptr *stk_size,
                           uptr *tls_addr, uptr *tls_size) {
   *stk_addr = 0;
diff --git a/lib/tsan/rtl/tsan_report.cc b/lib/tsan/rtl/tsan_report.cc
index f52f456..b394c40 100644
--- a/lib/tsan/rtl/tsan_report.cc
+++ b/lib/tsan/rtl/tsan_report.cc
@@ -138,7 +138,7 @@
   if (rt->id == 0)  // Little sense in describing the main thread.
     return;
   Printf("  Thread T%d", rt->id);
-  if (rt->name)
+  if (rt->name && rt->name[0] != '\0')
     Printf(" '%s'", rt->name);
   char thrbuf[kThreadBufSize];
   Printf(" (tid=%zu, %s) created by %s",
diff --git a/lib/tsan/rtl/tsan_rtl.cc b/lib/tsan/rtl/tsan_rtl.cc
index e533a9c..37f65eb 100644
--- a/lib/tsan/rtl/tsan_rtl.cc
+++ b/lib/tsan/rtl/tsan_rtl.cc
@@ -47,12 +47,28 @@
   return ctx;
 }
 
+static char thread_registry_placeholder[sizeof(ThreadRegistry)];
+
+static ThreadContextBase *CreateThreadContext(u32 tid) {
+  // Map thread trace when context is created.
+  MapThreadTrace(GetThreadTrace(tid), TraceSize() * sizeof(Event));
+  void *mem = MmapOrDie(sizeof(ThreadContext), "ThreadContext");
+  return new(mem) ThreadContext(tid);
+}
+
+#ifndef TSAN_GO
+static const u32 kThreadQuarantineSize = 16;
+#else
+static const u32 kThreadQuarantineSize = 64;
+#endif
+
 Context::Context()
   : initialized()
   , report_mtx(MutexTypeReport, StatMtxReport)
   , nreported()
   , nmissed_expected()
-  , thread_mtx(MutexTypeThreads, StatMtxThreads)
+  , thread_registry(new(thread_registry_placeholder) ThreadRegistry(
+      CreateThreadContext, kMaxTid, kThreadQuarantineSize))
   , racy_stacks(MBlockRacyStacks)
   , racy_addresses(MBlockRacyAddresses)
   , fired_suppressions(MBlockRacyAddresses) {
@@ -77,61 +93,19 @@
   , tls_size(tls_size) {
 }
 
-ThreadContext::ThreadContext(int tid)
-  : tid(tid)
-  , unique_id()
-  , os_id()
-  , user_id()
-  , thr()
-  , status(ThreadStatusInvalid)
-  , detached()
-  , reuse_count()
-  , epoch0()
-  , epoch1()
-  , dead_info()
-  , dead_next()
-  , name() {
-}
-
-static void WriteMemoryProfile(char *buf, uptr buf_size, int num) {
-  uptr shadow = GetShadowMemoryConsumption();
-
-  int nthread = 0;
-  int nlivethread = 0;
-  uptr threadmem = 0;
-  {
-    Lock l(&ctx->thread_mtx);
-    for (unsigned i = 0; i < kMaxTid; i++) {
-      ThreadContext *tctx = ctx->threads[i];
-      if (tctx == 0)
-        continue;
-      nthread += 1;
-      threadmem += sizeof(ThreadContext);
-      if (tctx->status != ThreadStatusRunning)
-        continue;
-      nlivethread += 1;
-      threadmem += sizeof(ThreadState);
-    }
-  }
-
-  uptr nsync = 0;
-  uptr syncmem = CTX()->synctab.GetMemoryConsumption(&nsync);
-
-  internal_snprintf(buf, buf_size, "%d: shadow=%zuMB"
-                                   " thread=%zuMB(total=%d/live=%d)"
-                                   " sync=%zuMB(cnt=%zu)\n",
-    num,
-    shadow >> 20,
-    threadmem >> 20, nthread, nlivethread,
-    syncmem >> 20, nsync);
-}
-
 static void MemoryProfileThread(void *arg) {
   ScopedInRtl in_rtl;
   fd_t fd = (fd_t)(uptr)arg;
+  Context *ctx = CTX();
   for (int i = 0; ; i++) {
     InternalScopedBuffer<char> buf(4096);
-    WriteMemoryProfile(buf.data(), buf.size(), i);
+    uptr n_threads;
+    uptr n_running_threads;
+    ctx->thread_registry->GetNumberOfThreads(&n_threads, &n_running_threads);
+    internal_snprintf(buf.data(), buf.size(), "%d: nthr=%d nlive=%d\n",
+        i, n_threads, n_running_threads);
+    internal_write(fd, buf.data(), internal_strlen(buf.data()));
+    WriteMemoryProfile(buf.data(), buf.size());
     internal_write(fd, buf.data(), internal_strlen(buf.data()));
     SleepForSeconds(1);
   }
@@ -151,6 +125,12 @@
   internal_start_thread(&MemoryProfileThread, (void*)(uptr)fd);
 }
 
+void DontNeedShadowFor(uptr addr, uptr size) {
+  uptr shadow_beg = MemToShadow(addr);
+  uptr shadow_end = MemToShadow(addr + size);
+  FlushUnneededShadowMemory(shadow_beg, shadow_end - shadow_beg);
+}
+
 static void MemoryFlushThread(void *arg) {
   ScopedInRtl in_rtl;
   for (int i = 0; ; i++) {
@@ -203,9 +183,6 @@
 #ifndef TSAN_GO
   InitializeShadowMemory();
 #endif
-  ctx->dead_list_size = 0;
-  ctx->dead_list_head = 0;
-  ctx->dead_list_tail = 0;
   InitializeFlags(&ctx->flags, env);
   // Setup correct file descriptor for error reports.
   if (internal_strcmp(flags()->log_path, "stdout") == 0)
@@ -234,7 +211,6 @@
                GetPid());
 
   // Initialize thread 0.
-  ctx->thread_seq = 0;
   int tid = ThreadCreate(thr, 0, 0, true);
   CHECK_EQ(tid, 0);
   ThreadStart(thr, tid, GetPid());
@@ -494,6 +470,8 @@
 
 static void MemoryRangeSet(ThreadState *thr, uptr pc, uptr addr, uptr size,
                            u64 val) {
+  (void)thr;
+  (void)pc;
   if (size == 0)
     return;
   // FIXME: fix me.
@@ -510,23 +488,42 @@
   // let it just crash as usual.
   if (!IsAppMem(addr) || !IsAppMem(addr + size - 1))
     return;
-  (void)thr;
-  (void)pc;
-  // Some programs mmap like hundreds of GBs but actually used a small part.
-  // So, it's better to report a false positive on the memory
-  // then to hang here senselessly.
-  const uptr kMaxResetSize = 4ull*1024*1024*1024;
-  if (size > kMaxResetSize)
-    size = kMaxResetSize;
+  // Don't want to touch lots of shadow memory.
+  // If a program maps 10MB stack, there is no need to reset the whole range.
   size = (size + (kShadowCell - 1)) & ~(kShadowCell - 1);
-  u64 *p = (u64*)MemToShadow(addr);
-  CHECK(IsShadowMem((uptr)p));
-  CHECK(IsShadowMem((uptr)(p + size * kShadowCnt / kShadowCell - 1)));
-  // FIXME: may overwrite a part outside the region
-  for (uptr i = 0; i < size * kShadowCnt / kShadowCell;) {
-    p[i++] = val;
-    for (uptr j = 1; j < kShadowCnt; j++)
-      p[i++] = 0;
+  if (size < 64*1024) {
+    u64 *p = (u64*)MemToShadow(addr);
+    CHECK(IsShadowMem((uptr)p));
+    CHECK(IsShadowMem((uptr)(p + size * kShadowCnt / kShadowCell - 1)));
+    // FIXME: may overwrite a part outside the region
+    for (uptr i = 0; i < size / kShadowCell * kShadowCnt;) {
+      p[i++] = val;
+      for (uptr j = 1; j < kShadowCnt; j++)
+        p[i++] = 0;
+    }
+  } else {
+    // The region is big, reset only beginning and end.
+    const uptr kPageSize = 4096;
+    u64 *begin = (u64*)MemToShadow(addr);
+    u64 *end = begin + size / kShadowCell * kShadowCnt;
+    u64 *p = begin;
+    // Set at least first kPageSize/2 to page boundary.
+    while ((p < begin + kPageSize / kShadowSize / 2) || ((uptr)p % kPageSize)) {
+      *p++ = val;
+      for (uptr j = 1; j < kShadowCnt; j++)
+        *p++ = 0;
+    }
+    // Reset middle part.
+    u64 *p1 = p;
+    p = RoundDown(end, kPageSize);
+    UnmapOrDie((void*)p1, (uptr)p - (uptr)p1);
+    MmapFixedNoReserve((uptr)p1, (uptr)p - (uptr)p1);
+    // Set the ending.
+    while (p < end) {
+      *p++ = val;
+      for (uptr j = 1; j < kShadowCnt; j++)
+        *p++ = 0;
+    }
   }
 }
 
@@ -535,6 +532,11 @@
 }
 
 void MemoryRangeFreed(ThreadState *thr, uptr pc, uptr addr, uptr size) {
+  // Processing more than 1k (4k of shadow) is expensive,
+  // can cause excessive memory consumption (user does not necessarily touch
+  // the whole range) and most likely unnecessary.
+  if (size > 1024)
+    size = 1024;
   CHECK_EQ(thr->is_freeing, false);
   thr->is_freeing = true;
   MemoryAccessRange(thr, pc, addr, size, true);
diff --git a/lib/tsan/rtl/tsan_rtl.h b/lib/tsan/rtl/tsan_rtl.h
index 6452636..053f24a 100644
--- a/lib/tsan/rtl/tsan_rtl.h
+++ b/lib/tsan/rtl/tsan_rtl.h
@@ -26,8 +26,9 @@
 #ifndef TSAN_RTL_H
 #define TSAN_RTL_H
 
-#include "sanitizer_common/sanitizer_common.h"
 #include "sanitizer_common/sanitizer_allocator.h"
+#include "sanitizer_common/sanitizer_common.h"
+#include "sanitizer_common/sanitizer_thread_registry.h"
 #include "tsan_clock.h"
 #include "tsan_defs.h"
 #include "tsan_flags.h"
@@ -46,15 +47,73 @@
 
 // Descriptor of user's memory block.
 struct MBlock {
-  Mutex mtx;
-  uptr size;
-  u32 alloc_tid;
-  u32 alloc_stack_id;
-  SyncVar *head;
+  /*
+  u64 mtx : 1;  // must be first
+  u64 lst : 44;
+  u64 stk : 31;  // on word boundary
+  u64 tid : kTidBits;
+  u64 siz : 128 - 1 - 31 - 44 - kTidBits;  // 39
+  */
+  u64 raw[2];
 
-  MBlock()
-    : mtx(MutexTypeMBlock, StatMtxMBlock) {
+  void Init(uptr siz, u32 tid, u32 stk) {
+    raw[0] = raw[1] = 0;
+    raw[1] |= (u64)siz << ((1 + 44 + 31 + kTidBits) % 64);
+    raw[1] |= (u64)tid << ((1 + 44 + 31) % 64);
+    raw[0] |= (u64)stk << (1 + 44);
+    raw[1] |= (u64)stk >> (64 - 44 - 1);
+    DCHECK_EQ(Size(), siz);
+    DCHECK_EQ(Tid(), tid);
+    DCHECK_EQ(StackId(), stk);
   }
+
+  u32 Tid() const {
+    return GetLsb(raw[1] >> ((1 + 44 + 31) % 64), kTidBits);
+  }
+
+  uptr Size() const {
+    return raw[1] >> ((1 + 31 + 44 + kTidBits) % 64);
+  }
+
+  u32 StackId() const {
+    return (raw[0] >> (1 + 44)) | GetLsb(raw[1] << (64 - 44 - 1), 31);
+  }
+
+  SyncVar *ListHead() const {
+    return (SyncVar*)(GetLsb(raw[0] >> 1, 44) << 3);
+  }
+
+  void ListPush(SyncVar *v) {
+    SyncVar *lst = ListHead();
+    v->next = lst;
+    u64 x = (u64)v ^ (u64)lst;
+    x = (x >> 3) << 1;
+    raw[0] ^= x;
+    DCHECK_EQ(ListHead(), v);
+  }
+
+  SyncVar *ListPop() {
+    SyncVar *lst = ListHead();
+    SyncVar *nxt = lst->next;
+    lst->next = 0;
+    u64 x = (u64)lst ^ (u64)nxt;
+    x = (x >> 3) << 1;
+    raw[0] ^= x;
+    DCHECK_EQ(ListHead(), nxt);
+    return lst;
+  }
+
+  void ListReset() {
+    SyncVar *lst = ListHead();
+    u64 x = (u64)lst;
+    x = (x >> 3) << 1;
+    raw[0] ^= x;
+    DCHECK_EQ(ListHead(), 0);
+  }
+
+  void Lock();
+  void Unlock();
+  typedef GenericScopedLock<MBlock> ScopedLock;
 };
 
 #ifndef TSAN_GO
@@ -65,22 +124,11 @@
 #endif
 const uptr kAllocatorSize  =  0x10000000000ULL;  // 1T.
 
-struct TsanMapUnmapCallback {
-  void OnMap(uptr p, uptr size) const { }
-  void OnUnmap(uptr p, uptr size) const {
-    // We are about to unmap a chunk of user memory.
-    // Mark the corresponding shadow memory as not needed.
-    uptr shadow_beg = MemToShadow(p);
-    uptr shadow_end = MemToShadow(p + size);
-    CHECK(IsAligned(shadow_end|shadow_beg, GetPageSizeCached()));
-    FlushUnneededShadowMemory(shadow_beg, shadow_end - shadow_beg);
-  }
-};
-
+struct MapUnmapCallback;
 typedef SizeClassAllocator64<kAllocatorSpace, kAllocatorSize, sizeof(MBlock),
-    DefaultSizeClassMap> PrimaryAllocator;
+    DefaultSizeClassMap, MapUnmapCallback> PrimaryAllocator;
 typedef SizeClassAllocatorLocalCache<PrimaryAllocator> AllocatorCache;
-typedef LargeMmapAllocator<TsanMapUnmapCallback> SecondaryAllocator;
+typedef LargeMmapAllocator<MapUnmapCallback> SecondaryAllocator;
 typedef CombinedAllocator<PrimaryAllocator, AllocatorCache,
     SecondaryAllocator> Allocator;
 Allocator *allocator();
@@ -410,41 +458,36 @@
 }
 #endif
 
-enum ThreadStatus {
-  ThreadStatusInvalid,   // Non-existent thread, data is invalid.
-  ThreadStatusCreated,   // Created but not yet running.
-  ThreadStatusRunning,   // The thread is currently running.
-  ThreadStatusFinished,  // Joinable thread is finished but not yet joined.
-  ThreadStatusDead       // Joined, but some info (trace) is still alive.
-};
-
 // An info about a thread that is hold for some time after its termination.
 struct ThreadDeadInfo {
   Trace trace;
 };
 
-struct ThreadContext {
-  const int tid;
-  int unique_id;  // Non-rolling thread id.
-  uptr os_id;  // pid
-  uptr user_id;  // Some opaque user thread id (e.g. pthread_t).
+class ThreadContext : public ThreadContextBase {
+ public:
+  explicit ThreadContext(int tid);
+  ~ThreadContext();
   ThreadState *thr;
-  ThreadStatus status;
-  bool detached;
-  int reuse_count;
+#ifdef TSAN_GO
+  StackTrace creation_stack;
+#else
+  u32 creation_stack_id;
+#endif
   SyncClock sync;
   // Epoch at which the thread had started.
   // If we see an event from the thread stamped by an older epoch,
   // the event is from a dead thread that shared tid with this thread.
   u64 epoch0;
   u64 epoch1;
-  StackTrace creation_stack;
-  int creation_tid;
   ThreadDeadInfo *dead_info;
-  ThreadContext *dead_next;  // In dead thread list.
-  char *name;  // As annotated by user.
 
-  explicit ThreadContext(int tid);
+  // Override superclass callbacks.
+  void OnDead();
+  void OnJoined(void *arg);
+  void OnFinished();
+  void OnStarted(void *arg);
+  void OnCreated(void *arg);
+  void OnReset(void *arg);
 };
 
 struct RacyStacks {
@@ -479,15 +522,7 @@
   int nreported;
   int nmissed_expected;
 
-  Mutex thread_mtx;
-  unsigned thread_seq;
-  unsigned unique_thread_seq;
-  int alive_threads;
-  int max_alive_threads;
-  ThreadContext *threads[kMaxTid];
-  int dead_list_size;
-  ThreadContext* dead_list_head;
-  ThreadContext* dead_list_tail;
+  ThreadRegistry *thread_registry;
 
   Vector<RacyStacks> racy_stacks;
   Vector<RacyAddress> racy_addresses;
@@ -543,9 +578,14 @@
   if (kCollectStats)
     thr->stat[typ] += n;
 }
+void ALWAYS_INLINE INLINE StatSet(ThreadState *thr, StatType typ, u64 n) {
+  if (kCollectStats)
+    thr->stat[typ] = n;
+}
 
 void MapShadow(uptr addr, uptr size);
 void MapThreadTrace(uptr addr, uptr size);
+void DontNeedShadowFor(uptr addr, uptr size);
 void InitializeShadowMemory();
 void InitializeInterceptors();
 void InitializeDynamicAnnotations();
diff --git a/lib/tsan/rtl/tsan_rtl_mutex.cc b/lib/tsan/rtl/tsan_rtl_mutex.cc
index a07f6a2..1f3c7ac 100644
--- a/lib/tsan/rtl/tsan_rtl_mutex.cc
+++ b/lib/tsan/rtl/tsan_rtl_mutex.cc
@@ -63,7 +63,7 @@
       && s->owner_tid != SyncVar::kInvalidTid
       && !s->is_broken) {
     s->is_broken = true;
-    Lock l(&ctx->thread_mtx);
+    ThreadRegistryLock l(ctx->thread_registry);
     ScopedReport rep(ReportTypeMutexDestroyLocked);
     rep.AddMutex(s);
     StackTrace trace;
@@ -248,18 +248,19 @@
   s->mtx.ReadUnlock();
 }
 
+static void UpdateClockCallback(ThreadContextBase *tctx_base, void *arg) {
+  ThreadState *thr = reinterpret_cast<ThreadState*>(arg);
+  ThreadContext *tctx = static_cast<ThreadContext*>(tctx_base);
+  if (tctx->status == ThreadStatusRunning)
+    thr->clock.set(tctx->tid, tctx->thr->fast_state.epoch());
+  else
+    thr->clock.set(tctx->tid, tctx->epoch1);
+}
+
 void AcquireGlobal(ThreadState *thr, uptr pc) {
-  Context *ctx = CTX();
-  Lock l(&ctx->thread_mtx);
-  for (unsigned i = 0; i < kMaxTid; i++) {
-    ThreadContext *tctx = ctx->threads[i];
-    if (tctx == 0)
-      continue;
-    if (tctx->status == ThreadStatusRunning)
-      thr->clock.set(i, tctx->thr->fast_state.epoch());
-    else
-      thr->clock.set(i, tctx->epoch1);
-  }
+  ThreadRegistryLock l(CTX()->thread_registry);
+  CTX()->thread_registry->RunCallbackForEachThreadLocked(
+      UpdateClockCallback, thr);
 }
 
 void Release(ThreadState *thr, uptr pc, uptr addr) {
@@ -283,19 +284,20 @@
 }
 
 #ifndef TSAN_GO
+static void UpdateSleepClockCallback(ThreadContextBase *tctx_base, void *arg) {
+  ThreadState *thr = reinterpret_cast<ThreadState*>(arg);
+  ThreadContext *tctx = static_cast<ThreadContext*>(tctx_base);
+  if (tctx->status == ThreadStatusRunning)
+    thr->last_sleep_clock.set(tctx->tid, tctx->thr->fast_state.epoch());
+  else
+    thr->last_sleep_clock.set(tctx->tid, tctx->epoch1);
+}
+
 void AfterSleep(ThreadState *thr, uptr pc) {
-  Context *ctx = CTX();
   thr->last_sleep_stack_id = CurrentStackId(thr, pc);
-  Lock l(&ctx->thread_mtx);
-  for (unsigned i = 0; i < kMaxTid; i++) {
-    ThreadContext *tctx = ctx->threads[i];
-    if (tctx == 0)
-      continue;
-    if (tctx->status == ThreadStatusRunning)
-      thr->last_sleep_clock.set(i, tctx->thr->fast_state.epoch());
-    else
-      thr->last_sleep_clock.set(i, tctx->epoch1);
-  }
+  ThreadRegistryLock l(CTX()->thread_registry);
+  CTX()->thread_registry->RunCallbackForEachThreadLocked(
+      UpdateSleepClockCallback, thr);
 }
 #endif
 
diff --git a/lib/tsan/rtl/tsan_rtl_report.cc b/lib/tsan/rtl/tsan_rtl_report.cc
index de9a0e9..2df4234 100644
--- a/lib/tsan/rtl/tsan_rtl_report.cc
+++ b/lib/tsan/rtl/tsan_rtl_report.cc
@@ -125,7 +125,7 @@
 
 ScopedReport::ScopedReport(ReportType typ) {
   ctx_ = CTX();
-  ctx_->thread_mtx.CheckLocked();
+  ctx_->thread_registry->CheckLocked();
   void *mem = internal_alloc(MBlockReport, sizeof(ReportDesc));
   rep_ = new(mem) ReportDesc;
   rep_->typ = typ;
@@ -177,7 +177,7 @@
 
 void ScopedReport::AddThread(const ThreadContext *tctx) {
   for (uptr i = 0; i < rep_->threads.Size(); i++) {
-    if (rep_->threads[i]->id == tctx->tid)
+    if ((u32)rep_->threads[i]->id == tctx->tid)
       return;
   }
   void *mem = internal_alloc(MBlockReportThread, sizeof(ReportThread));
@@ -187,42 +187,65 @@
   rt->pid = tctx->os_id;
   rt->running = (tctx->status == ThreadStatusRunning);
   rt->name = tctx->name ? internal_strdup(tctx->name) : 0;
-  rt->parent_tid = tctx->creation_tid;
+  rt->parent_tid = tctx->parent_tid;
+  rt->stack = 0;
+#ifdef TSAN_GO
   rt->stack = SymbolizeStack(tctx->creation_stack);
+#else
+  uptr ssz = 0;
+  const uptr *stack = StackDepotGet(tctx->creation_stack_id, &ssz);
+  if (stack) {
+    StackTrace trace;
+    trace.Init(stack, ssz);
+    rt->stack = SymbolizeStack(trace);
+  }
+#endif
 }
 
 #ifndef TSAN_GO
-static ThreadContext *FindThread(int unique_id) {
+static ThreadContext *FindThreadByUidLocked(int unique_id) {
   Context *ctx = CTX();
-  ctx->thread_mtx.CheckLocked();
+  ctx->thread_registry->CheckLocked();
   for (unsigned i = 0; i < kMaxTid; i++) {
-    ThreadContext *tctx = ctx->threads[i];
-    if (tctx && tctx->unique_id == unique_id) {
+    ThreadContext *tctx = static_cast<ThreadContext*>(
+        ctx->thread_registry->GetThreadLocked(i));
+    if (tctx && tctx->unique_id == (u32)unique_id) {
       return tctx;
     }
   }
   return 0;
 }
 
+static ThreadContext *FindThreadByTidLocked(int tid) {
+  Context *ctx = CTX();
+  ctx->thread_registry->CheckLocked();
+  return static_cast<ThreadContext*>(
+      ctx->thread_registry->GetThreadLocked(tid));
+}
+
+static bool IsInStackOrTls(ThreadContextBase *tctx_base, void *arg) {
+  uptr addr = (uptr)arg;
+  ThreadContext *tctx = static_cast<ThreadContext*>(tctx_base);
+  if (tctx->status != ThreadStatusRunning)
+    return false;
+  ThreadState *thr = tctx->thr;
+  CHECK(thr);
+  return ((addr >= thr->stk_addr && addr < thr->stk_addr + thr->stk_size) ||
+          (addr >= thr->tls_addr && addr < thr->tls_addr + thr->tls_size));
+}
+
 ThreadContext *IsThreadStackOrTls(uptr addr, bool *is_stack) {
   Context *ctx = CTX();
-  ctx->thread_mtx.CheckLocked();
-  for (unsigned i = 0; i < kMaxTid; i++) {
-    ThreadContext *tctx = ctx->threads[i];
-    if (tctx == 0 || tctx->status != ThreadStatusRunning)
-      continue;
-    ThreadState *thr = tctx->thr;
-    CHECK(thr);
-    if (addr >= thr->stk_addr && addr < thr->stk_addr + thr->stk_size) {
-      *is_stack = true;
-      return tctx;
-    }
-    if (addr >= thr->tls_addr && addr < thr->tls_addr + thr->tls_size) {
-      *is_stack = false;
-      return tctx;
-    }
-  }
-  return 0;
+  ctx->thread_registry->CheckLocked();
+  ThreadContext *tctx = static_cast<ThreadContext*>(
+      ctx->thread_registry->FindThreadContextLocked(IsInStackOrTls,
+                                                    (void*)addr));
+  if (!tctx)
+    return 0;
+  ThreadState *thr = tctx->thr;
+  CHECK(thr);
+  *is_stack = (addr >= thr->stk_addr && addr < thr->stk_addr + thr->stk_size);
+  return tctx;
 }
 #endif
 
@@ -236,7 +259,16 @@
   rep_->mutexes.PushBack(rm);
   rm->id = s->uid;
   rm->destroyed = false;
-  rm->stack = SymbolizeStack(s->creation_stack);
+  rm->stack = 0;
+#ifndef TSAN_GO
+  uptr ssz = 0;
+  const uptr *stack = StackDepotGet(s->creation_stack_id, &ssz);
+  if (stack) {
+    StackTrace trace;
+    trace.Init(stack, ssz);
+    rm->stack = SymbolizeStack(trace);
+  }
+#endif
 }
 
 void ScopedReport::AddMutex(u64 id) {
@@ -274,27 +306,27 @@
       trace.Init(stack, ssz);
       loc->stack = SymbolizeStack(trace);
     }
-    ThreadContext *tctx = FindThread(creat_tid);
+    ThreadContext *tctx = FindThreadByUidLocked(creat_tid);
     if (tctx)
       AddThread(tctx);
     return;
   }
   if (allocator()->PointerIsMine((void*)addr)) {
     MBlock *b = user_mblock(0, (void*)addr);
-    ThreadContext *tctx = FindThread(b->alloc_tid);
+    ThreadContext *tctx = FindThreadByTidLocked(b->Tid());
     void *mem = internal_alloc(MBlockReportLoc, sizeof(ReportLocation));
     ReportLocation *loc = new(mem) ReportLocation();
     rep_->locs.PushBack(loc);
     loc->type = ReportLocationHeap;
     loc->addr = (uptr)allocator()->GetBlockBegin((void*)addr);
-    loc->size = b->size;
-    loc->tid = tctx ? tctx->tid : b->alloc_tid;
+    loc->size = b->Size();
+    loc->tid = tctx ? tctx->tid : b->Tid();
     loc->name = 0;
     loc->file = 0;
     loc->line = 0;
     loc->stack = 0;
     uptr ssz = 0;
-    const uptr *stack = StackDepotGet(b->alloc_stack_id, &ssz);
+    const uptr *stack = StackDepotGet(b->StackId(), &ssz);
     if (stack) {
       StackTrace trace;
       trace.Init(stack, ssz);
@@ -341,7 +373,10 @@
   // This function restores stack trace and mutex set for the thread/epoch.
   // It does so by getting stack trace and mutex set at the beginning of
   // trace part, and then replaying the trace till the given epoch.
-  ThreadContext *tctx = CTX()->threads[tid];
+  Context *ctx = CTX();
+  ctx->thread_registry->CheckLocked();
+  ThreadContext *tctx = static_cast<ThreadContext*>(
+      ctx->thread_registry->GetThreadLocked(tid));
   if (tctx == 0)
     return;
   Trace* trace = 0;
@@ -585,7 +620,7 @@
   }
 
   Context *ctx = CTX();
-  Lock l0(&ctx->thread_mtx);
+  ThreadRegistryLock l0(ctx->thread_registry);
 
   ScopedReport rep(freed ? ReportTypeUseAfterFree : ReportTypeRace);
   const uptr kMop = 2;
@@ -613,7 +648,8 @@
 
   for (uptr i = 0; i < kMop; i++) {
     FastState s(thr->racy_state[i]);
-    ThreadContext *tctx = ctx->threads[s.tid()];
+    ThreadContext *tctx = static_cast<ThreadContext*>(
+        ctx->thread_registry->GetThreadLocked(s.tid()));
     if (s.epoch() < tctx->epoch0 || s.epoch() > tctx->epoch1)
       continue;
     rep.AddThread(tctx);
diff --git a/lib/tsan/rtl/tsan_rtl_thread.cc b/lib/tsan/rtl/tsan_rtl_thread.cc
index f25fb41..72b9f1a 100644
--- a/lib/tsan/rtl/tsan_rtl_thread.cc
+++ b/lib/tsan/rtl/tsan_rtl_thread.cc
@@ -20,13 +20,138 @@
 
 namespace __tsan {
 
+// ThreadContext implementation.
+
+ThreadContext::ThreadContext(int tid)
+  : ThreadContextBase(tid)
+  , thr()
+  , sync()
+  , epoch0()
+  , epoch1()
+  , dead_info() {
+}
+
 #ifndef TSAN_GO
-const int kThreadQuarantineSize = 16;
-#else
-const int kThreadQuarantineSize = 64;
+ThreadContext::~ThreadContext() {
+}
 #endif
 
-static void MaybeReportThreadLeak(ThreadContext *tctx) {
+void ThreadContext::OnDead() {
+  sync.Reset();
+}
+
+void ThreadContext::OnJoined(void *arg) {
+  ThreadState *caller_thr = static_cast<ThreadState *>(arg);
+  caller_thr->clock.acquire(&sync);
+  StatInc(caller_thr, StatSyncAcquire);
+}
+
+struct OnCreatedArgs {
+  ThreadState *thr;
+  uptr pc;
+};
+
+void ThreadContext::OnCreated(void *arg) {
+  thr = 0;
+  if (tid == 0)
+    return;
+  OnCreatedArgs *args = static_cast<OnCreatedArgs *>(arg);
+  args->thr->fast_state.IncrementEpoch();
+  // Can't increment epoch w/o writing to the trace as well.
+  TraceAddEvent(args->thr, args->thr->fast_state, EventTypeMop, 0);
+  args->thr->clock.set(args->thr->tid, args->thr->fast_state.epoch());
+  args->thr->fast_synch_epoch = args->thr->fast_state.epoch();
+  args->thr->clock.release(&sync);
+  StatInc(args->thr, StatSyncRelease);
+#ifdef TSAN_GO
+  creation_stack.ObtainCurrent(args->thr, args->pc);
+#else
+  creation_stack_id = CurrentStackId(args->thr, args->pc);
+#endif
+  if (reuse_count == 0)
+    StatInc(args->thr, StatThreadMaxTid);
+}
+
+void ThreadContext::OnReset(void *arg) {
+  OnCreatedArgs *args = static_cast<OnCreatedArgs *>(arg);
+  StatInc(args->thr, StatThreadReuse);
+  sync.Reset();
+  DestroyAndFree(dead_info);
+}
+
+struct OnStartedArgs {
+  ThreadState *thr;
+  uptr stk_addr;
+  uptr stk_size;
+  uptr tls_addr;
+  uptr tls_size;
+};
+
+void ThreadContext::OnStarted(void *arg) {
+  OnStartedArgs *args = static_cast<OnStartedArgs*>(arg);
+  thr = args->thr;
+  // RoundUp so that one trace part does not contain events
+  // from different threads.
+  epoch0 = RoundUp(epoch1 + 1, kTracePartSize);
+  epoch1 = (u64)-1;
+  new(thr) ThreadState(CTX(), tid, unique_id,
+      epoch0, args->stk_addr, args->stk_size, args->tls_addr, args->tls_size);
+#ifdef TSAN_GO
+  // Setup dynamic shadow stack.
+  const int kInitStackSize = 8;
+  args->thr->shadow_stack = (uptr*)internal_alloc(MBlockShadowStack,
+      kInitStackSize * sizeof(uptr));
+  args->thr->shadow_stack_pos = thr->shadow_stack;
+  args->thr->shadow_stack_end = thr->shadow_stack + kInitStackSize;
+#endif
+#ifndef TSAN_GO
+  AllocatorThreadStart(args->thr);
+#endif
+  thr = args->thr;
+  thr->fast_synch_epoch = epoch0;
+  thr->clock.set(tid, epoch0);
+  thr->clock.acquire(&sync);
+  thr->fast_state.SetHistorySize(flags()->history_size);
+  const uptr trace = (epoch0 / kTracePartSize) % TraceParts();
+  thr->trace.headers[trace].epoch0 = epoch0;
+  StatInc(thr, StatSyncAcquire);
+  DPrintf("#%d: ThreadStart epoch=%zu stk_addr=%zx stk_size=%zx "
+          "tls_addr=%zx tls_size=%zx\n",
+          tid, (uptr)epoch0, args->stk_addr, args->stk_size,
+          args->tls_addr, args->tls_size);
+  thr->is_alive = true;
+}
+
+void ThreadContext::OnFinished() {
+  if (!detached) {
+    thr->fast_state.IncrementEpoch();
+    // Can't increment epoch w/o writing to the trace as well.
+    TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0);
+    thr->clock.set(thr->tid, thr->fast_state.epoch());
+    thr->fast_synch_epoch = thr->fast_state.epoch();
+    thr->clock.release(&sync);
+    StatInc(thr, StatSyncRelease);
+  }
+  // Save info about the thread.
+  dead_info = new(internal_alloc(MBlockDeadInfo, sizeof(ThreadDeadInfo)))
+      ThreadDeadInfo();
+  for (uptr i = 0; i < TraceParts(); i++) {
+    dead_info->trace.headers[i].epoch0 = thr->trace.headers[i].epoch0;
+    dead_info->trace.headers[i].stack0.CopyFrom(
+        thr->trace.headers[i].stack0);
+  }
+  epoch1 = thr->fast_state.epoch();
+
+#ifndef TSAN_GO
+  AllocatorThreadFinish(thr);
+#endif
+  thr->~ThreadState();
+  StatAggregate(CTX()->stat, thr->stat);
+  thr = 0;
+}
+
+static void MaybeReportThreadLeak(ThreadContextBase *tctx_base, void *unused) {
+  ThreadContext *tctx = static_cast<ThreadContext*>(tctx_base);
   if (tctx->detached)
     return;
   if (tctx->status != ThreadStatusCreated
@@ -42,122 +167,27 @@
   CHECK_GT(thr->in_rtl, 0);
   if (!flags()->report_thread_leaks)
     return;
-  Context *ctx = CTX();
-  Lock l(&ctx->thread_mtx);
-  for (unsigned i = 0; i < kMaxTid; i++) {
-    ThreadContext *tctx = ctx->threads[i];
-    if (tctx == 0)
-      continue;
-    MaybeReportThreadLeak(tctx);
-  }
+  ThreadRegistryLock l(CTX()->thread_registry);
+  CTX()->thread_registry->RunCallbackForEachThreadLocked(
+      MaybeReportThreadLeak, 0);
 }
 
 int ThreadCount(ThreadState *thr) {
   CHECK_GT(thr->in_rtl, 0);
   Context *ctx = CTX();
-  Lock l(&ctx->thread_mtx);
-  int cnt = 0;
-  for (unsigned i = 0; i < kMaxTid; i++) {
-    ThreadContext *tctx = ctx->threads[i];
-    if (tctx == 0)
-      continue;
-    if (tctx->status != ThreadStatusCreated
-        && tctx->status != ThreadStatusRunning)
-      continue;
-    cnt++;
-  }
-  return cnt;
-}
-
-static void ThreadDead(ThreadState *thr, ThreadContext *tctx) {
-  Context *ctx = CTX();
-  CHECK_GT(thr->in_rtl, 0);
-  CHECK(tctx->status == ThreadStatusRunning
-      || tctx->status == ThreadStatusFinished);
-  DPrintf("#%d: ThreadDead uid=%zu\n", thr->tid, tctx->user_id);
-  tctx->status = ThreadStatusDead;
-  tctx->user_id = 0;
-  tctx->sync.Reset();
-
-  // Put to dead list.
-  tctx->dead_next = 0;
-  if (ctx->dead_list_size == 0)
-    ctx->dead_list_head = tctx;
-  else
-    ctx->dead_list_tail->dead_next = tctx;
-  ctx->dead_list_tail = tctx;
-  ctx->dead_list_size++;
+  uptr result;
+  ctx->thread_registry->GetNumberOfThreads(0, 0, &result);
+  return (int)result;
 }
 
 int ThreadCreate(ThreadState *thr, uptr pc, uptr uid, bool detached) {
   CHECK_GT(thr->in_rtl, 0);
-  Context *ctx = CTX();
-  Lock l(&ctx->thread_mtx);
   StatInc(thr, StatThreadCreate);
-  int tid = -1;
-  ThreadContext *tctx = 0;
-  if (ctx->dead_list_size > kThreadQuarantineSize
-      || ctx->thread_seq >= kMaxTid) {
-    // Reusing old thread descriptor and tid.
-    if (ctx->dead_list_size == 0) {
-      Printf("ThreadSanitizer: %d thread limit exceeded. Dying.\n",
-                 kMaxTid);
-      Die();
-    }
-    StatInc(thr, StatThreadReuse);
-    tctx = ctx->dead_list_head;
-    ctx->dead_list_head = tctx->dead_next;
-    ctx->dead_list_size--;
-    if (ctx->dead_list_size == 0) {
-      CHECK_EQ(tctx->dead_next, 0);
-      ctx->dead_list_head = 0;
-    }
-    CHECK_EQ(tctx->status, ThreadStatusDead);
-    tctx->status = ThreadStatusInvalid;
-    tctx->reuse_count++;
-    tctx->sync.Reset();
-    tid = tctx->tid;
-    DestroyAndFree(tctx->dead_info);
-    if (tctx->name) {
-      internal_free(tctx->name);
-      tctx->name = 0;
-    }
-  } else {
-    // Allocating new thread descriptor and tid.
-    StatInc(thr, StatThreadMaxTid);
-    tid = ctx->thread_seq++;
-    void *mem = internal_alloc(MBlockThreadContex, sizeof(ThreadContext));
-    tctx = new(mem) ThreadContext(tid);
-    ctx->threads[tid] = tctx;
-    MapThreadTrace(GetThreadTrace(tid), TraceSize() * sizeof(Event));
-  }
-  CHECK_NE(tctx, 0);
-  CHECK_GE(tid, 0);
-  CHECK_LT(tid, kMaxTid);
+  Context *ctx = CTX();
+  OnCreatedArgs args = { thr, pc };
+  int tid = ctx->thread_registry->CreateThread(uid, detached, thr->tid, &args);
   DPrintf("#%d: ThreadCreate tid=%d uid=%zu\n", thr->tid, tid, uid);
-  CHECK_EQ(tctx->status, ThreadStatusInvalid);
-  ctx->alive_threads++;
-  if (ctx->max_alive_threads < ctx->alive_threads) {
-    ctx->max_alive_threads++;
-    CHECK_EQ(ctx->max_alive_threads, ctx->alive_threads);
-    StatInc(thr, StatThreadMaxAlive);
-  }
-  tctx->status = ThreadStatusCreated;
-  tctx->thr = 0;
-  tctx->user_id = uid;
-  tctx->unique_id = ctx->unique_thread_seq++;
-  tctx->detached = detached;
-  if (tid) {
-    thr->fast_state.IncrementEpoch();
-    // Can't increment epoch w/o writing to the trace as well.
-    TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0);
-    thr->clock.set(thr->tid, thr->fast_state.epoch());
-    thr->fast_synch_epoch = thr->fast_state.epoch();
-    thr->clock.release(&tctx->sync);
-    StatInc(thr, StatSyncRelease);
-    tctx->creation_stack.ObtainCurrent(thr, pc);
-    tctx->creation_tid = thr->tid;
-  }
+  StatSet(thr, StatThreadMaxAlive, ctx->thread_registry->GetMaxAliveThreads());
   return tid;
 }
 
@@ -170,9 +200,8 @@
   GetThreadStackAndTls(tid == 0, &stk_addr, &stk_size, &tls_addr, &tls_size);
 
   if (tid) {
-    if (stk_addr && stk_size) {
-      MemoryResetRange(thr, /*pc=*/ 1, stk_addr, stk_size);
-    }
+    if (stk_addr && stk_size)
+      MemoryRangeImitateWrite(thr, /*pc=*/ 1, stk_addr, stk_size);
 
     if (tls_addr && tls_size) {
       // Check that the thr object is in tls;
@@ -183,116 +212,41 @@
       CHECK_GE(thr_end, tls_addr);
       CHECK_LE(thr_end, tls_addr + tls_size);
       // Since the thr object is huge, skip it.
-      MemoryResetRange(thr, /*pc=*/ 2, tls_addr, thr_beg - tls_addr);
-      MemoryResetRange(thr, /*pc=*/ 2, thr_end, tls_addr + tls_size - thr_end);
+      MemoryRangeImitateWrite(thr, /*pc=*/ 2, tls_addr, thr_beg - tls_addr);
+      MemoryRangeImitateWrite(thr, /*pc=*/ 2,
+          thr_end, tls_addr + tls_size - thr_end);
     }
   }
 
-  Lock l(&CTX()->thread_mtx);
-  ThreadContext *tctx = CTX()->threads[tid];
-  CHECK_NE(tctx, 0);
-  CHECK_EQ(tctx->status, ThreadStatusCreated);
-  tctx->status = ThreadStatusRunning;
-  tctx->os_id = os_id;
-  // RoundUp so that one trace part does not contain events
-  // from different threads.
-  tctx->epoch0 = RoundUp(tctx->epoch1 + 1, kTracePartSize);
-  tctx->epoch1 = (u64)-1;
-  new(thr) ThreadState(CTX(), tid, tctx->unique_id,
-      tctx->epoch0, stk_addr, stk_size,
-      tls_addr, tls_size);
-#ifdef TSAN_GO
-  // Setup dynamic shadow stack.
-  const int kInitStackSize = 8;
-  thr->shadow_stack = (uptr*)internal_alloc(MBlockShadowStack,
-      kInitStackSize * sizeof(uptr));
-  thr->shadow_stack_pos = thr->shadow_stack;
-  thr->shadow_stack_end = thr->shadow_stack + kInitStackSize;
-#endif
-#ifndef TSAN_GO
-  AllocatorThreadStart(thr);
-#endif
-  tctx->thr = thr;
-  thr->fast_synch_epoch = tctx->epoch0;
-  thr->clock.set(tid, tctx->epoch0);
-  thr->clock.acquire(&tctx->sync);
-  thr->fast_state.SetHistorySize(flags()->history_size);
-  const uptr trace = (tctx->epoch0 / kTracePartSize) % TraceParts();
-  thr->trace.headers[trace].epoch0 = tctx->epoch0;
-  StatInc(thr, StatSyncAcquire);
-  DPrintf("#%d: ThreadStart epoch=%zu stk_addr=%zx stk_size=%zx "
-          "tls_addr=%zx tls_size=%zx\n",
-          tid, (uptr)tctx->epoch0, stk_addr, stk_size, tls_addr, tls_size);
-  thr->is_alive = true;
+  OnStartedArgs args = { thr, stk_addr, stk_size, tls_addr, tls_size };
+  CTX()->thread_registry->StartThread(tid, os_id, &args);
 }
 
 void ThreadFinish(ThreadState *thr) {
   CHECK_GT(thr->in_rtl, 0);
   StatInc(thr, StatThreadFinish);
-  // FIXME: Treat it as write.
   if (thr->stk_addr && thr->stk_size)
-    MemoryResetRange(thr, /*pc=*/ 3, thr->stk_addr, thr->stk_size);
-  if (thr->tls_addr && thr->tls_size) {
-    const uptr thr_beg = (uptr)thr;
-    const uptr thr_end = (uptr)thr + sizeof(*thr);
-    // Since the thr object is huge, skip it.
-    MemoryResetRange(thr, /*pc=*/ 4, thr->tls_addr, thr_beg - thr->tls_addr);
-    MemoryResetRange(thr, /*pc=*/ 5,
-        thr_end, thr->tls_addr + thr->tls_size - thr_end);
-  }
+    DontNeedShadowFor(thr->stk_addr, thr->stk_size);
+  if (thr->tls_addr && thr->tls_size)
+    DontNeedShadowFor(thr->tls_addr, thr->tls_size);
   thr->is_alive = false;
   Context *ctx = CTX();
-  Lock l(&ctx->thread_mtx);
-  ThreadContext *tctx = ctx->threads[thr->tid];
-  CHECK_NE(tctx, 0);
-  CHECK_EQ(tctx->status, ThreadStatusRunning);
-  CHECK_GT(ctx->alive_threads, 0);
-  ctx->alive_threads--;
-  if (tctx->detached) {
-    ThreadDead(thr, tctx);
-  } else {
-    thr->fast_state.IncrementEpoch();
-    // Can't increment epoch w/o writing to the trace as well.
-    TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0);
-    thr->clock.set(thr->tid, thr->fast_state.epoch());
-    thr->fast_synch_epoch = thr->fast_state.epoch();
-    thr->clock.release(&tctx->sync);
-    StatInc(thr, StatSyncRelease);
-    tctx->status = ThreadStatusFinished;
-  }
+  ctx->thread_registry->FinishThread(thr->tid);
+}
 
-  // Save from info about the thread.
-  tctx->dead_info = new(internal_alloc(MBlockDeadInfo, sizeof(ThreadDeadInfo)))
-      ThreadDeadInfo();
-  for (uptr i = 0; i < TraceParts(); i++) {
-    tctx->dead_info->trace.headers[i].epoch0 = thr->trace.headers[i].epoch0;
-    tctx->dead_info->trace.headers[i].stack0.CopyFrom(
-        thr->trace.headers[i].stack0);
+static bool FindThreadByUid(ThreadContextBase *tctx, void *arg) {
+  uptr uid = (uptr)arg;
+  if (tctx->user_id == uid && tctx->status != ThreadStatusInvalid) {
+    tctx->user_id = 0;
+    return true;
   }
-  tctx->epoch1 = thr->fast_state.epoch();
-
-#ifndef TSAN_GO
-  AllocatorThreadFinish(thr);
-#endif
-  thr->~ThreadState();
-  StatAggregate(ctx->stat, thr->stat);
-  tctx->thr = 0;
+  return false;
 }
 
 int ThreadTid(ThreadState *thr, uptr pc, uptr uid) {
   CHECK_GT(thr->in_rtl, 0);
   Context *ctx = CTX();
-  Lock l(&ctx->thread_mtx);
-  int res = -1;
-  for (unsigned tid = 0; tid < kMaxTid; tid++) {
-    ThreadContext *tctx = ctx->threads[tid];
-    if (tctx != 0 && tctx->user_id == uid
-        && tctx->status != ThreadStatusInvalid) {
-      tctx->user_id = 0;
-      res = tid;
-      break;
-    }
-  }
+  int res = ctx->thread_registry->FindThread(FindThreadByUid, (void*)uid);
   DPrintf("#%d: ThreadTid uid=%zu tid=%d\n", thr->tid, uid, res);
   return res;
 }
@@ -303,18 +257,7 @@
   CHECK_LT(tid, kMaxTid);
   DPrintf("#%d: ThreadJoin tid=%d\n", thr->tid, tid);
   Context *ctx = CTX();
-  Lock l(&ctx->thread_mtx);
-  ThreadContext *tctx = ctx->threads[tid];
-  if (tctx->status == ThreadStatusInvalid) {
-    Printf("ThreadSanitizer: join of non-existent thread\n");
-    return;
-  }
-  // FIXME(dvyukov): print message and continue (it's user error).
-  CHECK_EQ(tctx->detached, false);
-  CHECK_EQ(tctx->status, ThreadStatusFinished);
-  thr->clock.acquire(&tctx->sync);
-  StatInc(thr, StatSyncAcquire);
-  ThreadDead(thr, tctx);
+  ctx->thread_registry->JoinThread(tid, thr);
 }
 
 void ThreadDetach(ThreadState *thr, uptr pc, int tid) {
@@ -322,31 +265,12 @@
   CHECK_GT(tid, 0);
   CHECK_LT(tid, kMaxTid);
   Context *ctx = CTX();
-  Lock l(&ctx->thread_mtx);
-  ThreadContext *tctx = ctx->threads[tid];
-  if (tctx->status == ThreadStatusInvalid) {
-    Printf("ThreadSanitizer: detach of non-existent thread\n");
-    return;
-  }
-  if (tctx->status == ThreadStatusFinished) {
-    ThreadDead(thr, tctx);
-  } else {
-    tctx->detached = true;
-  }
+  ctx->thread_registry->DetachThread(tid);
 }
 
 void ThreadSetName(ThreadState *thr, const char *name) {
-  Context *ctx = CTX();
-  Lock l(&ctx->thread_mtx);
-  ThreadContext *tctx = ctx->threads[thr->tid];
-  CHECK_NE(tctx, 0);
-  CHECK_EQ(tctx->status, ThreadStatusRunning);
-  if (tctx->name) {
-    internal_free(tctx->name);
-    tctx->name = 0;
-  }
-  if (name)
-    tctx->name = internal_strdup(name);
+  CHECK_GT(thr->in_rtl, 0);
+  CTX()->thread_registry->SetThreadName(thr->tid, name);
 }
 
 void MemoryAccessRange(ThreadState *thr, uptr pc, uptr addr,
diff --git a/lib/tsan/rtl/tsan_stat.cc b/lib/tsan/rtl/tsan_stat.cc
index 2fd3a69..4a0d0f4 100644
--- a/lib/tsan/rtl/tsan_stat.cc
+++ b/lib/tsan/rtl/tsan_stat.cc
@@ -273,6 +273,9 @@
   name[StatInt_ctime_r]                  = "  ctime_r                         ";
   name[StatInt_asctime]                  = "  asctime                         ";
   name[StatInt_asctime_r]                = "  asctime_r                       ";
+  name[StatInt_frexp]                    = "  frexp                           ";
+  name[StatInt_frexpf]                   = "  frexpf                          ";
+  name[StatInt_frexpl]                   = "  frexpl                          ";
 
   name[StatAnnotation]                   = "Dynamic annotations               ";
   name[StatAnnotateHappensBefore]        = "  HappensBefore                   ";
diff --git a/lib/tsan/rtl/tsan_stat.h b/lib/tsan/rtl/tsan_stat.h
index e4362b0..131dd66 100644
--- a/lib/tsan/rtl/tsan_stat.h
+++ b/lib/tsan/rtl/tsan_stat.h
@@ -273,6 +273,9 @@
   StatInt_ctime_r,
   StatInt_asctime,
   StatInt_asctime_r,
+  StatInt_frexp,
+  StatInt_frexpf,
+  StatInt_frexpl,
 
   // Dynamic annotations.
   StatAnnotation,
diff --git a/lib/tsan/rtl/tsan_sync.cc b/lib/tsan/rtl/tsan_sync.cc
index b25346e..94bad21 100644
--- a/lib/tsan/rtl/tsan_sync.cc
+++ b/lib/tsan/rtl/tsan_sync.cc
@@ -63,7 +63,7 @@
   const u64 uid = atomic_fetch_add(&uid_gen_, 1, memory_order_relaxed);
   SyncVar *res = new(mem) SyncVar(addr, uid);
 #ifndef TSAN_GO
-  res->creation_stack.ObtainCurrent(thr, pc);
+  res->creation_stack_id = CurrentStackId(thr, pc);
 #endif
   return res;
 }
@@ -82,9 +82,9 @@
   // the hashmap anyway.
   if (PrimaryAllocator::PointerIsMine((void*)addr)) {
     MBlock *b = user_mblock(thr, (void*)addr);
-    Lock l(&b->mtx);
+    MBlock::ScopedLock l(b);
     SyncVar *res = 0;
-    for (res = b->head; res; res = res->next) {
+    for (res = b->ListHead(); res; res = res->next) {
       if (res->addr == addr)
         break;
     }
@@ -92,8 +92,7 @@
       if (!create)
         return 0;
       res = Create(thr, pc, addr);
-      res->next = b->head;
-      b->head = res;
+      b->ListPush(res);
     }
     if (write_lock)
       res->mtx.Lock();
@@ -149,25 +148,34 @@
     MBlock *b = user_mblock(thr, (void*)addr);
     SyncVar *res = 0;
     {
-      Lock l(&b->mtx);
-      SyncVar **prev = &b->head;
-      res = *prev;
-      while (res) {
+      MBlock::ScopedLock l(b);
+      res = b->ListHead();
+      if (res) {
         if (res->addr == addr) {
           if (res->is_linker_init)
             return 0;
-          *prev = res->next;
-          break;
+          b->ListPop();
+        } else {
+          SyncVar **prev = &res->next;
+          res = *prev;
+          while (res) {
+            if (res->addr == addr) {
+              if (res->is_linker_init)
+                return 0;
+              *prev = res->next;
+              break;
+            }
+            prev = &res->next;
+            res = *prev;
+          }
         }
-        prev = &res->next;
-        res = *prev;
+        if (res) {
+          StatInc(thr, StatSyncDestroyed);
+          res->mtx.Lock();
+          res->mtx.Unlock();
+        }
       }
     }
-    if (res) {
-      StatInc(thr, StatSyncDestroyed);
-      res->mtx.Lock();
-      res->mtx.Unlock();
-    }
     return res;
   }
 #endif
@@ -197,26 +205,6 @@
   return res;
 }
 
-uptr SyncVar::GetMemoryConsumption() {
-  return sizeof(*this)
-      + clock.size() * sizeof(u64)
-      + read_clock.size() * sizeof(u64)
-      + creation_stack.Size() * sizeof(uptr);
-}
-
-uptr SyncTab::GetMemoryConsumption(uptr *nsync) {
-  uptr mem = 0;
-  for (int i = 0; i < kPartCount; i++) {
-    Part *p = &tab_[i];
-    Lock l(&p->mtx);
-    for (SyncVar *s = p->val; s; s = s->next) {
-      *nsync += 1;
-      mem += s->GetMemoryConsumption();
-    }
-  }
-  return mem;
-}
-
 int SyncTab::PartIdx(uptr addr) {
   return (addr >> 3) % kPartCount;
 }
diff --git a/lib/tsan/rtl/tsan_sync.h b/lib/tsan/rtl/tsan_sync.h
index 77749e2..823af54 100644
--- a/lib/tsan/rtl/tsan_sync.h
+++ b/lib/tsan/rtl/tsan_sync.h
@@ -59,7 +59,7 @@
   const u64 uid;  // Globally unique id.
   SyncClock clock;
   SyncClock read_clock;  // Used for rw mutexes only.
-  StackTrace creation_stack;
+  u32 creation_stack_id;
   int owner_tid;  // Set only by exclusive owners.
   u64 last_lock;
   int recursion;
diff --git a/lib/tsan/tests/unit/tsan_mman_test.cc b/lib/tsan/tests/unit/tsan_mman_test.cc
index ecbe874..0961d2b 100644
--- a/lib/tsan/tests/unit/tsan_mman_test.cc
+++ b/lib/tsan/tests/unit/tsan_mman_test.cc
@@ -55,10 +55,10 @@
   EXPECT_NE(p2, p);
   MBlock *b = user_mblock(thr, p);
   EXPECT_NE(b, (MBlock*)0);
-  EXPECT_EQ(b->size, (uptr)10);
+  EXPECT_EQ(b->Size(), (uptr)10);
   MBlock *b2 = user_mblock(thr, p2);
   EXPECT_NE(b2, (MBlock*)0);
-  EXPECT_EQ(b2->size, (uptr)20);
+  EXPECT_EQ(b2->Size(), (uptr)20);
   for (int i = 0; i < 10; i++) {
     p[i] = 42;
     EXPECT_EQ(b, user_mblock(thr, p + i));
diff --git a/lib/ubsan/ubsan_diag.cc b/lib/ubsan/ubsan_diag.cc
index 95fad74..0727ed7 100644
--- a/lib/ubsan/ubsan_diag.cc
+++ b/lib/ubsan/ubsan_diag.cc
@@ -14,6 +14,7 @@
 #include "ubsan_diag.h"
 #include "sanitizer_common/sanitizer_common.h"
 #include "sanitizer_common/sanitizer_libc.h"
+#include "sanitizer_common/sanitizer_report_decorator.h"
 #include "sanitizer_common/sanitizer_stacktrace.h"
 #include "sanitizer_common/sanitizer_symbolizer.h"
 #include <stdio.h>
@@ -70,7 +71,7 @@
   case Location::LK_Source: {
     SourceLocation SLoc = Loc.getSourceLocation();
     if (SLoc.isInvalid())
-      RawWrite("<unknown>:");
+      Printf("<unknown>:");
     else {
       Printf("%s:%d:", SLoc.getFilename(), SLoc.getLine());
       if (SLoc.getColumn())
@@ -86,7 +87,7 @@
     Printf("%p:", Loc.getMemoryLocation());
     break;
   case Location::LK_Null:
-    RawWrite("<unknown>:");
+    Printf("<unknown>:");
     break;
   }
 }
@@ -99,7 +100,7 @@
       for (I = 0; Msg[I] && Msg[I] != '%' && I != 63; ++I)
         Buffer[I] = Msg[I];
       Buffer[I] = '\0';
-      RawWrite(Buffer);
+      Printf(Buffer);
       Msg += I - 1;
     } else {
       const Diag::Arg &A = Args[*++Msg - '0'];
@@ -108,9 +109,7 @@
         Printf("%s", A.String);
         break;
       case Diag::AK_Mangled: {
-        RawWrite("'");
-        RawWrite(Demangle(A.String));
-        RawWrite("'");
+        Printf("'%s'", Demangle(A.String));
         break;
       }
       case Diag::AK_SInt:
@@ -156,7 +155,8 @@
 }
 
 /// Render a snippet of the address space near a location.
-static void renderMemorySnippet(bool UseAnsiColor, MemoryLocation Loc,
+static void renderMemorySnippet(const __sanitizer::AnsiColorDecorator &Decor,
+                                MemoryLocation Loc,
                                 Range *Ranges, unsigned NumRanges,
                                 const Diag::Arg *Args) {
   const unsigned BytesToShow = 32;
@@ -180,11 +180,10 @@
     unsigned char C = *reinterpret_cast<const unsigned char*>(P);
     Printf("%s%02x", (P % 8 == 0) ? "  " : " ", C);
   }
-  RawWrite("\n");
+  Printf("\n");
 
   // Emit highlights.
-  if (UseAnsiColor)
-    RawWrite("\033[1;32m");
+  Printf(Decor.Green());
   Range *InRange = upperBound(Min, Ranges, NumRanges);
   for (uptr P = Min; P != Max; ++P) {
     char Pad = ' ', Byte = ' ';
@@ -197,11 +196,9 @@
     if (InRange && InRange->getStart().getMemoryLocation() <= P)
       Byte = '~';
     char Buffer[] = { Pad, Pad, P == Loc ? '^' : Byte, Byte, 0 };
-    RawWrite((P % 8 == 0) ? Buffer : &Buffer[1]);
+    Printf((P % 8 == 0) ? Buffer : &Buffer[1]);
   }
-  if (UseAnsiColor)
-    RawWrite("\033[0m");
-  RawWrite("\n");
+  Printf("%s\n", Decor.Default());
 
   // Go over the line again, and print names for the ranges.
   InRange = 0;
@@ -216,9 +213,9 @@
 
     if (InRange && InRange->getStart().getMemoryLocation() == P) {
       while (Spaces--)
-        RawWrite(" ");
+        Printf(" ");
       renderText(InRange->getText(), Args);
-      RawWrite("\n");
+      Printf("\n");
       // FIXME: We only support naming one range for now!
       break;
     }
@@ -239,38 +236,27 @@
 }
 
 Diag::~Diag() {
-  bool UseAnsiColor = PrintsToTty();
-  if (UseAnsiColor)
-    RawWrite("\033[1m");
+  __sanitizer::AnsiColorDecorator Decor(PrintsToTty());
+  Printf(Decor.Bold());
 
   renderLocation(Loc);
 
   switch (Level) {
   case DL_Error:
-    if (UseAnsiColor)
-      RawWrite("\033[31m");
-    RawWrite(" runtime error: ");
-    if (UseAnsiColor)
-      RawWrite("\033[0;1m");
+    Printf("%s runtime error: %s%s",
+           Decor.Red(), Decor.Default(), Decor.Bold());
     break;
 
   case DL_Note:
-    if (UseAnsiColor)
-      RawWrite("\033[30m");
-    RawWrite(" note: ");
-    if (UseAnsiColor)
-      RawWrite("\033[0m");
+    Printf("%s note: %s", Decor.Black(), Decor.Default());
     break;
   }
 
   renderText(Message, Args);
 
-  if (UseAnsiColor)
-    RawWrite("\033[0m");
-
-  RawWrite("\n");
+  Printf("%s\n", Decor.Default());
 
   if (Loc.isMemoryLocation())
-    renderMemorySnippet(UseAnsiColor, Loc.getMemoryLocation(), Ranges,
+    renderMemorySnippet(Decor, Loc.getMemoryLocation(), Ranges,
                         NumRanges, Args);
 }
diff --git a/make/AppleBI.mk b/make/AppleBI.mk
index b5e702b..bb78853 100644
--- a/make/AppleBI.mk
+++ b/make/AppleBI.mk
@@ -57,7 +57,13 @@
 	   $(OBJROOT)/version.c -arch $* -dynamiclib \
 	   -install_name /usr/lib/system/libcompiler_rt.dylib \
 	   -compatibility_version 1 -current_version $(RC_ProjectSourceVersion) \
-	   -nodefaultlibs -lSystem -umbrella System -dead_strip \
+	   -nodefaultlibs -umbrella System -dead_strip \
+	   -Wl,-upward-lunwind \
+	   -Wl,-upward-lsystem_m \
+	   -Wl,-upward-lsystem_c \
+	   -Wl,-ldyld \
+	   -Wl,-lsystem_kernel \
+	   -L$(SDKROOT)/usr/lib/system \
 	   $(DYLIB_FLAGS) -Wl,-force_load,$^ -o $@ 
 
 # Rule to make fat dylib
diff --git a/make/platform/clang_darwin.mk b/make/platform/clang_darwin.mk
index 5179ce7..61c5b53 100644
--- a/make/platform/clang_darwin.mk
+++ b/make/platform/clang_darwin.mk
@@ -132,7 +132,7 @@
 CFLAGS.10.4		:= $(CFLAGS) $(OSX_DEPLOYMENT_ARGS)
 # FIXME: We can't build ASAN with our stub SDK yet.
 CFLAGS.asan_osx         := $(CFLAGS) -mmacosx-version-min=10.5 -fno-builtin \
-                           -DASAN_FLEXIBLE_MAPPING_AND_OFFSET=1
+                           -fno-rtti -DASAN_FLEXIBLE_MAPPING_AND_OFFSET=1
 CFLAGS.asan_osx_dynamic := \
 	$(CFLAGS) -mmacosx-version-min=10.5 -fno-builtin \
 	-DMAC_INTERPOSE_FUNCTIONS=1 \
diff --git a/make/platform/clang_linux.mk b/make/platform/clang_linux.mk
index 89f7268..d3ddc71 100644
--- a/make/platform/clang_linux.mk
+++ b/make/platform/clang_linux.mk
@@ -86,12 +86,12 @@
 CFLAGS.full-x86_64 := $(CFLAGS) -m64
 CFLAGS.profile-i386 := $(CFLAGS) -m32
 CFLAGS.profile-x86_64 := $(CFLAGS) -m64
-CFLAGS.asan-i386 := $(CFLAGS) -m32 -fPIE -fno-builtin \
+CFLAGS.asan-i386 := $(CFLAGS) -m32 -fPIE -fno-builtin -fno-rtti \
                     -DASAN_FLEXIBLE_MAPPING_AND_OFFSET=1
-CFLAGS.asan-x86_64 := $(CFLAGS) -m64 -fPIE -fno-builtin \
+CFLAGS.asan-x86_64 := $(CFLAGS) -m64 -fPIE -fno-builtin -fno-rtti \
                     -DASAN_FLEXIBLE_MAPPING_AND_OFFSET=1
-CFLAGS.tsan-x86_64 := $(CFLAGS) -m64 -fPIE -fno-builtin
-CFLAGS.msan-x86_64 := $(CFLAGS) -m64 -fPIE -fno-builtin
+CFLAGS.tsan-x86_64 := $(CFLAGS) -m64 -fPIE -fno-builtin -fno-rtti
+CFLAGS.msan-x86_64 := $(CFLAGS) -m64 -fPIE -fno-builtin -fno-rtti
 CFLAGS.ubsan-i386 := $(CFLAGS) -m32 -fPIE -fno-builtin
 CFLAGS.ubsan-x86_64 := $(CFLAGS) -m64 -fPIE -fno-builtin