Update aosp/master compiler-rt for rebase to r275480

Bug: http://b/31320715

This merges commit db963a21048fd7242daae74666cb221f796f33c3 from
aosp/dev.

Test: Build AOSP and run RenderScript tests (host tests for slang and
libbcc, RsTest, CTS)

Change-Id: I8bdd651c1759c4306d62c34edec1d8d9536c9715
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index 4bc6f7a..a2b55c4 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -15,6 +15,7 @@
 
   if(COMPILER_RT_HAS_SANITIZER_COMMON)
     add_subdirectory(sanitizer_common)
+    add_subdirectory(stats)
     add_subdirectory(lsan)
     add_subdirectory(ubsan)
   endif()
@@ -47,4 +48,12 @@
   if(COMPILER_RT_HAS_CFI)
     add_subdirectory(cfi)
   endif()
+
+  if(COMPILER_RT_HAS_ESAN)
+    add_subdirectory(esan)
+  endif()
+
+  if(COMPILER_RT_HAS_SCUDO)
+    add_subdirectory(scudo)
+  endif()
 endif()
diff --git a/lib/Makefile.mk b/lib/Makefile.mk
index 7eb6489..b1540bd 100644
--- a/lib/Makefile.mk
+++ b/lib/Makefile.mk
@@ -10,10 +10,4 @@
 SubDirs :=
 
 # Add submodules.
-SubDirs += asan
 SubDirs += builtins
-SubDirs += interception
-SubDirs += lsan
-SubDirs += profile
-SubDirs += sanitizer_common
-SubDirs += ubsan
diff --git a/lib/asan/Android.bp b/lib/asan/Android.bp
index b64cddc..e815237 100644
--- a/lib/asan/Android.bp
+++ b/lib/asan/Android.bp
@@ -75,11 +75,11 @@
         },
         host: {
             srcs: asan_rtl_files,
-            whole_static_libs: ["libubsan"],
-            static_libs: [
+            whole_static_libs: [
+                "libubsan",
                 "libinterception",
                 "liblsan",
-                "libsan",
+                "libsan"
             ],
         },
     },
@@ -115,6 +115,95 @@
     },
 }
 
+cc_defaults {
+    name: "libclang_rt_defaults",
+
+    // TODO:
+    // This library must go on /system partition, even in SANITIZE_TARGET mode (when all libraries are
+    // installed on /data). That's because /data may not be available until vold does some magic and
+    // vold itself depends on this library.
+
+    arch: {
+        arm: {
+            instruction_set: "arm",
+        },
+    },
+    include_dirs: [
+        "external/compiler-rt/lib",
+        "external/compiler-rt/include",
+    ],
+    cflags: asan_rtl_cflags,
+    ldflags: ["-Wl,-z,global"],
+    srcs: asan_rtl_files + asan_rtl_cxx_files,
+    static_libs: [
+        "libinterception",
+        "liblsan",
+        "libsan",
+        "libubsan",
+    ],
+    shared_libs: [
+        "liblog",
+        "libdl",
+    ],
+    clang: true,
+    sanitize: {
+        never: true,
+    },
+    sdk_version: "19",
+    stl: "none",
+    enabled: false,
+}
+
+cc_library_shared {
+    name: "libclang_rt.asan-arm-android",
+    defaults: ["libclang_rt_defaults"],
+    arch: {
+        arm: {
+            enabled: true,
+        },
+    },
+}
+
+cc_library_shared {
+    name: "libclang_rt.asan-aarch64-android",
+    defaults: ["libclang_rt_defaults"],
+    arch: {
+        arm64: {
+            enabled: true,
+        },
+    },
+}
+
+cc_library_shared {
+    name: "libclang_rt.asan-mips-android",
+    defaults: ["libclang_rt_defaults"],
+    arch: {
+        mips: {
+            enabled: true,
+        },
+    },
+}
+
+cc_library_shared {
+    name: "libclang_rt.asan-mips64-android",
+    defaults: ["libclang_rt_defaults"],
+    arch: {
+        mips64: {
+            enabled: true,
+        },
+    },
+}
+
+cc_library_shared {
+    name: "libclang_rt.asan-i686-android",
+    defaults: ["libclang_rt_defaults"],
+    arch: {
+        x86: {
+            enabled: true,
+        },
+    },
+}
+
 cc_binary {
     name: "asanwrapper",
     defaults: ["asan_arch_defaults"],
diff --git a/lib/asan/CMakeLists.txt b/lib/asan/CMakeLists.txt
index 6716f48..b7e41fc 100644
--- a/lib/asan/CMakeLists.txt
+++ b/lib/asan/CMakeLists.txt
@@ -13,6 +13,7 @@
   asan_malloc_linux.cc
   asan_malloc_mac.cc
   asan_malloc_win.cc
+  asan_memory_profile.cc
   asan_poisoning.cc
   asan_posix.cc
   asan_report.cc
@@ -32,7 +33,7 @@
 include_directories(..)
 
 set(ASAN_CFLAGS ${SANITIZER_COMMON_CFLAGS})
-append_no_rtti_flag(ASAN_CFLAGS)
+append_rtti_flag(OFF ASAN_CFLAGS)
 
 set(ASAN_COMMON_DEFINITIONS
   ASAN_HAS_EXCEPTIONS=1)
@@ -62,7 +63,7 @@
 set(ASAN_DYNAMIC_CFLAGS ${ASAN_CFLAGS})
 append_list_if(COMPILER_RT_HAS_FTLS_MODEL_INITIAL_EXEC
   -ftls-model=initial-exec ASAN_DYNAMIC_CFLAGS)
-append_list_if(MSVC /DEBUG ASAN_DYNAMIC_CFLAGS)
+append_list_if(MSVC /DEBUG ASAN_DYNAMIC_LINK_FLAGS)
 
 append_list_if(COMPILER_RT_HAS_LIBC c ASAN_DYNAMIC_LIBS)
 append_list_if(COMPILER_RT_HAS_LIBDL dl ASAN_DYNAMIC_LIBS)
@@ -74,7 +75,7 @@
 
 # Compile ASan sources into an object library.
 
-add_compiler_rt_object_libraries(RTAsan_dynamic 
+add_compiler_rt_object_libraries(RTAsan_dynamic
   OS ${SANITIZER_COMMON_SUPPORTED_OS}
   ARCHS ${ASAN_SUPPORTED_ARCH}
   SOURCES ${ASAN_SOURCES} ${ASAN_CXX_SOURCES}
@@ -82,15 +83,15 @@
   DEFS ${ASAN_DYNAMIC_DEFINITIONS})
 
 if(NOT APPLE)
-  add_compiler_rt_object_libraries(RTAsan 
+  add_compiler_rt_object_libraries(RTAsan
     ARCHS ${ASAN_SUPPORTED_ARCH}
     SOURCES ${ASAN_SOURCES} CFLAGS ${ASAN_CFLAGS}
     DEFS ${ASAN_COMMON_DEFINITIONS})
-  add_compiler_rt_object_libraries(RTAsan_cxx 
+  add_compiler_rt_object_libraries(RTAsan_cxx
     ARCHS ${ASAN_SUPPORTED_ARCH}
     SOURCES ${ASAN_CXX_SOURCES} CFLAGS ${ASAN_CFLAGS}
     DEFS ${ASAN_COMMON_DEFINITIONS})
-  add_compiler_rt_object_libraries(RTAsan_preinit 
+  add_compiler_rt_object_libraries(RTAsan_preinit
     ARCHS ${ASAN_SUPPORTED_ARCH}
     SOURCES ${ASAN_PREINIT_SOURCES} CFLAGS ${ASAN_CFLAGS}
     DEFS ${ASAN_COMMON_DEFINITIONS})
@@ -105,6 +106,8 @@
 
 # Build ASan runtimes shipped with Clang.
 add_custom_target(asan)
+set_target_properties(asan PROPERTIES FOLDER "Compiler-RT Misc")
+
 if(APPLE)
   add_compiler_rt_runtime(clang_rt.asan
     SHARED
@@ -121,40 +124,40 @@
     PARENT_TARGET asan)
 else()
   # Build separate libraries for each target.
-  
-    set(ASAN_COMMON_RUNTIME_OBJECT_LIBS
-      RTInterception
-      RTSanitizerCommon
-      RTSanitizerCommonLibc
-      RTLSanCommon
-      RTUbsan)
 
-    add_compiler_rt_runtime(clang_rt.asan
-      STATIC
-      ARCHS ${ASAN_SUPPORTED_ARCH}
-      OBJECT_LIBS RTAsan_preinit
-                  RTAsan
-                  ${ASAN_COMMON_RUNTIME_OBJECT_LIBS}
-      CFLAGS ${ASAN_CFLAGS}
-      DEFS ${ASAN_COMMON_DEFINITIONS}
-      PARENT_TARGET asan)
+  set(ASAN_COMMON_RUNTIME_OBJECT_LIBS
+    RTInterception
+    RTSanitizerCommon
+    RTSanitizerCommonLibc
+    RTLSanCommon
+    RTUbsan)
 
-    add_compiler_rt_runtime(clang_rt.asan_cxx
-      STATIC
-      ARCHS ${ASAN_SUPPORTED_ARCH}
-      OBJECT_LIBS RTAsan_cxx
-                  RTUbsan_cxx
-      CFLAGS ${ASAN_CFLAGS}
-      DEFS ${ASAN_COMMON_DEFINITIONS}
-      PARENT_TARGET asan)
+  add_compiler_rt_runtime(clang_rt.asan
+    STATIC
+    ARCHS ${ASAN_SUPPORTED_ARCH}
+    OBJECT_LIBS RTAsan_preinit
+                RTAsan
+                ${ASAN_COMMON_RUNTIME_OBJECT_LIBS}
+    CFLAGS ${ASAN_CFLAGS}
+    DEFS ${ASAN_COMMON_DEFINITIONS}
+    PARENT_TARGET asan)
 
-    add_compiler_rt_runtime(clang_rt.asan-preinit
-      STATIC
-      ARCHS ${ASAN_SUPPORTED_ARCH}
-      OBJECT_LIBS RTAsan_preinit
-      CFLAGS ${ASAN_CFLAGS}
-      DEFS ${ASAN_COMMON_DEFINITIONS}
-      PARENT_TARGET asan)
+  add_compiler_rt_runtime(clang_rt.asan_cxx
+    STATIC
+    ARCHS ${ASAN_SUPPORTED_ARCH}
+    OBJECT_LIBS RTAsan_cxx
+                RTUbsan_cxx
+    CFLAGS ${ASAN_CFLAGS}
+    DEFS ${ASAN_COMMON_DEFINITIONS}
+    PARENT_TARGET asan)
+
+  add_compiler_rt_runtime(clang_rt.asan-preinit
+    STATIC
+    ARCHS ${ASAN_SUPPORTED_ARCH}
+    OBJECT_LIBS RTAsan_preinit
+    CFLAGS ${ASAN_CFLAGS}
+    DEFS ${ASAN_COMMON_DEFINITIONS}
+    PARENT_TARGET asan)
 
   foreach(arch ${ASAN_SUPPORTED_ARCH})
     if (UNIX AND NOT ${arch} MATCHES "i386|i686")
@@ -165,8 +168,8 @@
            -Wl,--version-script,${CMAKE_CURRENT_BINARY_DIR}/clang_rt.asan-dynamic-${arch}.vers)
       set_source_files_properties(
         ${CMAKE_CURRENT_BINARY_DIR}/dummy.cc
-	PROPERTIES
-	OBJECT_DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/clang_rt.asan-dynamic-${arch}.vers)
+        PROPERTIES
+        OBJECT_DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/clang_rt.asan-dynamic-${arch}.vers)
     else()
       set(VERSION_SCRIPT_FLAG)
     endif()
@@ -194,7 +197,7 @@
         ARCHS ${arch})
       add_dependencies(asan clang_rt.asan_cxx-${arch}-symbols)
       add_sanitizer_rt_symbols(clang_rt.asan
-        ARCHS ${arch} 
+        ARCHS ${arch}
         EXTRA asan.syms.extra)
       add_dependencies(asan clang_rt.asan-${arch}-symbols)
     endif()
@@ -219,8 +222,7 @@
   endforeach()
 endif()
 
-add_compiler_rt_resource_file(asan_blacklist asan_blacklist.txt)
-add_dependencies(asan asan_blacklist)
+add_compiler_rt_resource_file(asan_blacklist asan_blacklist.txt asan)
 add_dependencies(compiler-rt asan)
 
 add_subdirectory(scripts)
diff --git a/lib/asan/Makefile.mk b/lib/asan/Makefile.mk
deleted file mode 100644
index 0dafefc..0000000
--- a/lib/asan/Makefile.mk
+++ /dev/null
@@ -1,29 +0,0 @@
-#===- lib/asan/Makefile.mk ---------------------------------*- Makefile -*--===#
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-#===------------------------------------------------------------------------===#
-
-ModuleName := asan
-SubDirs := 
-
-CCSources := $(foreach file,$(wildcard $(Dir)/*.cc),$(notdir $(file)))
-CXXOnlySources := asan_new_delete.cc
-COnlySources := $(filter-out $(CXXOnlySources),$(CCSources))
-SSources := $(foreach file,$(wildcard $(Dir)/*.S),$(notdir $(file)))
-Sources := $(CCSources) $(SSources)
-ObjNames := $(CCSources:%.cc=%.o) $(SSources:%.S=%.o)
-
-Implementation := Generic
-
-# FIXME: use automatic dependencies?
-Dependencies := $(wildcard $(Dir)/*.h)
-Dependencies += $(wildcard $(Dir)/../interception/*.h)
-Dependencies += $(wildcard $(Dir)/../sanitizer_common/*.h)
-
-# Define a convenience variable for all the asan functions.
-AsanFunctions := $(COnlySources:%.cc=%) $(SSources:%.S=%)
-AsanCXXFunctions := $(CXXOnlySources:%.cc=%)
diff --git a/lib/asan/asan_activation.cc b/lib/asan/asan_activation.cc
index 9df3b97..a5ace85 100644
--- a/lib/asan/asan_activation.cc
+++ b/lib/asan/asan_activation.cc
@@ -47,6 +47,7 @@
     FlagParser parser;
     RegisterActivationFlags(&parser, &f, &cf);
 
+    cf.SetDefaults();
     // Copy the current activation flags.
     allocator_options.CopyTo(&f, &cf);
     cf.malloc_context_size = malloc_context_size;
@@ -61,7 +62,7 @@
       parser.ParseString(env);
     }
 
-    SetVerbosity(cf.verbosity);
+    InitializeCommonFlags(&cf);
 
     if (Verbosity()) ReportUnrecognizedFlags();
 
diff --git a/lib/asan/asan_allocator.cc b/lib/asan/asan_allocator.cc
index 56f184a..6a5d227 100644
--- a/lib/asan/asan_allocator.cc
+++ b/lib/asan/asan_allocator.cc
@@ -223,7 +223,7 @@
 
 struct Allocator {
   static const uptr kMaxAllowedMallocSize =
-      FIRST_32_SECOND_64(3UL << 30, 1UL << 40);
+      FIRST_32_SECOND_64(3UL << 30, 1ULL << 40);
   static const uptr kMaxThreadLocalQuarantine =
       FIRST_32_SECOND_64(1 << 18, 1 << 20);
 
@@ -457,29 +457,28 @@
     return res;
   }
 
-  void AtomicallySetQuarantineFlag(AsanChunk *m, void *ptr,
+  // Set the quarantine flag if the chunk is currently allocated; otherwise
+  // (available or already quarantined chunk) issue an ASan error report.
+  bool AtomicallySetQuarantineFlagIfAllocated(AsanChunk *m, void *ptr,
                                    BufferedStackTrace *stack) {
     u8 old_chunk_state = CHUNK_ALLOCATED;
     // Flip the chunk_state atomically to avoid race on double-free.
-    if (!atomic_compare_exchange_strong((atomic_uint8_t*)m, &old_chunk_state,
-                                        CHUNK_QUARANTINE, memory_order_acquire))
+    if (!atomic_compare_exchange_strong((atomic_uint8_t *)m, &old_chunk_state,
+                                        CHUNK_QUARANTINE,
+                                        memory_order_acquire)) {
       ReportInvalidFree(ptr, old_chunk_state, stack);
+      // It's not safe to push a chunk into quarantine on an invalid free.
+      return false;
+    }
     CHECK_EQ(CHUNK_ALLOCATED, old_chunk_state);
+    return true;
   }
 
   // Expects the chunk to already be marked as quarantined by using
-  // AtomicallySetQuarantineFlag.
+  // AtomicallySetQuarantineFlagIfAllocated.
   void QuarantineChunk(AsanChunk *m, void *ptr, BufferedStackTrace *stack,
                        AllocType alloc_type) {
     CHECK_EQ(m->chunk_state, CHUNK_QUARANTINE);
-
-    if (m->alloc_type != alloc_type) {
-      if (atomic_load(&alloc_dealloc_mismatch, memory_order_acquire)) {
-        ReportAllocTypeMismatch((uptr)ptr, stack, (AllocType)m->alloc_type,
-                                (AllocType)alloc_type);
-      }
-    }
-
     CHECK_GE(m->alloc_tid, 0);
     if (SANITIZER_WORDSIZE == 64)  // On 32-bits this resides in user area.
       CHECK_EQ(m->free_tid, kInvalidTid);
@@ -516,13 +515,24 @@
 
     uptr chunk_beg = p - kChunkHeaderSize;
     AsanChunk *m = reinterpret_cast<AsanChunk *>(chunk_beg);
-    if (delete_size && flags()->new_delete_type_mismatch &&
-        delete_size != m->UsedSize()) {
-      ReportNewDeleteSizeMismatch(p, delete_size, stack);
-    }
+
     ASAN_FREE_HOOK(ptr);
     // Must mark the chunk as quarantined before any changes to its metadata.
-    AtomicallySetQuarantineFlag(m, ptr, stack);
+    // Do not quarantine given chunk if we failed to set CHUNK_QUARANTINE flag.
+    if (!AtomicallySetQuarantineFlagIfAllocated(m, ptr, stack)) return;
+
+    if (m->alloc_type != alloc_type) {
+      if (atomic_load(&alloc_dealloc_mismatch, memory_order_acquire)) {
+        ReportAllocTypeMismatch((uptr)ptr, stack, (AllocType)m->alloc_type,
+                                (AllocType)alloc_type);
+      }
+    }
+
+    if (delete_size && flags()->new_delete_type_mismatch &&
+        delete_size != m->UsedSize()) {
+      ReportNewDeleteSizeMismatch(p, m->UsedSize(), delete_size, stack);
+    }
+
     QuarantineChunk(m, ptr, stack, alloc_type);
   }
 
@@ -655,6 +665,9 @@
 bool AsanChunkView::IsValid() {
   return chunk_ && chunk_->chunk_state != CHUNK_AVAILABLE;
 }
+bool AsanChunkView::IsAllocated() {
+  return chunk_ && chunk_->chunk_state == CHUNK_ALLOCATED;
+}
 uptr AsanChunkView::Beg() { return chunk_->Beg(); }
 uptr AsanChunkView::End() { return Beg() + UsedSize(); }
 uptr AsanChunkView::UsedSize() { return chunk_->UsedSize(); }
@@ -668,12 +681,15 @@
   return res;
 }
 
+u32 AsanChunkView::GetAllocStackId() { return chunk_->alloc_context_id; }
+u32 AsanChunkView::GetFreeStackId() { return chunk_->free_context_id; }
+
 StackTrace AsanChunkView::GetAllocStack() {
-  return GetStackTraceFromId(chunk_->alloc_context_id);
+  return GetStackTraceFromId(GetAllocStackId());
 }
 
 StackTrace AsanChunkView::GetFreeStack() {
-  return GetStackTraceFromId(chunk_->free_context_id);
+  return GetStackTraceFromId(GetFreeStackId());
 }
 
 void InitializeAllocator(const AllocatorOptions &options) {
@@ -754,7 +770,7 @@
   return 0;
 }
 
-uptr asan_malloc_usable_size(void *ptr, uptr pc, uptr bp) {
+uptr asan_malloc_usable_size(const void *ptr, uptr pc, uptr bp) {
   if (!ptr) return 0;
   uptr usable_size = instance.AllocationSize(reinterpret_cast<uptr>(ptr));
   if (flags()->check_malloc_usable_size && (usable_size == 0)) {
diff --git a/lib/asan/asan_allocator.h b/lib/asan/asan_allocator.h
index e3d5333..2f9f7aa 100644
--- a/lib/asan/asan_allocator.h
+++ b/lib/asan/asan_allocator.h
@@ -49,14 +49,17 @@
 class AsanChunkView {
  public:
   explicit AsanChunkView(AsanChunk *chunk) : chunk_(chunk) {}
-  bool IsValid();   // Checks if AsanChunkView points to a valid allocated
-                    // or quarantined chunk.
-  uptr Beg();       // First byte of user memory.
-  uptr End();       // Last byte of user memory.
-  uptr UsedSize();  // Size requested by the user.
+  bool IsValid();        // Checks if AsanChunkView points to a valid allocated
+                         // or quarantined chunk.
+  bool IsAllocated();    // Checks if the memory is currently allocated.
+  uptr Beg();            // First byte of user memory.
+  uptr End();            // Last byte of user memory.
+  uptr UsedSize();       // Size requested by the user.
   uptr AllocTid();
   uptr FreeTid();
   bool Eq(const AsanChunkView &c) const { return chunk_ == c.chunk_; }
+  u32 GetAllocStackId();
+  u32 GetFreeStackId();
   StackTrace GetAllocStack();
   StackTrace GetFreeStack();
   bool AddrIsInside(uptr addr, uptr access_size, sptr *offset) {
@@ -171,7 +174,7 @@
 
 int asan_posix_memalign(void **memptr, uptr alignment, uptr size,
                         BufferedStackTrace *stack);
-uptr asan_malloc_usable_size(void *ptr, uptr pc, uptr bp);
+uptr asan_malloc_usable_size(const void *ptr, uptr pc, uptr bp);
 
 uptr asan_mz_size(const void *ptr);
 void asan_mz_force_lock();
diff --git a/lib/asan/asan_android_stub.cc b/lib/asan/asan_android_stub.cc
deleted file mode 100644
index cf68f58..0000000
--- a/lib/asan/asan_android_stub.cc
+++ /dev/null
@@ -1,4 +0,0 @@
-#include "sanitizer/asan_interface.h"
-
-__attribute__((section(".preinit_array")))
-  typeof(__asan_init) *__asan_preinit =__asan_init;
diff --git a/lib/asan/asan_fake_stack.cc b/lib/asan/asan_fake_stack.cc
index 91fdf0a..16feccd 100644
--- a/lib/asan/asan_fake_stack.cc
+++ b/lib/asan/asan_fake_stack.cc
@@ -31,7 +31,7 @@
   CHECK_EQ(SHADOW_SCALE, 3);  // This code expects SHADOW_SCALE=3.
   u64 *shadow = reinterpret_cast<u64*>(MemToShadow(ptr));
   if (class_id <= 6) {
-    for (uptr i = 0; i < (1U << class_id); i++) {
+    for (uptr i = 0; i < (((uptr)1) << class_id); i++) {
       shadow[i] = magic;
       // Make sure this does not become memset.
       SanitizerBreakOptimization(nullptr);
@@ -121,7 +121,7 @@
   uptr class_id = (ptr - beg) >> stack_size_log;
   uptr base = beg + (class_id << stack_size_log);
   CHECK_LE(base, ptr);
-  CHECK_LT(ptr, base + (1UL << stack_size_log));
+  CHECK_LT(ptr, base + (((uptr)1) << stack_size_log));
   uptr pos = (ptr - base) >> (kMinStackFrameSizeLog + class_id);
   uptr res = base + pos * BytesInSizeClass(class_id);
   *frame_end = res + BytesInSizeClass(class_id);
diff --git a/lib/asan/asan_fake_stack.h b/lib/asan/asan_fake_stack.h
index 3b1d9eb..74ca02d 100644
--- a/lib/asan/asan_fake_stack.h
+++ b/lib/asan/asan_fake_stack.h
@@ -69,12 +69,12 @@
 
   // stack_size_log is at least 15 (stack_size >= 32K).
   static uptr SizeRequiredForFlags(uptr stack_size_log) {
-    return 1UL << (stack_size_log + 1 - kMinStackFrameSizeLog);
+    return ((uptr)1) << (stack_size_log + 1 - kMinStackFrameSizeLog);
   }
 
   // Each size class occupies stack_size bytes.
   static uptr SizeRequiredForFrames(uptr stack_size_log) {
-    return (1ULL << stack_size_log) * kNumberOfSizeClasses;
+    return (((uptr)1) << stack_size_log) * kNumberOfSizeClasses;
   }
 
   // Number of bytes requires for the whole object.
@@ -91,12 +91,12 @@
   // and so on.
   static uptr FlagsOffset(uptr stack_size_log, uptr class_id) {
     uptr t = kNumberOfSizeClasses - 1 - class_id;
-    const uptr all_ones = (1 << (kNumberOfSizeClasses - 1)) - 1;
+    const uptr all_ones = (((uptr)1) << (kNumberOfSizeClasses - 1)) - 1;
     return ((all_ones >> t) << t) << (stack_size_log - 15);
   }
 
   static uptr NumberOfFrames(uptr stack_size_log, uptr class_id) {
-    return 1UL << (stack_size_log - kMinStackFrameSizeLog - class_id);
+    return ((uptr)1) << (stack_size_log - kMinStackFrameSizeLog - class_id);
   }
 
   // Divide n by the numbe of frames in size class.
@@ -114,7 +114,8 @@
   u8 *GetFrame(uptr stack_size_log, uptr class_id, uptr pos) {
     return reinterpret_cast<u8 *>(this) + kFlagsOffset +
            SizeRequiredForFlags(stack_size_log) +
-           (1 << stack_size_log) * class_id + BytesInSizeClass(class_id) * pos;
+           (((uptr)1) << stack_size_log) * class_id +
+           BytesInSizeClass(class_id) * pos;
   }
 
   // Allocate the fake frame.
@@ -137,7 +138,7 @@
 
   // Number of bytes in a fake frame of this size class.
   static uptr BytesInSizeClass(uptr class_id) {
-    return 1UL << (class_id + kMinStackFrameSizeLog);
+    return ((uptr)1) << (class_id + kMinStackFrameSizeLog);
   }
 
   // The fake frame is guaranteed to have a right redzone.
@@ -159,7 +160,7 @@
   static const uptr kFlagsOffset = 4096;  // This is were the flags begin.
   // Must match the number of uses of DEFINE_STACK_MALLOC_FREE_WITH_CLASS_ID
   COMPILER_CHECK(kNumberOfSizeClasses == 11);
-  static const uptr kMaxStackMallocSize = 1 << kMaxStackFrameSizeLog;
+  static const uptr kMaxStackMallocSize = ((uptr)1) << kMaxStackFrameSizeLog;
 
   uptr hint_position_[kNumberOfSizeClasses];
   uptr stack_size_log_;
diff --git a/lib/asan/asan_flags.cc b/lib/asan/asan_flags.cc
index 363ee67..345a35c 100644
--- a/lib/asan/asan_flags.cc
+++ b/lib/asan/asan_flags.cc
@@ -116,7 +116,7 @@
   ubsan_parser.ParseString(GetEnv("UBSAN_OPTIONS"));
 #endif
 
-  SetVerbosity(common_flags()->verbosity);
+  InitializeCommonFlags();
 
   // TODO(eugenis): dump all flags at verbosity>=2?
   if (Verbosity()) ReportUnrecognizedFlags();
@@ -159,6 +159,14 @@
         (ASAN_LOW_MEMORY) ? 1UL << 6 : 1UL << 8;
     f->quarantine_size_mb = kDefaultQuarantineSizeMb;
   }
+  if (!f->replace_str && common_flags()->intercept_strlen) {
+    Report("WARNING: strlen interceptor is enabled even though replace_str=0. "
+           "Use intercept_strlen=0 to disable it.");
+  }
+  if (!f->replace_str && common_flags()->intercept_strchr) {
+    Report("WARNING: strchr* interceptors are enabled even though "
+           "replace_str=0. Use intercept_strchr=0 to disable them.");
+  }
 }
 
 }  // namespace __asan
diff --git a/lib/asan/asan_flags.inc b/lib/asan/asan_flags.inc
index 5e69242..ea63383 100644
--- a/lib/asan/asan_flags.inc
+++ b/lib/asan/asan_flags.inc
@@ -43,7 +43,7 @@
     "If set, uses custom wrappers and replacements for libc string functions "
     "to find more errors.")
 ASAN_FLAG(bool, replace_intrin, true,
-          "If set, uses custom wrappers for memset/memcpy/memmove intinsics.")
+          "If set, uses custom wrappers for memset/memcpy/memmove intrinsics.")
 ASAN_FLAG(bool, detect_stack_use_after_return, false,
           "Enables stack-use-after-return checking at run-time.")
 ASAN_FLAG(int, min_uar_stack_size_log, 16, // We can't do smaller anyway.
@@ -77,6 +77,8 @@
           "Print various statistics after printing an error message or if "
           "atexit=1.")
 ASAN_FLAG(bool, print_legend, true, "Print the legend for the shadow bytes.")
+ASAN_FLAG(bool, print_scariness, false,
+          "Print the scariness score. Experimental.")
 ASAN_FLAG(bool, atexit, false,
           "If set, prints ASan exit stats even after program terminates "
           "successfully.")
@@ -100,11 +102,11 @@
 // https://github.com/google/sanitizers/issues/309
 // TODO(glider,timurrrr): Fix known issues and enable this back.
 ASAN_FLAG(bool, alloc_dealloc_mismatch,
-          (SANITIZER_MAC == 0) && (SANITIZER_WINDOWS == 0),
+          !SANITIZER_MAC && !SANITIZER_WINDOWS && !SANITIZER_ANDROID,
           "Report errors on malloc/delete, new/free, new/delete[], etc.")
 
 ASAN_FLAG(bool, new_delete_type_mismatch, true,
-          "Report errors on mismatch betwen size of new and delete.")
+          "Report errors on mismatch between size of new and delete.")
 ASAN_FLAG(
     bool, strict_init_order, false,
     "If true, assume that dynamic initializers can never access globals from "
@@ -135,3 +137,5 @@
 ASAN_FLAG(bool, halt_on_error, true,
           "Crash the program after printing the first error report "
           "(WARNING: USE AT YOUR OWN RISK!)")
+ASAN_FLAG(bool, use_odr_indicator, false,
+          "Use special ODR indicator symbol for ODR violation detection")
diff --git a/lib/asan/asan_globals.cc b/lib/asan/asan_globals.cc
index eb9f1bf..f185761 100644
--- a/lib/asan/asan_globals.cc
+++ b/lib/asan/asan_globals.cc
@@ -135,6 +135,70 @@
   return false;
 }
 
+enum GlobalSymbolState {
+  UNREGISTERED = 0,
+  REGISTERED = 1
+};
+
+// Check ODR violation for given global G via special ODR indicator. We use
+// this method in case compiler instruments global variables through their
+// local aliases.
+static void CheckODRViolationViaIndicator(const Global *g) {
+  u8 *odr_indicator = reinterpret_cast<u8 *>(g->odr_indicator);
+  if (*odr_indicator == UNREGISTERED) {
+    *odr_indicator = REGISTERED;
+    return;
+  }
+  // If *odr_indicator is REGISTERED, some module has already registered
+  // externally visible symbol with the same name. This is an ODR violation.
+  for (ListOfGlobals *l = list_of_all_globals; l; l = l->next) {
+    if (g->odr_indicator == l->g->odr_indicator &&
+        (flags()->detect_odr_violation >= 2 || g->size != l->g->size) &&
+        !IsODRViolationSuppressed(g->name))
+      ReportODRViolation(g, FindRegistrationSite(g),
+                         l->g, FindRegistrationSite(l->g));
+  }
+}
+
+// Check ODR violation for given global G by checking if it's already poisoned.
+// We use this method in case compiler doesn't use private aliases for global
+// variables.
+static void CheckODRViolationViaPoisoning(const Global *g) {
+  if (__asan_region_is_poisoned(g->beg, g->size_with_redzone)) {
+    // This check may not be enough: if the first global is much larger
+    // the entire redzone of the second global may be within the first global.
+    for (ListOfGlobals *l = list_of_all_globals; l; l = l->next) {
+      if (g->beg == l->g->beg &&
+          (flags()->detect_odr_violation >= 2 || g->size != l->g->size) &&
+          !IsODRViolationSuppressed(g->name))
+        ReportODRViolation(g, FindRegistrationSite(g),
+                           l->g, FindRegistrationSite(l->g));
+    }
+  }
+}
+
+// Clang provides two different ways of protecting global variables:
+// it can poison either the global itself or its private alias. In the
+// former case we may poison the same symbol multiple times, which lets us
+// cheaply detect an ODR violation: if we try to poison an already poisoned
+// global, we have an ODR violation error.
+// In the latter case, we poison each symbol exactly once, so we use a special
+// indicator symbol to perform a similar check.
+// In either case, compiler provides a special odr_indicator field to Global
+// structure, that can contain two kinds of values:
+//   1) Non-zero value. In this case, odr_indicator is an address of
+//      corresponding indicator variable for given global.
+//   2) Zero. This means that we don't use private aliases for global variables
+//      and can freely check ODR violation with the first method.
+//
+// This routine chooses between two different methods of ODR violation
+// detection.
+static inline bool UseODRIndicator(const Global *g) {
+  // Use ODR indicator method iff use_odr_indicator flag is set and
+  // indicator symbol address is not 0.
+  return flags()->use_odr_indicator && g->odr_indicator > 0;
+}
+
 // Register a global variable.
 // This function may be called more than once for every global
 // so we store the globals in a map.
@@ -144,22 +208,24 @@
     ReportGlobal(*g, "Added");
   CHECK(flags()->report_globals);
   CHECK(AddrIsInMem(g->beg));
-  CHECK(AddrIsAlignedByGranularity(g->beg));
+  if (!AddrIsAlignedByGranularity(g->beg)) {
+    Report("The following global variable is not properly aligned.\n");
+    Report("This may happen if another global with the same name\n");
+    Report("resides in another non-instrumented module.\n");
+    Report("Or the global comes from a C file built w/o -fno-common.\n");
+    Report("In either case this is likely an ODR violation bug,\n");
+    Report("but AddressSanitizer can not provide more details.\n");
+    ReportODRViolation(g, FindRegistrationSite(g), g, FindRegistrationSite(g));
+    CHECK(AddrIsAlignedByGranularity(g->beg));
+  }
   CHECK(AddrIsAlignedByGranularity(g->size_with_redzone));
   if (flags()->detect_odr_violation) {
     // Try detecting ODR (One Definition Rule) violation, i.e. the situation
     // where two globals with the same name are defined in different modules.
-    if (__asan_region_is_poisoned(g->beg, g->size_with_redzone)) {
-      // This check may not be enough: if the first global is much larger
-      // the entire redzone of the second global may be within the first global.
-      for (ListOfGlobals *l = list_of_all_globals; l; l = l->next) {
-        if (g->beg == l->g->beg &&
-            (flags()->detect_odr_violation >= 2 || g->size != l->g->size) &&
-            !IsODRViolationSuppressed(g->name))
-          ReportODRViolation(g, FindRegistrationSite(g),
-                             l->g, FindRegistrationSite(l->g));
-      }
-    }
+    if (UseODRIndicator(g))
+      CheckODRViolationViaIndicator(g);
+    else
+      CheckODRViolationViaPoisoning(g);
   }
   if (CanPoisonMemory())
     PoisonRedZones(*g);
@@ -190,6 +256,12 @@
   // We unpoison the shadow memory for the global but we do not remove it from
   // the list because that would require O(n^2) time with the current list
   // implementation. It might not be worth doing anyway.
+
+  // Release ODR indicator.
+  if (UseODRIndicator(g)) {
+    u8 *odr_indicator = reinterpret_cast<u8 *>(g->odr_indicator);
+    *odr_indicator = UNREGISTERED;
+  }
 }
 
 void StopInitOrderChecking() {
@@ -212,6 +284,25 @@
 // ---------------------- Interface ---------------- {{{1
 using namespace __asan;  // NOLINT
 
+
+// Apply __asan_register_globals to all globals found in the same loaded
+// executable or shared library as `flag'. The flag tracks whether globals have
+// already been registered or not for this image.
+void __asan_register_image_globals(uptr *flag) {
+  if (*flag)
+    return;
+  AsanApplyToGlobals(__asan_register_globals, flag);
+  *flag = 1;
+}
+
+// This mirrors __asan_register_image_globals.
+void __asan_unregister_image_globals(uptr *flag) {
+  if (!*flag)
+    return;
+  AsanApplyToGlobals(__asan_unregister_globals, flag);
+  *flag = 0;
+}
+
 // Register an array of globals.
 void __asan_register_globals(__asan_global *globals, uptr n) {
   if (!flags()->report_globals) return;
diff --git a/lib/asan/asan_init_version.h b/lib/asan/asan_init_version.h
index bc8a622..f48cc19 100644
--- a/lib/asan/asan_init_version.h
+++ b/lib/asan/asan_init_version.h
@@ -19,16 +19,20 @@
   // Every time the ASan ABI changes we also change the version number in the
   // __asan_init function name.  Objects built with incompatible ASan ABI
   // versions will not link with run-time.
+  //
   // Changes between ABI versions:
   // v1=>v2: added 'module_name' to __asan_global
   // v2=>v3: stack frame description (created by the compiler)
-  //         contains the function PC as the 3-rd field (see
-  //         DescribeAddressIfStack).
-  // v3=>v4: added '__asan_global_source_location' to __asan_global.
+  //         contains the function PC as the 3rd field (see
+  //         DescribeAddressIfStack)
+  // v3=>v4: added '__asan_global_source_location' to __asan_global
   // v4=>v5: changed the semantics and format of __asan_stack_malloc_ and
-  //         __asan_stack_free_ functions.
+  //         __asan_stack_free_ functions
   // v5=>v6: changed the name of the version check symbol
-  #define __asan_version_mismatch_check __asan_version_mismatch_check_v6
+  // v6=>v7: added 'odr_indicator' to __asan_global
+  // v7=>v8: added '__asan_(un)register_image_globals' functions for dead
+  //         stripping support on Mach-O platforms
+  #define __asan_version_mismatch_check __asan_version_mismatch_check_v8
 }
 
 #endif  // ASAN_INIT_VERSION_H
diff --git a/lib/asan/asan_interceptors.cc b/lib/asan/asan_interceptors.cc
index d9a0c71..3b3b0f3 100644
--- a/lib/asan/asan_interceptors.cc
+++ b/lib/asan/asan_interceptors.cc
@@ -21,6 +21,7 @@
 #include "asan_stack.h"
 #include "asan_stats.h"
 #include "asan_suppressions.h"
+#include "lsan/lsan_common.h"
 #include "sanitizer_common/sanitizer_libc.h"
 
 #if SANITIZER_POSIX
@@ -110,7 +111,7 @@
 } while (0)
 
 static inline uptr MaybeRealStrnlen(const char *s, uptr maxlen) {
-#if ASAN_INTERCEPT_STRNLEN
+#if SANITIZER_INTERCEPT_STRNLEN
   if (REAL(strnlen)) {
     return REAL(strnlen)(s, maxlen);
   }
@@ -143,6 +144,8 @@
   (void) ctx;                                                                  \
 
 #define COMMON_INTERCEPT_FUNCTION(name) ASAN_INTERCEPT_FUNC(name)
+#define COMMON_INTERCEPT_FUNCTION_VER(name, ver)                          \
+  ASAN_INTERCEPT_FUNC_VER(name, ver)
 #define COMMON_INTERCEPTOR_WRITE_RANGE(ctx, ptr, size) \
   ASAN_WRITE_RANGE(ctx, ptr, size)
 #define COMMON_INTERCEPTOR_READ_RANGE(ctx, ptr, size) \
@@ -195,6 +198,10 @@
   } else {                                                                     \
     *begin = *end = 0;                                                         \
   }
+// Asan needs custom handling of these:
+#undef SANITIZER_INTERCEPT_MEMSET
+#undef SANITIZER_INTERCEPT_MEMMOVE
+#undef SANITIZER_INTERCEPT_MEMCPY
 #include "sanitizer_common/sanitizer_common_interceptors.inc"
 
 // Syscall interceptors don't have contexts, we don't support suppressions
@@ -218,6 +225,7 @@
   atomic_uintptr_t is_registered;
 };
 
+#if ASAN_INTERCEPT_PTHREAD_CREATE
 static thread_return_t THREAD_CALLING_CONV asan_thread_start(void *arg) {
   ThreadStartParam *param = reinterpret_cast<ThreadStartParam *>(arg);
   AsanThread *t = nullptr;
@@ -228,7 +236,6 @@
   return t->ThreadStart(GetTid(), &param->is_registered);
 }
 
-#if ASAN_INTERCEPT_PTHREAD_CREATE
 INTERCEPTOR(int, pthread_create, void *thread,
     void *attr, void *(*start_routine)(void*), void *arg) {
   EnsureMainThreadIDIsCorrect();
@@ -242,7 +249,17 @@
   ThreadStartParam param;
   atomic_store(&param.t, 0, memory_order_relaxed);
   atomic_store(&param.is_registered, 0, memory_order_relaxed);
-  int result = REAL(pthread_create)(thread, attr, asan_thread_start, &param);
+  int result;
+  {
+    // Ignore all allocations made by pthread_create: thread stack/TLS may be
+    // stored by pthread for future reuse even after thread destruction, and
+    // the linked list it's stored in doesn't even hold valid pointers to the
+    // objects, the latter are calculated by obscure pointer arithmetic.
+#if CAN_SANITIZE_LEAKS
+    __lsan::ScopedInterceptorDisabler disabler;
+#endif
+    result = REAL(pthread_create)(thread, attr, asan_thread_start, &param);
+  }
   if (result == 0) {
     u32 current_tid = GetCurrentTidOrInvalid();
     AsanThread *t =
@@ -271,7 +288,8 @@
 
 #if SANITIZER_ANDROID
 INTERCEPTOR(void*, bsd_signal, int signum, void *handler) {
-  if (!IsDeadlySignal(signum) || common_flags()->allow_user_segv_handler) {
+  if (!IsHandledDeadlySignal(signum) ||
+      common_flags()->allow_user_segv_handler) {
     return REAL(bsd_signal)(signum, handler);
   }
   return 0;
@@ -279,7 +297,8 @@
 #endif
 
 INTERCEPTOR(void*, signal, int signum, void *handler) {
-  if (!IsDeadlySignal(signum) || common_flags()->allow_user_segv_handler) {
+  if (!IsHandledDeadlySignal(signum) ||
+      common_flags()->allow_user_segv_handler) {
     return REAL(signal)(signum, handler);
   }
   return nullptr;
@@ -287,7 +306,8 @@
 
 INTERCEPTOR(int, sigaction, int signum, const struct sigaction *act,
                             struct sigaction *oldact) {
-  if (!IsDeadlySignal(signum) || common_flags()->allow_user_segv_handler) {
+  if (!IsHandledDeadlySignal(signum) ||
+      common_flags()->allow_user_segv_handler) {
     return REAL(sigaction)(signum, act, oldact);
   }
   return 0;
@@ -453,25 +473,6 @@
   ASAN_MEMSET_IMPL(ctx, block, c, size);
 }
 
-INTERCEPTOR(char*, strchr, const char *str, int c) {
-  void *ctx;
-  ASAN_INTERCEPTOR_ENTER(ctx, strchr);
-  if (UNLIKELY(!asan_inited)) return internal_strchr(str, c);
-  // strchr is called inside create_purgeable_zone() when MallocGuardEdges=1 is
-  // used.
-  if (asan_init_is_running) {
-    return REAL(strchr)(str, c);
-  }
-  ENSURE_ASAN_INITED();
-  char *result = REAL(strchr)(str, c);
-  if (flags()->replace_str) {
-    uptr len = REAL(strlen)(str);
-    uptr bytes_read = (result ? result - str : len) + 1;
-    ASAN_READ_STRING_OF_LEN(ctx, str, len, bytes_read);
-  }
-  return result;
-}
-
 #if ASAN_INTERCEPT_INDEX
 # if ASAN_USE_ALIAS_ATTRIBUTE_FOR_INDEX
 INTERCEPTOR(char*, index, const char *string, int c)
@@ -549,7 +550,6 @@
   return REAL(strcpy)(to, from);  // NOLINT
 }
 
-#if ASAN_INTERCEPT_STRDUP
 INTERCEPTOR(char*, strdup, const char *s) {
   void *ctx;
   ASAN_INTERCEPTOR_ENTER(ctx, strdup);
@@ -564,24 +564,23 @@
   REAL(memcpy)(new_mem, s, length + 1);
   return reinterpret_cast<char*>(new_mem);
 }
-#endif
 
-INTERCEPTOR(SIZE_T, strlen, const char *s) {
+#if ASAN_INTERCEPT___STRDUP
+INTERCEPTOR(char*, __strdup, const char *s) {
   void *ctx;
-  ASAN_INTERCEPTOR_ENTER(ctx, strlen);
-  if (UNLIKELY(!asan_inited)) return internal_strlen(s);
-  // strlen is called from malloc_default_purgeable_zone()
-  // in __asan::ReplaceSystemAlloc() on Mac.
-  if (asan_init_is_running) {
-    return REAL(strlen)(s);
-  }
+  ASAN_INTERCEPTOR_ENTER(ctx, strdup);
+  if (UNLIKELY(!asan_inited)) return internal_strdup(s);
   ENSURE_ASAN_INITED();
-  SIZE_T length = REAL(strlen)(s);
+  uptr length = REAL(strlen)(s);
   if (flags()->replace_str) {
     ASAN_READ_RANGE(ctx, s, length + 1);
   }
-  return length;
+  GET_STACK_TRACE_MALLOC;
+  void *new_mem = asan_malloc(length + 1, &stack);
+  REAL(memcpy)(new_mem, s, length + 1);
+  return reinterpret_cast<char*>(new_mem);
 }
+#endif // ASAN_INTERCEPT___STRDUP
 
 INTERCEPTOR(SIZE_T, wcslen, const wchar_t *s) {
   void *ctx;
@@ -607,19 +606,6 @@
   return REAL(strncpy)(to, from, size);
 }
 
-#if ASAN_INTERCEPT_STRNLEN
-INTERCEPTOR(uptr, strnlen, const char *s, uptr maxlen) {
-  void *ctx;
-  ASAN_INTERCEPTOR_ENTER(ctx, strnlen);
-  ENSURE_ASAN_INITED();
-  uptr length = REAL(strnlen)(s, maxlen);
-  if (flags()->replace_str) {
-    ASAN_READ_RANGE(ctx, s, Min(length + 1, maxlen));
-  }
-  return length;
-}
-#endif  // ASAN_INTERCEPT_STRNLEN
-
 INTERCEPTOR(long, strtol, const char *nptr,  // NOLINT
             char **endptr, int base) {
   void *ctx;
@@ -702,12 +688,12 @@
 }
 #endif  // ASAN_INTERCEPT_ATOLL_AND_STRTOLL
 
+#if ASAN_INTERCEPT___CXA_ATEXIT
 static void AtCxaAtexit(void *unused) {
   (void)unused;
   StopInitOrderChecking();
 }
 
-#if ASAN_INTERCEPT___CXA_ATEXIT
 INTERCEPTOR(int, __cxa_atexit, void (*func)(void *), void *arg,
             void *dso_handle) {
 #if SANITIZER_MAC
@@ -739,25 +725,23 @@
   InitializeCommonInterceptors();
 
   // Intercept mem* functions.
-  ASAN_INTERCEPT_FUNC(memmove);
+  ASAN_INTERCEPT_FUNC(memcpy);
   ASAN_INTERCEPT_FUNC(memset);
   if (PLATFORM_HAS_DIFFERENT_MEMCPY_AND_MEMMOVE) {
-    ASAN_INTERCEPT_FUNC(memcpy);
+    // In asan, REAL(memmove) is not used, but it is used in msan.
+    ASAN_INTERCEPT_FUNC(memmove);
   }
+  CHECK(REAL(memcpy));
 
   // Intercept str* functions.
   ASAN_INTERCEPT_FUNC(strcat);  // NOLINT
-  ASAN_INTERCEPT_FUNC(strchr);
   ASAN_INTERCEPT_FUNC(strcpy);  // NOLINT
-  ASAN_INTERCEPT_FUNC(strlen);
   ASAN_INTERCEPT_FUNC(wcslen);
   ASAN_INTERCEPT_FUNC(strncat);
   ASAN_INTERCEPT_FUNC(strncpy);
-#if ASAN_INTERCEPT_STRDUP
   ASAN_INTERCEPT_FUNC(strdup);
-#endif
-#if ASAN_INTERCEPT_STRNLEN
-  ASAN_INTERCEPT_FUNC(strnlen);
+#if ASAN_INTERCEPT___STRDUP
+  ASAN_INTERCEPT_FUNC(__strdup);
 #endif
 #if ASAN_INTERCEPT_INDEX && ASAN_USE_ALIAS_ATTRIBUTE_FOR_INDEX
   ASAN_INTERCEPT_FUNC(index);
diff --git a/lib/asan/asan_interceptors.h b/lib/asan/asan_interceptors.h
index 279c5f3..d747c31 100644
--- a/lib/asan/asan_interceptors.h
+++ b/lib/asan/asan_interceptors.h
@@ -23,14 +23,12 @@
 #if !SANITIZER_WINDOWS
 # define ASAN_INTERCEPT_ATOLL_AND_STRTOLL 1
 # define ASAN_INTERCEPT__LONGJMP 1
-# define ASAN_INTERCEPT_STRDUP 1
 # define ASAN_INTERCEPT_INDEX 1
 # define ASAN_INTERCEPT_PTHREAD_CREATE 1
 # define ASAN_INTERCEPT_FORK 1
 #else
 # define ASAN_INTERCEPT_ATOLL_AND_STRTOLL 0
 # define ASAN_INTERCEPT__LONGJMP 0
-# define ASAN_INTERCEPT_STRDUP 0
 # define ASAN_INTERCEPT_INDEX 0
 # define ASAN_INTERCEPT_PTHREAD_CREATE 0
 # define ASAN_INTERCEPT_FORK 0
@@ -42,12 +40,6 @@
 # define ASAN_USE_ALIAS_ATTRIBUTE_FOR_INDEX 0
 #endif
 
-#if !SANITIZER_MAC
-# define ASAN_INTERCEPT_STRNLEN 1
-#else
-# define ASAN_INTERCEPT_STRNLEN 0
-#endif
-
 #if SANITIZER_LINUX && !SANITIZER_ANDROID
 # define ASAN_INTERCEPT_SWAPCONTEXT 1
 #else
@@ -80,6 +72,12 @@
 # define ASAN_INTERCEPT___CXA_ATEXIT 0
 #endif
 
+#if SANITIZER_LINUX && !SANITIZER_ANDROID
+# define ASAN_INTERCEPT___STRDUP 1
+#else
+# define ASAN_INTERCEPT___STRDUP 0
+#endif
+
 DECLARE_REAL(int, memcmp, const void *a1, const void *a2, uptr size)
 DECLARE_REAL(void*, memcpy, void *to, const void *from, uptr size)
 DECLARE_REAL(void*, memset, void *block, int c, uptr size)
diff --git a/lib/asan/asan_interface_internal.h b/lib/asan/asan_interface_internal.h
index 9efddcb..3cf3413 100644
--- a/lib/asan/asan_interface_internal.h
+++ b/lib/asan/asan_interface_internal.h
@@ -54,8 +54,17 @@
     uptr has_dynamic_init;   // Non-zero if the global has dynamic initializer.
     __asan_global_source_location *location;  // Source location of a global,
                                               // or NULL if it is unknown.
+    uptr odr_indicator;      // The address of the ODR indicator symbol.
   };
 
+  // These functions can be called on some platforms to find globals in the same
+  // loaded image as `flag' and apply __asan_(un)register_globals to them,
+  // filtering out redundant calls.
+  SANITIZER_INTERFACE_ATTRIBUTE
+  void __asan_register_image_globals(uptr *flag);
+  SANITIZER_INTERFACE_ATTRIBUTE
+  void __asan_unregister_image_globals(uptr *flag);
+
   // These two functions should be called by the instrumented code.
   // 'globals' is an array of structures describing 'n' globals.
   SANITIZER_INTERFACE_ATTRIBUTE
diff --git a/lib/asan/asan_internal.h b/lib/asan/asan_internal.h
index 0ef0d0e..2014237 100644
--- a/lib/asan/asan_internal.h
+++ b/lib/asan/asan_internal.h
@@ -36,9 +36,9 @@
 // If set, values like allocator chunk size, as well as defaults for some flags
 // will be changed towards less memory overhead.
 #ifndef ASAN_LOW_MEMORY
-#if SANITIZER_WORDSIZE == 32
+# if SANITIZER_IOS || (SANITIZER_WORDSIZE == 32)
 #  define ASAN_LOW_MEMORY 1
-#else
+# else
 #  define ASAN_LOW_MEMORY 0
 # endif
 #endif
@@ -62,6 +62,9 @@
 
 void AsanInitFromRtl();
 
+// asan_win.cc
+void InitializePlatformExceptionHandlers();
+
 // asan_rtl.cc
 void NORETURN ShowStatsAndAbort();
 
@@ -73,6 +76,13 @@
 void AsanCheckDynamicRTPrereqs();
 void AsanCheckIncompatibleRT();
 
+// Support function for __asan_(un)register_image_globals. Searches for the
+// loaded image containing `needle' and then enumerates all global metadata
+// structures declared in that image, applying `op' (e.g.,
+// __asan_(un)register_globals) to them.
+typedef void (*globals_op_fptr)(__asan_global *, uptr);
+void AsanApplyToGlobals(globals_op_fptr op, const void *needle);
+
 void AsanOnDeadlySignal(int, void *siginfo, void *context);
 
 void ReadContextStack(void *context, uptr *stack, uptr *ssize);
@@ -95,16 +105,24 @@
 bool PlatformHasDifferentMemcpyAndMemmove();
 # define PLATFORM_HAS_DIFFERENT_MEMCPY_AND_MEMMOVE \
     (PlatformHasDifferentMemcpyAndMemmove())
+#elif SANITIZER_WINDOWS64
+# define PLATFORM_HAS_DIFFERENT_MEMCPY_AND_MEMMOVE false
 #else
 # define PLATFORM_HAS_DIFFERENT_MEMCPY_AND_MEMMOVE true
 #endif  // SANITIZER_MAC
 
 // Add convenient macro for interface functions that may be represented as
 // weak hooks.
-#define ASAN_MALLOC_HOOK(ptr, size) \
-  if (&__sanitizer_malloc_hook) __sanitizer_malloc_hook(ptr, size)
-#define ASAN_FREE_HOOK(ptr) \
-  if (&__sanitizer_free_hook) __sanitizer_free_hook(ptr)
+#define ASAN_MALLOC_HOOK(ptr, size)                                   \
+  do {                                                                \
+    if (&__sanitizer_malloc_hook) __sanitizer_malloc_hook(ptr, size); \
+    RunMallocHooks(ptr, size);                                        \
+  } while (false)
+#define ASAN_FREE_HOOK(ptr)                                 \
+  do {                                                      \
+    if (&__sanitizer_free_hook) __sanitizer_free_hook(ptr); \
+    RunFreeHooks(ptr);                                      \
+  } while (false)
 #define ASAN_ON_ERROR() \
   if (&__asan_on_error) __asan_on_error()
 
@@ -112,7 +130,6 @@
 // Used to avoid infinite recursion in __asan_init().
 extern bool asan_init_is_running;
 extern void (*death_callback)(void);
-
 // These magic values are written to shadow for better error reporting.
 const int kAsanHeapLeftRedzoneMagic = 0xfa;
 const int kAsanHeapRightRedzoneMagic = 0xfb;
diff --git a/lib/asan/asan_linux.cc b/lib/asan/asan_linux.cc
index e26b400..c051573 100644
--- a/lib/asan/asan_linux.cc
+++ b/lib/asan/asan_linux.cc
@@ -69,12 +69,17 @@
 namespace __asan {
 
 void InitializePlatformInterceptors() {}
+void InitializePlatformExceptionHandlers() {}
 
 void *AsanDoesNotSupportStaticLinkage() {
   // This will fail to link with -static.
   return &_DYNAMIC;  // defined in link.h
 }
 
+void AsanApplyToGlobals(globals_op_fptr op, const void *needle) {
+  UNIMPLEMENTED();
+}
+
 #if SANITIZER_ANDROID
 // FIXME: should we do anything for Android?
 void AsanCheckDynamicRTPrereqs() {}
diff --git a/lib/asan/asan_mac.cc b/lib/asan/asan_mac.cc
index f00d98f..525864f 100644
--- a/lib/asan/asan_mac.cc
+++ b/lib/asan/asan_mac.cc
@@ -24,9 +24,11 @@
 #include "sanitizer_common/sanitizer_libc.h"
 #include "sanitizer_common/sanitizer_mac.h"
 
+#include <dlfcn.h>
 #include <fcntl.h>
 #include <libkern/OSAtomic.h>
 #include <mach-o/dyld.h>
+#include <mach-o/getsect.h>
 #include <mach-o/loader.h>
 #include <pthread.h>
 #include <stdlib.h>  // for free()
@@ -36,9 +38,16 @@
 #include <sys/ucontext.h>
 #include <unistd.h>
 
+// from <crt_externs.h>, but we don't have that file on iOS
+extern "C" {
+  extern char ***_NSGetArgv(void);
+  extern char ***_NSGetEnviron(void);
+}
+
 namespace __asan {
 
 void InitializePlatformInterceptors() {}
+void InitializePlatformExceptionHandlers() {}
 
 bool PlatformHasDifferentMemcpyAndMemmove() {
   // On OS X 10.7 memcpy() and memmove() are both resolved
@@ -60,6 +69,30 @@
 // No-op. Mac does not support static linkage anyway.
 void AsanCheckIncompatibleRT() {}
 
+void AsanApplyToGlobals(globals_op_fptr op, const void *needle) {
+  // Find the Mach-O header for the image containing the needle
+  Dl_info info;
+  int err = dladdr(needle, &info);
+  if (err == 0) return;
+
+#if __LP64__
+  const struct mach_header_64 *mh = (struct mach_header_64 *)info.dli_fbase;
+#else
+  const struct mach_header *mh = (struct mach_header *)info.dli_fbase;
+#endif
+
+  // Look up the __asan_globals section in that image and register its globals
+  unsigned long size = 0;
+  __asan_global *globals = (__asan_global *)getsectiondata(
+      mh,
+      "__DATA", "__asan_globals",
+      &size);
+
+  if (!globals) return;
+  if (size % sizeof(__asan_global) != 0) return;
+  op(globals, size / sizeof(__asan_global));
+}
+
 void ReadContextStack(void *context, uptr *stack, uptr *ssize) {
   UNIMPLEMENTED();
 }
diff --git a/lib/asan/asan_malloc_linux.cc b/lib/asan/asan_malloc_linux.cc
index d5089f9..162abd2 100644
--- a/lib/asan/asan_malloc_linux.cc
+++ b/lib/asan/asan_malloc_linux.cc
@@ -26,52 +26,58 @@
 // ---------------------- Replacement functions ---------------- {{{1
 using namespace __asan;  // NOLINT
 
-static const uptr kCallocPoolSize = 1024;
-static uptr calloc_memory_for_dlsym[kCallocPoolSize];
+static uptr allocated_for_dlsym;
+static const uptr kDlsymAllocPoolSize = 1024;
+static uptr alloc_memory_for_dlsym[kDlsymAllocPoolSize];
 
-static bool IsInCallocPool(const void *ptr) {
-  sptr off = (sptr)ptr - (sptr)calloc_memory_for_dlsym;
-  return 0 <= off && off < (sptr)kCallocPoolSize;
+static bool IsInDlsymAllocPool(const void *ptr) {
+  uptr off = (uptr)ptr - (uptr)alloc_memory_for_dlsym;
+  return off < sizeof(alloc_memory_for_dlsym);
+}
+
+static void *AllocateFromLocalPool(uptr size_in_bytes) {
+  uptr size_in_words = RoundUpTo(size_in_bytes, kWordSize) / kWordSize;
+  void *mem = (void*)&alloc_memory_for_dlsym[allocated_for_dlsym];
+  allocated_for_dlsym += size_in_words;
+  CHECK_LT(allocated_for_dlsym, kDlsymAllocPoolSize);
+  return mem;
 }
 
 INTERCEPTOR(void, free, void *ptr) {
   GET_STACK_TRACE_FREE;
-  if (UNLIKELY(IsInCallocPool(ptr)))
+  if (UNLIKELY(IsInDlsymAllocPool(ptr)))
     return;
   asan_free(ptr, &stack, FROM_MALLOC);
 }
 
 INTERCEPTOR(void, cfree, void *ptr) {
   GET_STACK_TRACE_FREE;
-  if (UNLIKELY(IsInCallocPool(ptr)))
+  if (UNLIKELY(IsInDlsymAllocPool(ptr)))
     return;
   asan_free(ptr, &stack, FROM_MALLOC);
 }
 
 INTERCEPTOR(void*, malloc, uptr size) {
+  if (UNLIKELY(!asan_inited))
+    // Hack: dlsym calls malloc before REAL(malloc) is retrieved from dlsym.
+    return AllocateFromLocalPool(size);
   GET_STACK_TRACE_MALLOC;
   return asan_malloc(size, &stack);
 }
 
 INTERCEPTOR(void*, calloc, uptr nmemb, uptr size) {
-  if (UNLIKELY(!asan_inited)) {
+  if (UNLIKELY(!asan_inited))
     // Hack: dlsym calls calloc before REAL(calloc) is retrieved from dlsym.
-    static uptr allocated;
-    uptr size_in_words = ((nmemb * size) + kWordSize - 1) / kWordSize;
-    void *mem = (void*)&calloc_memory_for_dlsym[allocated];
-    allocated += size_in_words;
-    CHECK(allocated < kCallocPoolSize);
-    return mem;
-  }
+    return AllocateFromLocalPool(nmemb * size);
   GET_STACK_TRACE_MALLOC;
   return asan_calloc(nmemb, size, &stack);
 }
 
 INTERCEPTOR(void*, realloc, void *ptr, uptr size) {
   GET_STACK_TRACE_MALLOC;
-  if (UNLIKELY(IsInCallocPool(ptr))) {
-    uptr offset = (uptr)ptr - (uptr)calloc_memory_for_dlsym;
-    uptr copy_size = Min(size, kCallocPoolSize - offset);
+  if (UNLIKELY(IsInDlsymAllocPool(ptr))) {
+    uptr offset = (uptr)ptr - (uptr)alloc_memory_for_dlsym;
+    uptr copy_size = Min(size, kDlsymAllocPoolSize - offset);
     void *new_ptr = asan_malloc(size, &stack);
     internal_memcpy(new_ptr, ptr, copy_size);
     return new_ptr;
@@ -92,7 +98,7 @@
 INTERCEPTOR(void*, __libc_memalign, uptr boundary, uptr size) {
   GET_STACK_TRACE_MALLOC;
   void *res = asan_memalign(boundary, size, &stack, FROM_MALLOC);
-  DTLS_on_libc_memalign(res, size * boundary);
+  DTLS_on_libc_memalign(res, size);
   return res;
 }
 
diff --git a/lib/asan/asan_malloc_win.cc b/lib/asan/asan_malloc_win.cc
index c99e312..4a233df 100644
--- a/lib/asan/asan_malloc_win.cc
+++ b/lib/asan/asan_malloc_win.cc
@@ -14,6 +14,8 @@
 
 #include "sanitizer_common/sanitizer_platform.h"
 #if SANITIZER_WINDOWS
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
 
 #include "asan_allocator.h"
 #include "asan_interceptors.h"
@@ -49,6 +51,11 @@
 }
 
 ALLOCATION_FUNCTION_ATTRIBUTE
+void _free_base(void *ptr) {
+  free(ptr);
+}
+
+ALLOCATION_FUNCTION_ATTRIBUTE
 void cfree(void *ptr) {
   CHECK(!"cfree() should not be used on Windows");
 }
@@ -60,6 +67,11 @@
 }
 
 ALLOCATION_FUNCTION_ATTRIBUTE
+void *_malloc_base(size_t size) {
+  return malloc(size);
+}
+
+ALLOCATION_FUNCTION_ATTRIBUTE
 void *_malloc_dbg(size_t size, int, const char *, int) {
   return malloc(size);
 }
@@ -71,6 +83,11 @@
 }
 
 ALLOCATION_FUNCTION_ATTRIBUTE
+void *_calloc_base(size_t nmemb, size_t size) {
+  return calloc(nmemb, size);
+}
+
+ALLOCATION_FUNCTION_ATTRIBUTE
 void *_calloc_dbg(size_t nmemb, size_t size, int, const char *, int) {
   return calloc(nmemb, size);
 }
@@ -93,6 +110,11 @@
 }
 
 ALLOCATION_FUNCTION_ATTRIBUTE
+void *_realloc_base(void *ptr, size_t size) {
+  return realloc(ptr, size);
+}
+
+ALLOCATION_FUNCTION_ATTRIBUTE
 void *_recalloc(void *p, size_t n, size_t elem_size) {
   if (!p)
     return calloc(n, elem_size);
@@ -103,7 +125,7 @@
 }
 
 ALLOCATION_FUNCTION_ATTRIBUTE
-size_t _msize(void *ptr) {
+size_t _msize(const void *ptr) {
   GET_CURRENT_PC_BP_SP;
   (void)sp;
   return asan_malloc_usable_size(ptr, pc, bp);
@@ -139,38 +161,89 @@
 }
 }  // extern "C"
 
+INTERCEPTOR_WINAPI(LPVOID, HeapAlloc, HANDLE hHeap, DWORD dwFlags,
+                   SIZE_T dwBytes) {
+  GET_STACK_TRACE_MALLOC;
+  void *p = asan_malloc(dwBytes, &stack);
+  // Reading MSDN suggests that the *entire* usable allocation is zeroed out.
+  // Otherwise it is difficult to HeapReAlloc with HEAP_ZERO_MEMORY.
+  // https://blogs.msdn.microsoft.com/oldnewthing/20120316-00/?p=8083
+  if (dwFlags == HEAP_ZERO_MEMORY)
+    internal_memset(p, 0, asan_mz_size(p));
+  else
+    CHECK(dwFlags == 0 && "unsupported heap flags");
+  return p;
+}
+
+INTERCEPTOR_WINAPI(BOOL, HeapFree, HANDLE hHeap, DWORD dwFlags, LPVOID lpMem) {
+  CHECK(dwFlags == 0 && "unsupported heap flags");
+  GET_STACK_TRACE_FREE;
+  asan_free(lpMem, &stack, FROM_MALLOC);
+  return true;
+}
+
+INTERCEPTOR_WINAPI(LPVOID, HeapReAlloc, HANDLE hHeap, DWORD dwFlags,
+                   LPVOID lpMem, SIZE_T dwBytes) {
+  GET_STACK_TRACE_MALLOC;
+  // Realloc should never reallocate in place.
+  if (dwFlags & HEAP_REALLOC_IN_PLACE_ONLY)
+    return nullptr;
+  CHECK(dwFlags == 0 && "unsupported heap flags");
+  return asan_realloc(lpMem, dwBytes, &stack);
+}
+
+INTERCEPTOR_WINAPI(SIZE_T, HeapSize, HANDLE hHeap, DWORD dwFlags,
+                   LPCVOID lpMem) {
+  CHECK(dwFlags == 0 && "unsupported heap flags");
+  GET_CURRENT_PC_BP_SP;
+  (void)sp;
+  return asan_malloc_usable_size(lpMem, pc, bp);
+}
+
 namespace __asan {
+
+static void TryToOverrideFunction(const char *fname, uptr new_func) {
+  // Failure here is not fatal. The CRT may not be present, and different CRT
+  // versions use different symbols.
+  if (!__interception::OverrideFunction(fname, new_func))
+    VPrintf(2, "Failed to override function %s\n", fname);
+}
+
 void ReplaceSystemMalloc() {
 #if defined(ASAN_DYNAMIC)
-  // We don't check the result because CRT might not be used in the process.
-  __interception::OverrideFunction("free", (uptr)free);
-  __interception::OverrideFunction("malloc", (uptr)malloc);
-  __interception::OverrideFunction("_malloc_crt", (uptr)malloc);
-  __interception::OverrideFunction("calloc", (uptr)calloc);
-  __interception::OverrideFunction("_calloc_crt", (uptr)calloc);
-  __interception::OverrideFunction("realloc", (uptr)realloc);
-  __interception::OverrideFunction("_realloc_crt", (uptr)realloc);
-  __interception::OverrideFunction("_recalloc", (uptr)_recalloc);
-  __interception::OverrideFunction("_recalloc_crt", (uptr)_recalloc);
-  __interception::OverrideFunction("_msize", (uptr)_msize);
-  __interception::OverrideFunction("_expand", (uptr)_expand);
+  TryToOverrideFunction("free", (uptr)free);
+  TryToOverrideFunction("_free_base", (uptr)free);
+  TryToOverrideFunction("malloc", (uptr)malloc);
+  TryToOverrideFunction("_malloc_base", (uptr)malloc);
+  TryToOverrideFunction("_malloc_crt", (uptr)malloc);
+  TryToOverrideFunction("calloc", (uptr)calloc);
+  TryToOverrideFunction("_calloc_base", (uptr)calloc);
+  TryToOverrideFunction("_calloc_crt", (uptr)calloc);
+  TryToOverrideFunction("realloc", (uptr)realloc);
+  TryToOverrideFunction("_realloc_base", (uptr)realloc);
+  TryToOverrideFunction("_realloc_crt", (uptr)realloc);
+  TryToOverrideFunction("_recalloc", (uptr)_recalloc);
+  TryToOverrideFunction("_recalloc_crt", (uptr)_recalloc);
+  TryToOverrideFunction("_msize", (uptr)_msize);
+  TryToOverrideFunction("_expand", (uptr)_expand);
+  TryToOverrideFunction("_expand_base", (uptr)_expand);
 
-  // Override different versions of 'operator new' and 'operator delete'.
-  // No need to override the nothrow versions as they just wrap the throw
-  // versions.
-  // FIXME: Unfortunately, MSVC miscompiles the statements that take the
-  // addresses of the array versions of these operators,
-  // see https://connect.microsoft.com/VisualStudio/feedbackdetail/view/946992
-  // We might want to try to work around this by [inline] assembly or compiling
-  // parts of the RTL with Clang.
-  void *(*op_new)(size_t sz) = operator new;
-  void (*op_delete)(void *p) = operator delete;
-  void *(*op_array_new)(size_t sz) = operator new[];
-  void (*op_array_delete)(void *p) = operator delete[];
-  __interception::OverrideFunction("??2@YAPAXI@Z", (uptr)op_new);
-  __interception::OverrideFunction("??3@YAXPAX@Z", (uptr)op_delete);
-  __interception::OverrideFunction("??_U@YAPAXI@Z", (uptr)op_array_new);
-  __interception::OverrideFunction("??_V@YAXPAX@Z", (uptr)op_array_delete);
+  // Recent versions of ucrtbase.dll appear to be built with PGO and LTCG, which
+  // enable cross-module inlining. This means our _malloc_base hook won't catch
+  // all CRT allocations. This code here patches the import table of
+  // ucrtbase.dll so that all attempts to use the lower-level win32 heap
+  // allocation API will be directed to ASan's heap. We don't currently
+  // intercept all calls to HeapAlloc. If we did, we would have to check on
+  // HeapFree whether the pointer came from ASan or from the system.
+#define INTERCEPT_UCRT_FUNCTION(func)                                         \
+  if (!INTERCEPT_FUNCTION_DLLIMPORT("ucrtbase.dll",                           \
+                                    "api-ms-win-core-heap-l1-1-0.dll", func)) \
+    VPrintf(2, "Failed to intercept ucrtbase.dll import %s\n", #func);
+  INTERCEPT_UCRT_FUNCTION(HeapAlloc);
+  INTERCEPT_UCRT_FUNCTION(HeapFree);
+  INTERCEPT_UCRT_FUNCTION(HeapReAlloc);
+  INTERCEPT_UCRT_FUNCTION(HeapSize);
+#undef INTERCEPT_UCRT_FUNCTION
 #endif
 }
 }  // namespace __asan
diff --git a/lib/asan/asan_mapping.h b/lib/asan/asan_mapping.h
index 8fe347c..52c4f67 100644
--- a/lib/asan/asan_mapping.h
+++ b/lib/asan/asan_mapping.h
@@ -87,6 +87,20 @@
 // || `[0x08000000000, 0x08fffffffff]` || lowshadow  ||
 // || `[0x00000000000, 0x07fffffffff]` || lowmem     ||
 //
+// Default Linux/S390 mapping:
+// || `[0x30000000, 0x7fffffff]` || HighMem    ||
+// || `[0x26000000, 0x2fffffff]` || HighShadow ||
+// || `[0x24000000, 0x25ffffff]` || ShadowGap  ||
+// || `[0x20000000, 0x23ffffff]` || LowShadow  ||
+// || `[0x00000000, 0x1fffffff]` || LowMem     ||
+//
+// Default Linux/SystemZ mapping:
+// || `[0x14000000000000, 0x1fffffffffffff]` || HighMem    ||
+// || `[0x12800000000000, 0x13ffffffffffff]` || HighShadow ||
+// || `[0x12000000000000, 0x127fffffffffff]` || ShadowGap  ||
+// || `[0x10000000000000, 0x11ffffffffffff]` || LowShadow  ||
+// || `[0x00000000000000, 0x0fffffffffffff]` || LowMem     ||
+//
 // Shadow mapping on FreeBSD/x86-64 with SHADOW_OFFSET == 0x400000000000:
 // || `[0x500000000000, 0x7fffffffffff]` || HighMem    ||
 // || `[0x4a0000000000, 0x4fffffffffff]` || HighShadow ||
@@ -115,16 +129,18 @@
 static const u64 kDefaultShadowOffset64 = 1ULL << 44;
 static const u64 kDefaultShort64bitShadowOffset = 0x7FFF8000;  // < 2G.
 static const u64 kIosShadowOffset32 = 1ULL << 30;  // 0x40000000
-static const u64 kIosShadowOffset64 = 0x130000000;
+static const u64 kIosShadowOffset64 = 0x120200000;
 static const u64 kIosSimShadowOffset32 = 1ULL << 30;
 static const u64 kIosSimShadowOffset64 = kDefaultShadowOffset64;
 static const u64 kAArch64_ShadowOffset64 = 1ULL << 36;
 static const u64 kMIPS32_ShadowOffset32 = 0x0aaa0000;
 static const u64 kMIPS64_ShadowOffset64 = 1ULL << 37;
 static const u64 kPPC64_ShadowOffset64 = 1ULL << 41;
+static const u64 kSystemZ_ShadowOffset64 = 1ULL << 52;
 static const u64 kFreeBSD_ShadowOffset32 = 1ULL << 30;  // 0x40000000
 static const u64 kFreeBSD_ShadowOffset64 = 1ULL << 46;  // 0x400000000000
 static const u64 kWindowsShadowOffset32 = 3ULL << 28;  // 0x30000000
+static const u64 kWindowsShadowOffset64 = 1ULL << 45;  // 32TB
 
 #define SHADOW_SCALE kDefaultShadowScale
 
@@ -138,28 +154,36 @@
 #    define SHADOW_OFFSET kFreeBSD_ShadowOffset32
 #  elif SANITIZER_WINDOWS
 #    define SHADOW_OFFSET kWindowsShadowOffset32
-#  elif SANITIZER_IOSSIM
-#    define SHADOW_OFFSET kIosSimShadowOffset32
 #  elif SANITIZER_IOS
-#    define SHADOW_OFFSET kIosShadowOffset32
+#    if SANITIZER_IOSSIM
+#      define SHADOW_OFFSET kIosSimShadowOffset32
+#    else
+#      define SHADOW_OFFSET kIosShadowOffset32
+#    endif
 #  else
 #    define SHADOW_OFFSET kDefaultShadowOffset32
 #  endif
 #else
-#  if defined(__aarch64__)
+#  if SANITIZER_IOS
+#    if SANITIZER_IOSSIM
+#      define SHADOW_OFFSET kIosSimShadowOffset64
+#    else
+#      define SHADOW_OFFSET kIosShadowOffset64
+#    endif
+#  elif defined(__aarch64__)
 #    define SHADOW_OFFSET kAArch64_ShadowOffset64
 #  elif defined(__powerpc64__)
 #    define SHADOW_OFFSET kPPC64_ShadowOffset64
+#  elif defined(__s390x__)
+#    define SHADOW_OFFSET kSystemZ_ShadowOffset64
 #  elif SANITIZER_FREEBSD
 #    define SHADOW_OFFSET kFreeBSD_ShadowOffset64
 #  elif SANITIZER_MAC
 #   define SHADOW_OFFSET kDefaultShadowOffset64
 #  elif defined(__mips64)
 #   define SHADOW_OFFSET kMIPS64_ShadowOffset64
-#  elif SANITIZER_IOSSIM
-#    define SHADOW_OFFSET kIosSimShadowOffset64
-#  elif SANITIZER_IOS
-#    define SHADOW_OFFSET kIosShadowOffset64
+#  elif SANITIZER_WINDOWS64
+#   define SHADOW_OFFSET kWindowsShadowOffset64
 #  else
 #   define SHADOW_OFFSET kDefaultShort64bitShadowOffset
 #  endif
diff --git a/lib/asan/asan_memory_profile.cc b/lib/asan/asan_memory_profile.cc
new file mode 100644
index 0000000..ba00516
--- /dev/null
+++ b/lib/asan/asan_memory_profile.cc
@@ -0,0 +1,100 @@
+//===-- asan_memory_profile.cc --------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of AddressSanitizer, an address sanity checker.
+//
+// This file implements __sanitizer_print_memory_profile.
+//===----------------------------------------------------------------------===//
+
+#include "sanitizer_common/sanitizer_common.h"
+#include "sanitizer_common/sanitizer_stackdepot.h"
+#include "sanitizer_common/sanitizer_stacktrace.h"
+#include "sanitizer_common/sanitizer_stoptheworld.h"
+#include "lsan/lsan_common.h"
+#include "asan/asan_allocator.h"
+
+#if CAN_SANITIZE_LEAKS
+
+namespace __asan {
+
+struct AllocationSite {
+  u32 id;
+  uptr total_size;
+  uptr count;
+};
+
+class HeapProfile {
+ public:
+  HeapProfile() : allocations_(1024) {}
+  void Insert(u32 id, uptr size) {
+    total_allocated_ += size;
+    total_count_++;
+    // Linear lookup will be good enough for most cases (although not all).
+    for (uptr i = 0; i < allocations_.size(); i++) {
+      if (allocations_[i].id == id) {
+        allocations_[i].total_size += size;
+        allocations_[i].count++;
+        return;
+      }
+    }
+    allocations_.push_back({id, size, 1});
+  }
+
+  void Print(uptr top_percent) {
+    InternalSort(&allocations_, allocations_.size(),
+                 [](const AllocationSite &a, const AllocationSite &b) {
+                   return a.total_size > b.total_size;
+                 });
+    CHECK(total_allocated_);
+    uptr total_shown = 0;
+    Printf("Live Heap Allocations: %zd bytes from %zd allocations; "
+           "showing top %zd%%\n", total_allocated_, total_count_, top_percent);
+    for (uptr i = 0; i < allocations_.size(); i++) {
+      auto &a = allocations_[i];
+      Printf("%zd byte(s) (%zd%%) in %zd allocation(s)\n", a.total_size,
+             a.total_size * 100 / total_allocated_, a.count);
+      StackDepotGet(a.id).Print();
+      total_shown += a.total_size;
+      if (total_shown * 100 / total_allocated_ > top_percent)
+        break;
+    }
+  }
+
+ private:
+  uptr total_allocated_ = 0;
+  uptr total_count_ = 0;
+  InternalMmapVector<AllocationSite> allocations_;
+};
+
+static void ChunkCallback(uptr chunk, void *arg) {
+  HeapProfile *hp = reinterpret_cast<HeapProfile*>(arg);
+  AsanChunkView cv = FindHeapChunkByAddress(chunk);
+  if (!cv.IsAllocated()) return;
+  u32 id = cv.GetAllocStackId();
+  if (!id) return;
+  hp->Insert(id, cv.UsedSize());
+}
+
+static void MemoryProfileCB(const SuspendedThreadsList &suspended_threads_list,
+                            void *argument) {
+  HeapProfile hp;
+  __lsan::ForEachChunk(ChunkCallback, &hp);
+  hp.Print(reinterpret_cast<uptr>(argument));
+}
+
+}  // namespace __asan
+
+extern "C" {
+SANITIZER_INTERFACE_ATTRIBUTE
+void __sanitizer_print_memory_profile(uptr top_percent) {
+  __sanitizer::StopTheWorld(__asan::MemoryProfileCB, (void*)top_percent);
+}
+}  // extern "C"
+
+#endif  // CAN_SANITIZE_LEAKS
diff --git a/lib/asan/asan_new_delete.cc b/lib/asan/asan_new_delete.cc
index b5ba13e..fef6604 100644
--- a/lib/asan/asan_new_delete.cc
+++ b/lib/asan/asan_new_delete.cc
@@ -20,9 +20,25 @@
 
 #include <stddef.h>
 
-// C++ operators can't have visibility attributes on Windows.
+// C++ operators can't have dllexport attributes on Windows. We export them
+// anyway by passing extra -export flags to the linker, which is exactly that
+// dllexport would normally do. We need to export them in order to make the
+// VS2015 dynamic CRT (MD) work.
 #if SANITIZER_WINDOWS
 # define CXX_OPERATOR_ATTRIBUTE
+# ifdef _WIN64
+#  pragma comment(linker, "/export:??2@YAPEAX_K@Z")   // operator new
+#  pragma comment(linker, "/export:??3@YAXPEAX@Z")    // operator delete
+#  pragma comment(linker, "/export:??3@YAXPEAX_K@Z")  // sized operator delete
+#  pragma comment(linker, "/export:??_U@YAPEAX_K@Z")  // operator new[]
+#  pragma comment(linker, "/export:??_V@YAXPEAX@Z")   // operator delete[]
+# else
+#  pragma comment(linker, "/export:??2@YAPAXI@Z")   // operator new
+#  pragma comment(linker, "/export:??3@YAXPAX@Z")   // operator delete
+#  pragma comment(linker, "/export:??3@YAXPAXI@Z")  // sized operator delete
+#  pragma comment(linker, "/export:??_U@YAPAXI@Z")  // operator new[]
+#  pragma comment(linker, "/export:??_V@YAXPAX@Z")  // operator delete[]
+# endif
 #else
 # define CXX_OPERATOR_ATTRIBUTE INTERCEPTOR_ATTRIBUTE
 #endif
diff --git a/lib/asan/asan_poisoning.cc b/lib/asan/asan_poisoning.cc
index f77ab87..50877ae 100644
--- a/lib/asan/asan_poisoning.cc
+++ b/lib/asan/asan_poisoning.cc
@@ -343,7 +343,7 @@
                                                  &stack);
   }
   CHECK_LE(end - beg,
-           FIRST_32_SECOND_64(1UL << 30, 1UL << 34)); // Sanity check.
+           FIRST_32_SECOND_64(1UL << 30, 1ULL << 34)); // Sanity check.
 
   uptr a = RoundDownTo(Min(old_mid, new_mid), granularity);
   uptr c = RoundUpTo(Max(old_mid, new_mid), granularity);
diff --git a/lib/asan/asan_posix.cc b/lib/asan/asan_posix.cc
index 9e01bcd..84a29ec 100644
--- a/lib/asan/asan_posix.cc
+++ b/lib/asan/asan_posix.cc
@@ -36,14 +36,23 @@
 void AsanOnDeadlySignal(int signo, void *siginfo, void *context) {
   ScopedDeadlySignal signal_scope(GetCurrentThread());
   int code = (int)((siginfo_t*)siginfo)->si_code;
-  // Write the first message using the bullet-proof write.
-  if (18 != internal_write(2, "ASAN:DEADLYSIGNAL\n", 18)) Die();
+  // Write the first message using fd=2, just in case.
+  // It may actually fail to write in case stderr is closed.
+  internal_write(2, "ASAN:DEADLYSIGNAL\n", 18);
   SignalContext sig = SignalContext::Create(siginfo, context);
 
   // Access at a reasonable offset above SP, or slightly below it (to account
   // for x86_64 or PowerPC redzone, ARM push of multiple registers, etc) is
   // probably a stack overflow.
+#ifdef __s390__
+  // On s390, the fault address in siginfo points to start of the page, not
+  // to the precise word that was accessed.  Mask off the low bits of sp to
+  // take it into account.
+  bool IsStackAccess = sig.addr >= (sig.sp & ~0xFFF) &&
+                       sig.addr < sig.sp + 0xFFFF;
+#else
   bool IsStackAccess = sig.addr + 512 > sig.sp && sig.addr < sig.sp + 0xFFFF;
+#endif
 
 #if __powerpc__
   // Large stack frames can be allocated with e.g.
diff --git a/lib/asan/asan_report.cc b/lib/asan/asan_report.cc
index 0fb6084..9f2f12d 100644
--- a/lib/asan/asan_report.cc
+++ b/lib/asan/asan_report.cc
@@ -16,6 +16,7 @@
 #include "asan_internal.h"
 #include "asan_mapping.h"
 #include "asan_report.h"
+#include "asan_scariness_score.h"
 #include "asan_stack.h"
 #include "asan_thread.h"
 #include "sanitizer_common/sanitizer_common.h"
@@ -470,7 +471,7 @@
   // previously. That's unfortunate, but I have no better solution,
   // especially given that the alloca may be from entirely different place
   // (e.g. use-after-scope, or different thread's stack).
-#if defined(__powerpc64__) && defined(__BIG_ENDIAN__)
+#if SANITIZER_PPC64V1
   // On PowerPC64 ELFv1, the address of a function actually points to a
   // three-doubleword data structure with the first field containing
   // the address of the function's code.
@@ -687,6 +688,9 @@
     if (flags()->print_stats)
       __asan_print_accumulated_stats();
 
+    if (common_flags()->print_cmdline)
+      PrintCmdline();
+
     // Copy the message buffer so that we could start logging without holding a
     // lock that gets aquired during printing.
     InternalScopedBuffer<char> buffer_copy(kErrorMessageBufferSize);
@@ -696,9 +700,6 @@
                       error_message_buffer, kErrorMessageBufferSize);
     }
 
-    // Remove color sequences since logs cannot print them.
-    RemoveANSIEscapeSequencesFromString(buffer_copy.data());
-
     LogFullErrorReport(buffer_copy.data());
 
     if (error_report_callback) {
@@ -735,10 +736,10 @@
 };
 
 StaticSpinMutex ScopedInErrorReport::lock_;
-u32 ScopedInErrorReport::reporting_thread_tid_;
+u32 ScopedInErrorReport::reporting_thread_tid_ = kInvalidTid;
 
 void ReportStackOverflow(const SignalContext &sig) {
-  ScopedInErrorReport in_report;
+  ScopedInErrorReport in_report(/*report*/ nullptr, /*fatal*/ true);
   Decorator d;
   Printf("%s", d.Warning());
   Report(
@@ -747,13 +748,14 @@
       (void *)sig.addr, (void *)sig.pc, (void *)sig.bp, (void *)sig.sp,
       GetCurrentTidOrInvalid());
   Printf("%s", d.EndWarning());
+  ScarinessScore::PrintSimple(10, "stack-overflow");
   GET_STACK_TRACE_SIGNAL(sig);
   stack.Print();
   ReportErrorSummary("stack-overflow", &stack);
 }
 
 void ReportDeadlySignal(const char *description, const SignalContext &sig) {
-  ScopedInErrorReport in_report(/*report*/nullptr, /*fatal*/true);
+  ScopedInErrorReport in_report(/*report*/ nullptr, /*fatal*/ true);
   Decorator d;
   Printf("%s", d.Warning());
   Report(
@@ -761,10 +763,32 @@
       " (pc %p bp %p sp %p T%d)\n",
       description, (void *)sig.addr, (void *)sig.pc, (void *)sig.bp,
       (void *)sig.sp, GetCurrentTidOrInvalid());
-  if (sig.pc < GetPageSizeCached()) {
-    Report("Hint: pc points to the zero page.\n");
-  }
   Printf("%s", d.EndWarning());
+  ScarinessScore SS;
+  if (sig.pc < GetPageSizeCached())
+    Report("Hint: pc points to the zero page.\n");
+  if (sig.is_memory_access) {
+    const char *access_type =
+        sig.write_flag == SignalContext::WRITE
+            ? "WRITE"
+            : (sig.write_flag == SignalContext::READ ? "READ" : "UNKNOWN");
+    Report("The signal is caused by a %s memory access.\n", access_type);
+    if (sig.addr < GetPageSizeCached()) {
+      Report("Hint: address points to the zero page.\n");
+      SS.Scare(10, "null-deref");
+    } else if (sig.addr == sig.pc) {
+      SS.Scare(60, "wild-jump");
+    } else if (sig.write_flag == SignalContext::WRITE) {
+      SS.Scare(30, "wild-addr-write");
+    } else if (sig.write_flag == SignalContext::READ) {
+      SS.Scare(20, "wild-addr-read");
+    } else {
+      SS.Scare(25, "wild-addr");
+    }
+  } else {
+    SS.Scare(10, "signal");
+  }
+  SS.Print();
   GET_STACK_TRACE_SIGNAL(sig);
   stack.Print();
   MaybeDumpInstructionBytes(sig.pc);
@@ -784,13 +808,14 @@
          ThreadNameWithParenthesis(curr_tid, tname, sizeof(tname)));
   Printf("%s", d.EndWarning());
   CHECK_GT(free_stack->size, 0);
+  ScarinessScore::PrintSimple(42, "double-free");
   GET_STACK_TRACE_FATAL(free_stack->trace[0], free_stack->top_frame_bp);
   stack.Print();
   DescribeHeapAddress(addr, 1);
   ReportErrorSummary("double-free", &stack);
 }
 
-void ReportNewDeleteSizeMismatch(uptr addr, uptr delete_size,
+void ReportNewDeleteSizeMismatch(uptr addr, uptr alloc_size, uptr delete_size,
                                  BufferedStackTrace *free_stack) {
   ScopedInErrorReport in_report;
   Decorator d;
@@ -804,8 +829,9 @@
   Printf("%s  object passed to delete has wrong type:\n", d.EndWarning());
   Printf("  size of the allocated type:   %zd bytes;\n"
          "  size of the deallocated type: %zd bytes.\n",
-         asan_mz_size(reinterpret_cast<void*>(addr)), delete_size);
+         alloc_size, delete_size);
   CHECK_GT(free_stack->size, 0);
+  ScarinessScore::PrintSimple(10, "new-delete-type-mismatch");
   GET_STACK_TRACE_FATAL(free_stack->trace[0], free_stack->top_frame_bp);
   stack.Print();
   DescribeHeapAddress(addr, 1);
@@ -825,6 +851,7 @@
          curr_tid, ThreadNameWithParenthesis(curr_tid, tname, sizeof(tname)));
   Printf("%s", d.EndWarning());
   CHECK_GT(free_stack->size, 0);
+  ScarinessScore::PrintSimple(40, "bad-free");
   GET_STACK_TRACE_FATAL(free_stack->trace[0], free_stack->top_frame_bp);
   stack.Print();
   DescribeHeapAddress(addr, 1);
@@ -846,6 +873,7 @@
         alloc_names[alloc_type], dealloc_names[dealloc_type], addr);
   Printf("%s", d.EndWarning());
   CHECK_GT(free_stack->size, 0);
+  ScarinessScore::PrintSimple(10, "alloc-dealloc-mismatch");
   GET_STACK_TRACE_FATAL(free_stack->trace[0], free_stack->top_frame_bp);
   stack.Print();
   DescribeHeapAddress(addr, 1);
@@ -894,6 +922,7 @@
              "memory ranges [%p,%p) and [%p, %p) overlap\n", \
              bug_type, offset1, offset1 + length1, offset2, offset2 + length2);
   Printf("%s", d.EndWarning());
+  ScarinessScore::PrintSimple(10, bug_type);
   stack->Print();
   DescribeAddress((uptr)offset1, length1, bug_type);
   DescribeAddress((uptr)offset2, length2, bug_type);
@@ -908,6 +937,7 @@
   Printf("%s", d.Warning());
   Report("ERROR: AddressSanitizer: %s: (size=%zd)\n", bug_type, size);
   Printf("%s", d.EndWarning());
+  ScarinessScore::PrintSimple(10, bug_type);
   stack->Print();
   DescribeAddress(offset, size, bug_type);
   ReportErrorSummary(bug_type, stack);
@@ -982,10 +1012,10 @@
   uptr a2 = reinterpret_cast<uptr>(p2);
   AsanChunkView chunk1 = FindHeapChunkByAddress(a1);
   AsanChunkView chunk2 = FindHeapChunkByAddress(a2);
-  bool valid1 = chunk1.IsValid();
-  bool valid2 = chunk2.IsValid();
-  if ((valid1 != valid2) || (valid1 && valid2 && !chunk1.Eq(chunk2))) {
-    GET_CALLER_PC_BP_SP;                                              \
+  bool valid1 = chunk1.IsAllocated();
+  bool valid2 = chunk2.IsAllocated();
+  if (!valid1 || !valid2 || !chunk1.Eq(chunk2)) {
+    GET_CALLER_PC_BP_SP;
     return ReportInvalidPointerPair(pc, bp, sp, a1, a2);
   }
 }
@@ -1016,10 +1046,34 @@
   Die();
 }
 
+static void PrintContainerOverflowHint() {
+  Printf("HINT: if you don't care about these errors you may set "
+         "ASAN_OPTIONS=detect_container_overflow=0.\n"
+         "If you suspect a false positive see also: "
+         "https://github.com/google/sanitizers/wiki/"
+         "AddressSanitizerContainerOverflow.\n");
+}
+
+static bool AdjacentShadowValuesAreFullyPoisoned(u8 *s) {
+  return s[-1] > 127 && s[1] > 127;
+}
+
 void ReportGenericError(uptr pc, uptr bp, uptr sp, uptr addr, bool is_write,
                         uptr access_size, u32 exp, bool fatal) {
   if (!fatal && SuppressErrorReport(pc)) return;
   ENABLE_FRAME_POINTER;
+  ScarinessScore SS;
+
+  if (access_size) {
+    if (access_size <= 9) {
+      char desr[] = "?-byte";
+      desr[0] = '0' + access_size;
+      SS.Scare(access_size + access_size / 2, desr);
+    } else if (access_size >= 10) {
+      SS.Scare(15, "multi-byte");
+    }
+    is_write ? SS.Scare(20, "write") : SS.Scare(1, "read");
+  }
 
   // Optimization experiments.
   // The experiments can be used to evaluate potential optimizations that remove
@@ -1032,6 +1086,7 @@
 
   // Determine the error type.
   const char *bug_descr = "unknown-crash";
+  u8 shadow_val = 0;
   if (AddrIsInMem(addr)) {
     u8 *shadow_addr = (u8*)MemToShadow(addr);
     // If we are accessing 16 bytes, look at the second shadow byte.
@@ -1040,49 +1095,76 @@
     // If we are in the partial right redzone, look at the next shadow byte.
     if (*shadow_addr > 0 && *shadow_addr < 128)
       shadow_addr++;
-    switch (*shadow_addr) {
+    bool far_from_bounds = false;
+    shadow_val = *shadow_addr;
+    int bug_type_score = 0;
+    // For use-after-frees reads are almost as bad as writes.
+    int read_after_free_bonus = 0;
+    switch (shadow_val) {
       case kAsanHeapLeftRedzoneMagic:
       case kAsanHeapRightRedzoneMagic:
       case kAsanArrayCookieMagic:
         bug_descr = "heap-buffer-overflow";
+        bug_type_score = 10;
+        far_from_bounds = AdjacentShadowValuesAreFullyPoisoned(shadow_addr);
         break;
       case kAsanHeapFreeMagic:
         bug_descr = "heap-use-after-free";
+        bug_type_score = 20;
+        if (!is_write) read_after_free_bonus = 18;
         break;
       case kAsanStackLeftRedzoneMagic:
         bug_descr = "stack-buffer-underflow";
+        bug_type_score = 25;
+        far_from_bounds = AdjacentShadowValuesAreFullyPoisoned(shadow_addr);
         break;
       case kAsanInitializationOrderMagic:
         bug_descr = "initialization-order-fiasco";
+        bug_type_score = 1;
         break;
       case kAsanStackMidRedzoneMagic:
       case kAsanStackRightRedzoneMagic:
       case kAsanStackPartialRedzoneMagic:
         bug_descr = "stack-buffer-overflow";
+        bug_type_score = 25;
+        far_from_bounds = AdjacentShadowValuesAreFullyPoisoned(shadow_addr);
         break;
       case kAsanStackAfterReturnMagic:
         bug_descr = "stack-use-after-return";
+        bug_type_score = 30;
+        if (!is_write) read_after_free_bonus = 18;
         break;
       case kAsanUserPoisonedMemoryMagic:
         bug_descr = "use-after-poison";
+        bug_type_score = 20;
         break;
       case kAsanContiguousContainerOOBMagic:
         bug_descr = "container-overflow";
+        bug_type_score = 10;
         break;
       case kAsanStackUseAfterScopeMagic:
         bug_descr = "stack-use-after-scope";
+        bug_type_score = 10;
         break;
       case kAsanGlobalRedzoneMagic:
         bug_descr = "global-buffer-overflow";
+        bug_type_score = 10;
+        far_from_bounds = AdjacentShadowValuesAreFullyPoisoned(shadow_addr);
         break;
       case kAsanIntraObjectRedzone:
         bug_descr = "intra-object-overflow";
+        bug_type_score = 10;
         break;
       case kAsanAllocaLeftMagic:
       case kAsanAllocaRightMagic:
         bug_descr = "dynamic-stack-buffer-overflow";
+        bug_type_score = 25;
+        far_from_bounds = AdjacentShadowValuesAreFullyPoisoned(shadow_addr);
         break;
     }
+    SS.Scare(bug_type_score + read_after_free_bonus, bug_descr);
+    if (far_from_bounds)
+      SS.Scare(10, "far-from-bounds");
   }
 
   ReportData report = { pc, sp, bp, addr, (bool)is_write, access_size,
@@ -1105,10 +1187,13 @@
          ThreadNameWithParenthesis(curr_tid, tname, sizeof(tname)),
          d.EndAccess());
 
+  SS.Print();
   GET_STACK_TRACE_FATAL(pc, bp);
   stack.Print();
 
   DescribeAddress(addr, access_size, bug_descr);
+  if (shadow_val == kAsanContiguousContainerOOBMagic)
+    PrintContainerOverflowHint();
   ReportErrorSummary(bug_descr, &stack);
   PrintShadowMemoryForAddress(addr);
 }
diff --git a/lib/asan/asan_report.h b/lib/asan/asan_report.h
index 559b8ad..03f0965 100644
--- a/lib/asan/asan_report.h
+++ b/lib/asan/asan_report.h
@@ -53,7 +53,7 @@
                         uptr access_size, u32 exp, bool fatal);
 void ReportStackOverflow(const SignalContext &sig);
 void ReportDeadlySignal(const char *description, const SignalContext &sig);
-void ReportNewDeleteSizeMismatch(uptr addr, uptr delete_size,
+void ReportNewDeleteSizeMismatch(uptr addr, uptr alloc_size, uptr delete_size,
                                  BufferedStackTrace *free_stack);
 void ReportDoubleFree(uptr addr, BufferedStackTrace *free_stack);
 void ReportFreeNotMalloced(uptr addr, BufferedStackTrace *free_stack);
diff --git a/lib/asan/asan_rtl.cc b/lib/asan/asan_rtl.cc
index 7b8b5dd..4962b9e 100644
--- a/lib/asan/asan_rtl.cc
+++ b/lib/asan/asan_rtl.cc
@@ -86,8 +86,8 @@
 // Reserve memory range [beg, end].
 // We need to use inclusive range because end+1 may not be representable.
 void ReserveShadowMemoryRange(uptr beg, uptr end, const char *name) {
-  CHECK_EQ((beg % GetPageSizeCached()), 0);
-  CHECK_EQ(((end + 1) % GetPageSizeCached()), 0);
+  CHECK_EQ((beg % GetMmapGranularity()), 0);
+  CHECK_EQ(((end + 1) % GetMmapGranularity()), 0);
   uptr size = end - beg + 1;
   DecreaseTotalMmap(size);  // Don't count the shadow against mmap_limit_mb.
   void *res = MmapFixedNoReserve(beg, size, name);
@@ -320,26 +320,26 @@
   kHighMemEnd = GetMaxVirtualAddress();
   // Increase kHighMemEnd to make sure it's properly
   // aligned together with kHighMemBeg:
-  kHighMemEnd |= SHADOW_GRANULARITY * GetPageSizeCached() - 1;
+  kHighMemEnd |= SHADOW_GRANULARITY * GetMmapGranularity() - 1;
 #endif  // !ASAN_FIXED_MAPPING
-  CHECK_EQ((kHighMemBeg % GetPageSizeCached()), 0);
+  CHECK_EQ((kHighMemBeg % GetMmapGranularity()), 0);
 }
 
 static void ProtectGap(uptr addr, uptr size) {
   if (!flags()->protect_shadow_gap)
     return;
-  void *res = MmapNoAccess(addr, size, "shadow gap");
+  void *res = MmapFixedNoAccess(addr, size, "shadow gap");
   if (addr == (uptr)res)
     return;
   // A few pages at the start of the address space can not be protected.
   // But we really want to protect as much as possible, to prevent this memory
   // being returned as a result of a non-FIXED mmap().
   if (addr == kZeroBaseShadowStart) {
-    uptr step = GetPageSizeCached();
+    uptr step = GetMmapGranularity();
     while (size > step && addr < kZeroBaseMaxShadowStart) {
       addr += step;
       size -= step;
-      void *res = MmapNoAccess(addr, size, "shadow gap");
+      void *res = MmapFixedNoAccess(addr, size, "shadow gap");
       if (addr == (uptr)res)
         return;
     }
@@ -415,10 +415,13 @@
 
   AsanCheckIncompatibleRT();
   AsanCheckDynamicRTPrereqs();
+  AvoidCVE_2016_2143();
 
   SetCanPoisonMemory(flags()->poison_heap);
   SetMallocContextSize(common_flags()->malloc_context_size);
 
+  InitializePlatformExceptionHandlers();
+
   InitializeHighMemEnd();
 
   // Make sure we are not statically linked.
@@ -462,6 +465,12 @@
     kMidMemBeg = kLowMemEnd < 0x3000000000ULL ? 0x3000000000ULL : 0;
     kMidMemEnd = kLowMemEnd < 0x3000000000ULL ? 0x4fffffffffULL : 0;
   }
+#elif SANITIZER_WINDOWS64
+  // Disable the "mid mem" shadow layout.
+  if (!full_shadow_is_available) {
+    kMidMemBeg = 0;
+    kMidMemEnd = 0;
+  }
 #endif
 
   if (Verbosity()) PrintAddressSpaceLayout();
@@ -539,12 +548,12 @@
   force_interface_symbols();  // no-op.
   SanitizerInitializeUnwinder();
 
-#if CAN_SANITIZE_LEAKS
-  __lsan::InitCommonLsan();
-  if (common_flags()->detect_leaks && common_flags()->leak_check_at_exit) {
-    Atexit(__lsan::DoLeakCheck);
+  if (CAN_SANITIZE_LEAKS) {
+    __lsan::InitCommonLsan();
+    if (common_flags()->detect_leaks && common_flags()->leak_check_at_exit) {
+      Atexit(__lsan::DoLeakCheck);
+    }
   }
-#endif  // CAN_SANITIZE_LEAKS
 
 #if CAN_SANITIZE_UB
   __ubsan::InitAsPlugin();
@@ -552,6 +561,15 @@
 
   InitializeSuppressions();
 
+  if (CAN_SANITIZE_LEAKS) {
+    // LateInitialize() calls dlsym, which can allocate an error string buffer
+    // in the TLS.  Let's ignore the allocation to avoid reporting a leak.
+    __lsan::ScopedInterceptorDisabler disabler;
+    Symbolizer::LateInitialize();
+  } else {
+    Symbolizer::LateInitialize();
+  }
+
   VReport(1, "AddressSanitizer Init done\n");
 }
 
diff --git a/lib/asan/asan_scariness_score.h b/lib/asan/asan_scariness_score.h
new file mode 100644
index 0000000..492eb56
--- /dev/null
+++ b/lib/asan/asan_scariness_score.h
@@ -0,0 +1,67 @@
+//===-- asan_scariness_score.h ----------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of AddressSanitizer, an address sanity checker.
+//
+// Compute the level of scariness of the error message.
+// Don't expect any deep science here, just a set of heuristics that suggest
+// that e.g. 1-byte-read-global-buffer-overflow is less scary than
+// 8-byte-write-stack-use-after-return.
+//
+// Every error report has one or more features, such as memory access size,
+// type (read or write), type of accessed memory (e.g. free-d heap, or a global
+// redzone), etc. Every such feature has an int score and a string description.
+// The overall score is the sum of all feature scores and the description
+// is a concatenation of feature descriptions.
+// Examples:
+//  17 (4-byte-read-heap-buffer-overflow)
+//  65 (multi-byte-write-stack-use-after-return)
+//  10 (null-deref)
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ASAN_SCARINESS_SCORE_H
+#define ASAN_SCARINESS_SCORE_H
+
+#include "asan_flags.h"
+#include "sanitizer_common/sanitizer_common.h"
+#include "sanitizer_common/sanitizer_libc.h"
+
+namespace __asan {
+class ScarinessScore {
+ public:
+  ScarinessScore() {
+    descr[0] = 0;
+  }
+  void Scare(int add_to_score, const char *reason) {
+    if (descr[0])
+      internal_strlcat(descr, "-", sizeof(descr));
+    internal_strlcat(descr, reason, sizeof(descr));
+    score += add_to_score;
+  };
+  int GetScore() const { return score; }
+  const char *GetDescription() const { return descr; }
+  void Print() {
+    if (score && flags()->print_scariness)
+      Printf("SCARINESS: %d (%s)\n", score, descr);
+  }
+  static void PrintSimple(int score, const char *descr) {
+    ScarinessScore SS;
+    SS.Scare(score, descr);
+    SS.Print();
+  }
+
+ private:
+  int score = 0;
+  char descr[1024];
+};
+
+}  // namespace __asan
+
+#endif  // ASAN_SCARINESS_SCORE_H
diff --git a/lib/asan/asan_stack.h b/lib/asan/asan_stack.h
index 5c51815..cc95e0f 100644
--- a/lib/asan/asan_stack.h
+++ b/lib/asan/asan_stack.h
@@ -48,7 +48,10 @@
       uptr stack_top = t->stack_top();
       uptr stack_bottom = t->stack_bottom();
       ScopedUnwinding unwind_scope(t);
-      stack->Unwind(max_depth, pc, bp, context, stack_top, stack_bottom, fast);
+      if (!SANITIZER_MIPS || IsValidFrame(bp, stack_top, stack_bottom)) {
+        stack->Unwind(max_depth, pc, bp, context, stack_top, stack_bottom,
+                      fast);
+      }
     } else if (!t && !fast) {
       /* If GetCurrentThread() has failed, try to do slow unwind anyways. */
       stack->Unwind(max_depth, pc, bp, context, 0, 0, false);
diff --git a/lib/asan/asan_suppressions.cc b/lib/asan/asan_suppressions.cc
index 41887b5..62c868d 100644
--- a/lib/asan/asan_suppressions.cc
+++ b/lib/asan/asan_suppressions.cc
@@ -89,6 +89,7 @@
 
     if (suppression_ctx->HasSuppressionType(kInterceptorViaFunction)) {
       SymbolizedStack *frames = symbolizer->SymbolizePC(addr);
+      CHECK(frames);
       for (SymbolizedStack *cur = frames; cur; cur = cur->next) {
         const char *function_name = cur->info.function;
         if (!function_name) {
diff --git a/lib/asan/asan_thread.cc b/lib/asan/asan_thread.cc
index 6981354..d7e2cca 100644
--- a/lib/asan/asan_thread.cc
+++ b/lib/asan/asan_thread.cc
@@ -120,6 +120,71 @@
   DTLS_Destroy();
 }
 
+void AsanThread::StartSwitchFiber(FakeStack **fake_stack_save, uptr bottom,
+                                  uptr size) {
+  if (atomic_load(&stack_switching_, memory_order_relaxed)) {
+    Report("ERROR: starting fiber switch while in fiber switch\n");
+    Die();
+  }
+
+  next_stack_bottom_ = bottom;
+  next_stack_top_ = bottom + size;
+  atomic_store(&stack_switching_, 1, memory_order_release);
+
+  FakeStack *current_fake_stack = fake_stack_;
+  if (fake_stack_save)
+    *fake_stack_save = fake_stack_;
+  fake_stack_ = nullptr;
+  SetTLSFakeStack(nullptr);
+  // if fake_stack_save is null, the fiber will die, delete the fakestack
+  if (!fake_stack_save && current_fake_stack)
+    current_fake_stack->Destroy(this->tid());
+}
+
+void AsanThread::FinishSwitchFiber(FakeStack *fake_stack_save) {
+  if (!atomic_load(&stack_switching_, memory_order_relaxed)) {
+    Report("ERROR: finishing a fiber switch that has not started\n");
+    Die();
+  }
+
+  if (fake_stack_save) {
+    SetTLSFakeStack(fake_stack_save);
+    fake_stack_ = fake_stack_save;
+  }
+
+  stack_bottom_ = next_stack_bottom_;
+  stack_top_ = next_stack_top_;
+  atomic_store(&stack_switching_, 0, memory_order_release);
+  next_stack_top_ = 0;
+  next_stack_bottom_ = 0;
+}
+
+inline AsanThread::StackBounds AsanThread::GetStackBounds() const {
+  if (!atomic_load(&stack_switching_, memory_order_acquire))
+    return StackBounds{stack_bottom_, stack_top_};  // NOLINT
+  char local;
+  const uptr cur_stack = (uptr)&local;
+  // Note: need to check next stack first, because FinishSwitchFiber
+  // may be in process of overwriting stack_top_/bottom_. But in such case
+  // we are already on the next stack.
+  if (cur_stack >= next_stack_bottom_ && cur_stack < next_stack_top_)
+    return StackBounds{next_stack_bottom_, next_stack_top_};  // NOLINT
+  return StackBounds{stack_bottom_, stack_top_};              // NOLINT
+}
+
+uptr AsanThread::stack_top() {
+  return GetStackBounds().top;
+}
+
+uptr AsanThread::stack_bottom() {
+  return GetStackBounds().bottom;
+}
+
+uptr AsanThread::stack_size() {
+  const auto bounds = GetStackBounds();
+  return bounds.top - bounds.bottom;
+}
+
 // We want to create the FakeStack lazyly on the first use, but not eralier
 // than the stack size is known and the procedure has to be async-signal safe.
 FakeStack *AsanThread::AsyncSignalSafeLazyInitFakeStack() {
@@ -150,6 +215,8 @@
 }
 
 void AsanThread::Init() {
+  next_stack_top_ = next_stack_bottom_ = 0;
+  atomic_store(&stack_switching_, false, memory_order_release);
   fake_stack_ = nullptr;  // Will be initialized lazily if needed.
   CHECK_EQ(this->stack_size(), 0U);
   SetThreadStackAndTls();
@@ -195,10 +262,12 @@
 
 void AsanThread::SetThreadStackAndTls() {
   uptr tls_size = 0;
-  GetThreadStackAndTls(tid() == 0, &stack_bottom_, &stack_size_, &tls_begin_,
-                       &tls_size);
-  stack_top_ = stack_bottom_ + stack_size_;
+  uptr stack_size = 0;
+  GetThreadStackAndTls(tid() == 0, const_cast<uptr *>(&stack_bottom_),
+                       const_cast<uptr *>(&stack_size), &tls_begin_, &tls_size);
+  stack_top_ = stack_bottom_ + stack_size;
   tls_end_ = tls_begin_ + tls_size;
+  dtls_ = DTLS_Get();
 
   int local;
   CHECK(AddrIsInStack((uptr)&local));
@@ -249,6 +318,11 @@
   return true;
 }
 
+bool AsanThread::AddrIsInStack(uptr addr) {
+  const auto bounds = GetStackBounds();
+  return addr >= bounds.bottom && addr < bounds.top;
+}
+
 static bool ThreadStackContainsAddress(ThreadContextBase *tctx_base,
                                        void *addr) {
   AsanThreadContext *tctx = static_cast<AsanThreadContext*>(tctx_base);
@@ -322,8 +396,8 @@
 // --- Implementation of LSan-specific functions --- {{{1
 namespace __lsan {
 bool GetThreadRangesLocked(uptr os_id, uptr *stack_begin, uptr *stack_end,
-                           uptr *tls_begin, uptr *tls_end,
-                           uptr *cache_begin, uptr *cache_end) {
+                           uptr *tls_begin, uptr *tls_end, uptr *cache_begin,
+                           uptr *cache_end, DTLS **dtls) {
   __asan::AsanThread *t = __asan::GetAsanThreadByOsIDLocked(os_id);
   if (!t) return false;
   *stack_begin = t->stack_bottom();
@@ -333,6 +407,7 @@
   // ASan doesn't keep allocator caches in TLS, so these are unused.
   *cache_begin = 0;
   *cache_end = 0;
+  *dtls = t->dtls();
   return true;
 }
 
@@ -355,3 +430,29 @@
   __asan::EnsureMainThreadIDIsCorrect();
 }
 } // namespace __lsan
+
+// ---------------------- Interface ---------------- {{{1
+using namespace __asan;  // NOLINT
+
+extern "C" {
+SANITIZER_INTERFACE_ATTRIBUTE
+void __sanitizer_start_switch_fiber(void **fakestacksave, const void *bottom,
+                                    uptr size) {
+  AsanThread *t = GetCurrentThread();
+  if (!t) {
+    VReport(1, "__asan_start_switch_fiber called from unknown thread\n");
+    return;
+  }
+  t->StartSwitchFiber((FakeStack**)fakestacksave, (uptr)bottom, size);
+}
+
+SANITIZER_INTERFACE_ATTRIBUTE
+void __sanitizer_finish_switch_fiber(void* fakestack) {
+  AsanThread *t = GetCurrentThread();
+  if (!t) {
+    VReport(1, "__asan_finish_switch_fiber called from unknown thread\n");
+    return;
+  }
+  t->FinishSwitchFiber((FakeStack*)fakestack);
+}
+}
diff --git a/lib/asan/asan_thread.h b/lib/asan/asan_thread.h
index ac35711..92a92a2 100644
--- a/lib/asan/asan_thread.h
+++ b/lib/asan/asan_thread.h
@@ -23,6 +23,10 @@
 #include "sanitizer_common/sanitizer_libc.h"
 #include "sanitizer_common/sanitizer_thread_registry.h"
 
+namespace __sanitizer {
+struct DTLS;
+}  // namespace __sanitizer
+
 namespace __asan {
 
 const u32 kInvalidTid = 0xffffff;  // Must fit into 24 bits.
@@ -62,11 +66,12 @@
   thread_return_t ThreadStart(uptr os_id,
                               atomic_uintptr_t *signal_thread_is_registered);
 
-  uptr stack_top() { return stack_top_; }
-  uptr stack_bottom() { return stack_bottom_; }
-  uptr stack_size() { return stack_size_; }
+  uptr stack_top();
+  uptr stack_bottom();
+  uptr stack_size();
   uptr tls_begin() { return tls_begin_; }
   uptr tls_end() { return tls_end_; }
+  DTLS *dtls() { return dtls_; }
   u32 tid() { return context_->tid; }
   AsanThreadContext *context() { return context_; }
   void set_context(AsanThreadContext *context) { context_ = context; }
@@ -78,9 +83,7 @@
   };
   bool GetStackFrameAccessByAddr(uptr addr, StackFrameAccess *access);
 
-  bool AddrIsInStack(uptr addr) {
-    return addr >= stack_bottom_ && addr < stack_top_;
-  }
+  bool AddrIsInStack(uptr addr);
 
   void DeleteFakeStack(int tid) {
     if (!fake_stack_) return;
@@ -90,13 +93,19 @@
     t->Destroy(tid);
   }
 
+  void StartSwitchFiber(FakeStack **fake_stack_save, uptr bottom, uptr size);
+  void FinishSwitchFiber(FakeStack *fake_stack_save);
+
   bool has_fake_stack() {
-    return (reinterpret_cast<uptr>(fake_stack_) > 1);
+    return !atomic_load(&stack_switching_, memory_order_relaxed) &&
+           (reinterpret_cast<uptr>(fake_stack_) > 1);
   }
 
   FakeStack *fake_stack() {
     if (!__asan_option_detect_stack_use_after_return)
       return nullptr;
+    if (atomic_load(&stack_switching_, memory_order_relaxed))
+      return nullptr;
     if (!has_fake_stack())
       return AsyncSignalSafeLazyInitFakeStack();
     return fake_stack_;
@@ -122,16 +131,27 @@
   void ClearShadowForThreadStackAndTLS();
   FakeStack *AsyncSignalSafeLazyInitFakeStack();
 
+  struct StackBounds {
+    uptr bottom;
+    uptr top;
+  };
+  StackBounds GetStackBounds() const;
+
   AsanThreadContext *context_;
   thread_callback_t start_routine_;
   void *arg_;
+
   uptr stack_top_;
   uptr stack_bottom_;
-  // stack_size_ == stack_top_ - stack_bottom_;
-  // It needs to be set in a async-signal-safe manner.
-  uptr stack_size_;
+  // these variables are used when the thread is about to switch stack
+  uptr next_stack_top_;
+  uptr next_stack_bottom_;
+  // true if switching is in progress
+  atomic_uint8_t stack_switching_;
+
   uptr tls_begin_;
   uptr tls_end_;
+  DTLS *dtls_;
 
   FakeStack *fake_stack_;
   AsanThreadLocalMallocStorage malloc_storage_;
diff --git a/lib/asan/asan_win.cc b/lib/asan/asan_win.cc
index 92bd893..94d044a 100644
--- a/lib/asan/asan_win.cc
+++ b/lib/asan/asan_win.cc
@@ -24,6 +24,7 @@
 #include "asan_report.h"
 #include "asan_stack.h"
 #include "asan_thread.h"
+#include "asan_mapping.h"
 #include "sanitizer_common/sanitizer_libc.h"
 #include "sanitizer_common/sanitizer_mutex.h"
 
@@ -46,11 +47,20 @@
 const char* __asan_default_default_options() { return ""; }
 const char* __asan_default_default_suppressions() { return ""; }
 void __asan_default_on_error() {}
+// 64-bit msvc will not prepend an underscore for symbols.
+#ifdef _WIN64
+#pragma comment(linker, "/alternatename:__sanitizer_malloc_hook=__sanitizer_default_malloc_hook")  // NOLINT
+#pragma comment(linker, "/alternatename:__sanitizer_free_hook=__sanitizer_default_free_hook")      // NOLINT
+#pragma comment(linker, "/alternatename:__asan_default_options=__asan_default_default_options")    // NOLINT
+#pragma comment(linker, "/alternatename:__asan_default_suppressions=__asan_default_default_suppressions")    // NOLINT
+#pragma comment(linker, "/alternatename:__asan_on_error=__asan_default_on_error")                  // NOLINT
+#else
 #pragma comment(linker, "/alternatename:___sanitizer_malloc_hook=___sanitizer_default_malloc_hook")  // NOLINT
 #pragma comment(linker, "/alternatename:___sanitizer_free_hook=___sanitizer_default_free_hook")      // NOLINT
 #pragma comment(linker, "/alternatename:___asan_default_options=___asan_default_default_options")    // NOLINT
 #pragma comment(linker, "/alternatename:___asan_default_suppressions=___asan_default_default_suppressions")    // NOLINT
 #pragma comment(linker, "/alternatename:___asan_on_error=___asan_default_on_error")                  // NOLINT
+#endif
 // }}}
 }  // extern "C"
 
@@ -61,6 +71,9 @@
   REAL(RaiseException)(a, b, c, d);
 }
 
+// TODO(wwchrome): Win64 has no _except_handler3/4.
+// Need to implement _C_specific_handler instead.
+#ifndef _WIN64
 INTERCEPTOR(int, _except_handler3, void *a, void *b, void *c, void *d) {
   CHECK(REAL(_except_handler3));
   __asan_handle_no_return();
@@ -76,6 +89,7 @@
   __asan_handle_no_return();
   return REAL(_except_handler4)(a, b, c, d);
 }
+#endif
 
 static thread_return_t THREAD_CALLING_CONV asan_thread_start(void *arg) {
   AsanThread *t = (AsanThread*)arg;
@@ -139,8 +153,12 @@
 void InitializePlatformInterceptors() {
   ASAN_INTERCEPT_FUNC(CreateThread);
   ASAN_INTERCEPT_FUNC(RaiseException);
+
+// TODO(wwchrome): Win64 uses _C_specific_handler instead.
+#ifndef _WIN64
   ASAN_INTERCEPT_FUNC(_except_handler3);
   ASAN_INTERCEPT_FUNC(_except_handler4);
+#endif
 
   // NtWaitForWorkViaWorkerFactory is always linked dynamically.
   CHECK(::__interception::OverrideFunction(
@@ -149,6 +167,10 @@
       (uptr *)&REAL(NtWaitForWorkViaWorkerFactory)));
 }
 
+void AsanApplyToGlobals(globals_op_fptr op, const void *needle) {
+  UNIMPLEMENTED();
+}
+
 // ---------------------- TSD ---------------- {{{
 static bool tsd_key_inited = false;
 
@@ -194,6 +216,55 @@
   UNIMPLEMENTED();
 }
 
+#if SANITIZER_WINDOWS64
+// Exception handler for dealing with shadow memory.
+static LONG CALLBACK
+ShadowExceptionHandler(PEXCEPTION_POINTERS exception_pointers) {
+  static uptr page_size = GetPageSizeCached();
+  static uptr alloc_granularity = GetMmapGranularity();
+  // Only handle access violations.
+  if (exception_pointers->ExceptionRecord->ExceptionCode !=
+      EXCEPTION_ACCESS_VIOLATION) {
+    return EXCEPTION_CONTINUE_SEARCH;
+  }
+
+  // Only handle access violations that land within the shadow memory.
+  uptr addr =
+      (uptr)(exception_pointers->ExceptionRecord->ExceptionInformation[1]);
+
+  // Check valid shadow range.
+  if (!AddrIsInShadow(addr)) return EXCEPTION_CONTINUE_SEARCH;
+
+  // This is an access violation while trying to read from the shadow. Commit
+  // the relevant page and let execution continue.
+
+  // Determine the address of the page that is being accessed.
+  uptr page = RoundDownTo(addr, page_size);
+
+  // Query the existing page.
+  MEMORY_BASIC_INFORMATION mem_info = {};
+  if (::VirtualQuery((LPVOID)page, &mem_info, sizeof(mem_info)) == 0)
+    return EXCEPTION_CONTINUE_SEARCH;
+
+  // Commit the page.
+  uptr result =
+      (uptr)::VirtualAlloc((LPVOID)page, page_size, MEM_COMMIT, PAGE_READWRITE);
+  if (result != page) return EXCEPTION_CONTINUE_SEARCH;
+
+  // The page mapping succeeded, so continue execution as usual.
+  return EXCEPTION_CONTINUE_EXECUTION;
+}
+
+#endif
+
+void InitializePlatformExceptionHandlers() {
+#if SANITIZER_WINDOWS64
+  // On Win64, we map memory on demand with access violation handler.
+  // Install our exception handler.
+  CHECK(AddVectoredExceptionHandler(TRUE, &ShadowExceptionHandler));
+#endif
+}
+
 static LPTOP_LEVEL_EXCEPTION_FILTER default_seh_handler;
 
 static long WINAPI SEHHandler(EXCEPTION_POINTERS *info) {
@@ -242,10 +313,16 @@
 }
 
 #if !ASAN_DYNAMIC
-// Put a pointer to __asan_set_seh_filter at the end of the global list
-// of C initializers, after the default EH is set by the CRT.
-#pragma section(".CRT$XIZ", long, read)  // NOLINT
-__declspec(allocate(".CRT$XIZ"))
+// The CRT runs initializers in this order:
+// - C initializers, from XIA to XIZ
+// - C++ initializers, from XCA to XCZ
+// Prior to 2015, the CRT set the unhandled exception filter at priority XIY,
+// near the end of C initialization. Starting in 2015, it was moved to the
+// beginning of C++ initialization. We set our priority to XCAB to run
+// immediately after the CRT runs. This way, our exception filter is called
+// first and we can delegate to their filter if appropriate.
+#pragma section(".CRT$XCAB", long, read)  // NOLINT
+__declspec(allocate(".CRT$XCAB"))
     int (*__intercept_seh)() = __asan_set_seh_filter;
 #endif
 // }}}
diff --git a/lib/asan/asan_win_dll_thunk.cc b/lib/asan/asan_win_dll_thunk.cc
index 308196d..eff00f9 100644
--- a/lib/asan/asan_win_dll_thunk.cc
+++ b/lib/asan/asan_win_dll_thunk.cc
@@ -315,6 +315,7 @@
 INTERFACE_FUNCTION(__sanitizer_cov_with_check)
 INTERFACE_FUNCTION(__sanitizer_get_allocated_size)
 INTERFACE_FUNCTION(__sanitizer_get_coverage_guards)
+INTERFACE_FUNCTION(__sanitizer_get_coverage_pc_buffer)
 INTERFACE_FUNCTION(__sanitizer_get_current_allocated_bytes)
 INTERFACE_FUNCTION(__sanitizer_get_estimated_allocated_size)
 INTERFACE_FUNCTION(__sanitizer_get_free_bytes)
@@ -334,6 +335,7 @@
 INTERFACE_FUNCTION(__sanitizer_sandbox_on_notify)
 INTERFACE_FUNCTION(__sanitizer_set_death_callback)
 INTERFACE_FUNCTION(__sanitizer_set_report_path)
+INTERFACE_FUNCTION(__sanitizer_set_report_fd)
 INTERFACE_FUNCTION(__sanitizer_unaligned_load16)
 INTERFACE_FUNCTION(__sanitizer_unaligned_load32)
 INTERFACE_FUNCTION(__sanitizer_unaligned_load64)
@@ -341,21 +343,28 @@
 INTERFACE_FUNCTION(__sanitizer_unaligned_store32)
 INTERFACE_FUNCTION(__sanitizer_unaligned_store64)
 INTERFACE_FUNCTION(__sanitizer_verify_contiguous_container)
+INTERFACE_FUNCTION(__sanitizer_install_malloc_and_free_hooks)
+INTERFACE_FUNCTION(__sanitizer_start_switch_fiber)
+INTERFACE_FUNCTION(__sanitizer_finish_switch_fiber)
 
 // TODO(timurrrr): Add more interface functions on the as-needed basis.
 
 // ----------------- Memory allocation functions ---------------------
 WRAP_V_W(free)
+WRAP_V_W(_free_base)
 WRAP_V_WW(_free_dbg)
 
 WRAP_W_W(malloc)
+WRAP_W_W(_malloc_base)
 WRAP_W_WWWW(_malloc_dbg)
 
 WRAP_W_WW(calloc)
+WRAP_W_WW(_calloc_base)
 WRAP_W_WWWWW(_calloc_dbg)
 WRAP_W_WWW(_calloc_impl)
 
 WRAP_W_WW(realloc)
+WRAP_W_WW(_realloc_base)
 WRAP_W_WWW(_realloc_dbg)
 WRAP_W_WWW(_recalloc)
 
@@ -391,12 +400,14 @@
 INTERCEPT_LIBRARY_FUNCTION(strcmp);
 INTERCEPT_LIBRARY_FUNCTION(strcpy);  // NOLINT
 INTERCEPT_LIBRARY_FUNCTION(strcspn);
+INTERCEPT_LIBRARY_FUNCTION(strdup);
 INTERCEPT_LIBRARY_FUNCTION(strlen);
 INTERCEPT_LIBRARY_FUNCTION(strncat);
 INTERCEPT_LIBRARY_FUNCTION(strncmp);
 INTERCEPT_LIBRARY_FUNCTION(strncpy);
 INTERCEPT_LIBRARY_FUNCTION(strnlen);
 INTERCEPT_LIBRARY_FUNCTION(strpbrk);
+INTERCEPT_LIBRARY_FUNCTION(strrchr);
 INTERCEPT_LIBRARY_FUNCTION(strspn);
 INTERCEPT_LIBRARY_FUNCTION(strstr);
 INTERCEPT_LIBRARY_FUNCTION(strtol);
diff --git a/lib/asan/asan_win_dynamic_runtime_thunk.cc b/lib/asan/asan_win_dynamic_runtime_thunk.cc
index 73e5207..1175522 100644
--- a/lib/asan/asan_win_dynamic_runtime_thunk.cc
+++ b/lib/asan/asan_win_dynamic_runtime_thunk.cc
@@ -29,7 +29,7 @@
 
 // First, declare CRT sections we'll be using in this file
 #pragma section(".CRT$XID", long, read)  // NOLINT
-#pragma section(".CRT$XIZ", long, read)  // NOLINT
+#pragma section(".CRT$XCAB", long, read)  // NOLINT
 #pragma section(".CRT$XTW", long, read)  // NOLINT
 #pragma section(".CRT$XTY", long, read)  // NOLINT
 
@@ -93,7 +93,8 @@
 
 // Unfortunately, putting a pointer to __asan_set_seh_filter into
 // __asan_intercept_seh gets optimized out, so we have to use an extra function.
-__declspec(allocate(".CRT$XIZ")) int (*__asan_seh_interceptor)() = SetSEHFilter;
+__declspec(allocate(".CRT$XCAB")) int (*__asan_seh_interceptor)() =
+    SetSEHFilter;
 }
 
 #endif // ASAN_DYNAMIC_RUNTIME_THUNK
diff --git a/lib/asan/scripts/asan_device_setup b/lib/asan/scripts/asan_device_setup
index 6cb7b94..b906c86 100755
--- a/lib/asan/scripts/asan_device_setup
+++ b/lib/asan/scripts/asan_device_setup
@@ -300,19 +300,24 @@
   cp "$ASAN_RT_PATH/$ASAN_RT64" "$TMPDIR/"
 fi
 
-# FIXME: alloc_dealloc_mismatch=0 prevents a failure in libdvm startup,
-# which may or may not be a real bug (probably not).
-ASAN_OPTIONS=start_deactivated=1,alloc_dealloc_mismatch=0,malloc_context_size=0
+ASAN_OPTIONS=start_deactivated=1,malloc_context_size=0
 
 function generate_zygote_wrapper { # from, to, asan_rt
   local _from=$1
   local _to=$2
   local _asan_rt=$3
+  if [[ PRE_L -eq 0 ]]; then
+    # LD_PRELOAD parsing is broken in N if it starts with ":". Luckily, it is
+    # unset in the system environment since L.
+    local _ld_preload=$_asan_rt
+  else
+    local _ld_preload=\$LD_PRELOAD:$_asan_rt
+  fi
   cat <<EOF >"$TMPDIR/$_from"
 #!/system/bin/sh-from-zygote
 ASAN_OPTIONS=$ASAN_OPTIONS \\
 ASAN_ACTIVATION_OPTIONS=include_if_exists=/data/local/tmp/asan.options.%b \\
-LD_PRELOAD=\$LD_PRELOAD:$_asan_rt \\
+LD_PRELOAD=$_ld_preload \\
 exec $_to \$@
 
 EOF
diff --git a/lib/asan/scripts/asan_symbolize.py b/lib/asan/scripts/asan_symbolize.py
index e6d43cd..8e6fb61 100755
--- a/lib/asan/scripts/asan_symbolize.py
+++ b/lib/asan/scripts/asan_symbolize.py
@@ -271,7 +271,7 @@
 def SystemSymbolizerFactory(system, addr, binary):
   if system == 'Darwin':
     return DarwinSymbolizer(addr, binary)
-  elif system == 'Linux':
+  elif system == 'Linux' or system == 'FreeBSD':
     return Addr2LineSymbolizer(binary)
 
 
diff --git a/lib/asan/tests/CMakeLists.txt b/lib/asan/tests/CMakeLists.txt
index 7a8d8f7..e67d0fb 100644
--- a/lib/asan/tests/CMakeLists.txt
+++ b/lib/asan/tests/CMakeLists.txt
@@ -21,7 +21,7 @@
   asan_test_utils.h)
 
 set(ASAN_UNITTEST_COMMON_CFLAGS
-  ${COMPILER_RT_TEST_CFLAGS}
+  ${COMPILER_RT_UNITTEST_CFLAGS}
   ${COMPILER_RT_GTEST_CFLAGS}
   -I${COMPILER_RT_SOURCE_DIR}/include
   -I${COMPILER_RT_SOURCE_DIR}/lib
@@ -34,12 +34,21 @@
   -Wno-non-virtual-dtor)
 append_list_if(COMPILER_RT_HAS_WVARIADIC_MACROS_FLAG -Wno-variadic-macros ASAN_UNITTEST_COMMON_CFLAGS)
 
+# This will ensure the target linker is used
+# during cross compilation
+set(ASAN_UNITTEST_COMMON_LINKFLAGS
+  ${COMPILER_RT_UNITTEST_LINKFLAGS})
+
 # -gline-tables-only must be enough for ASan, so use it if possible.
 if(COMPILER_RT_TEST_COMPILER_ID MATCHES "Clang")
   list(APPEND ASAN_UNITTEST_COMMON_CFLAGS -gline-tables-only)
 else()
   list(APPEND ASAN_UNITTEST_COMMON_CFLAGS -g)
 endif()
+if(MSVC)
+  list(APPEND ASAN_UNITTEST_COMMON_CFLAGS -gcodeview)
+endif()
+list(APPEND ASAN_UNITTEST_COMMON_LINKFLAGS -g)
 
 # Use -D instead of definitions to please custom compile command.
 list(APPEND ASAN_UNITTEST_COMMON_CFLAGS
@@ -114,7 +123,11 @@
 # options in ${ARGN}, and add it to the object list.
 macro(asan_compile obj_list source arch kind)
   get_filename_component(basename ${source} NAME)
-  set(output_obj "${obj_list}.${basename}.${arch}${kind}.o")
+  if(CMAKE_CONFIGURATION_TYPES)
+    set(output_obj "${CMAKE_CFG_INTDIR}/${obj_list}.${basename}.${arch}${kind}.o")
+  else()
+    set(output_obj "${obj_list}.${basename}.${arch}${kind}.o")
+  endif()
   get_target_flags_for_arch(${arch} TARGET_CFLAGS)
   set(COMPILE_DEPS ${ASAN_UNITTEST_HEADERS} ${ASAN_BLACKLIST_FILE})
   if(NOT COMPILER_RT_STANDALONE_BUILD)
@@ -137,11 +150,17 @@
   endif()
   if(TEST_WITH_TEST_RUNTIME)
     list(APPEND TEST_DEPS ${ASAN_TEST_RUNTIME})
-    if(NOT MSVC)
-      list(APPEND TEST_OBJECTS lib${ASAN_TEST_RUNTIME}.a)
+    if(CMAKE_CONFIGURATION_TYPES)
+     set(configuration_path "${CMAKE_CFG_INTDIR}/")
     else()
-      list(APPEND TEST_OBJECTS ${ASAN_TEST_RUNTIME}.lib)
+     set(configuration_path "")
     endif()
+    if(NOT MSVC)
+      set(asan_test_runtime_path ${configuration_path}lib${ASAN_TEST_RUNTIME}.a)
+    else()
+      set(asan_test_runtime_path ${configuration_path}${ASAN_TEST_RUNTIME}.lib)
+    endif()
+    list(APPEND TEST_OBJECTS ${asan_test_runtime_path})
   endif()
   add_compiler_rt_test(${test_suite} ${test_name}
                        SUBDIR ${TEST_SUBDIR}
@@ -153,15 +172,15 @@
 
 # Main AddressSanitizer unit tests.
 add_custom_target(AsanUnitTests)
-set_target_properties(AsanUnitTests PROPERTIES FOLDER "ASan unit tests")
+set_target_properties(AsanUnitTests PROPERTIES FOLDER "Compiler-RT Tests")
+
 # AddressSanitizer unit tests with dynamic runtime (on platforms where it's
 # not the default).
 add_custom_target(AsanDynamicUnitTests)
-set_target_properties(AsanDynamicUnitTests
-  PROPERTIES FOLDER "ASan unit tests with dynamic runtime")
+set_target_properties(AsanDynamicUnitTests PROPERTIES FOLDER "Compiler-RT Tests")
 # ASan benchmarks (not actively used now).
 add_custom_target(AsanBenchmarks)
-set_target_properties(AsanBenchmarks PROPERTIES FOLDER "Asan benchmarks")
+set_target_properties(AsanBenchmarks PROPERTIES FOLDER "Compiler-RT Tests")
 
 set(ASAN_NOINST_TEST_SOURCES
   ${COMPILER_RT_GTEST_SOURCE}
@@ -200,13 +219,30 @@
     asan_compile(ASAN_INST_TEST_OBJECTS asan_mac_test_helpers.mm ${arch} ${kind}
                  ${ASAN_UNITTEST_INSTRUMENTED_CFLAGS} -ObjC ${ARGN})
   endif()
-  file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/default")
+
+  # Create the 'default' folder where ASAN tests are produced.
+  if(CMAKE_CONFIGURATION_TYPES)
+    foreach(build_mode ${CMAKE_CONFIGURATION_TYPES})
+      file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/default/${build_mode}")
+    endforeach()
+  else()
+    file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/default")
+  endif()
+
   add_asan_test(AsanUnitTests "Asan-${arch}${kind}-Test"
                 ${arch} ${kind} SUBDIR "default"
                 OBJECTS ${ASAN_INST_TEST_OBJECTS}
                 LINKFLAGS ${ASAN_UNITTEST_INSTRUMENTED_LINKFLAGS})
   if(COMPILER_RT_ASAN_HAS_STATIC_RUNTIME)
-    file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/dynamic")
+    # Create the 'dynamic' folder where ASAN tests are produced.
+    if(CMAKE_CONFIGURATION_TYPES)
+      foreach(build_mode ${CMAKE_CONFIGURATION_TYPES})
+        file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/dynamic/${build_mode}")
+      endforeach()
+    else()
+      file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/dynamic")
+    endif()
+
     add_asan_test(AsanDynamicUnitTests "Asan-${arch}${kind}-Dynamic-Test"
                   ${arch} ${kind} SUBDIR "dynamic"
                   OBJECTS ${ASAN_INST_TEST_OBJECTS}
@@ -236,7 +272,8 @@
   endif()
   add_library(${ASAN_TEST_RUNTIME} STATIC ${ASAN_TEST_RUNTIME_OBJECTS})
   set_target_properties(${ASAN_TEST_RUNTIME} PROPERTIES
-    ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+    ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+    FOLDER "Compiler-RT Runtime tests")
   # Uninstrumented tests.
   set(ASAN_NOINST_TEST_OBJECTS)
   foreach(src ${ASAN_NOINST_TEST_SOURCES})
diff --git a/lib/asan/tests/asan_noinst_test.cc b/lib/asan/tests/asan_noinst_test.cc
index 5f5354f..3872dd7 100644
--- a/lib/asan/tests/asan_noinst_test.cc
+++ b/lib/asan/tests/asan_noinst_test.cc
@@ -34,7 +34,6 @@
 // Make sure __asan_init is called before any test case is run.
 struct AsanInitCaller {
   AsanInitCaller() {
-    DisableReexec();
     __asan_init();
   }
 };
diff --git a/lib/asan/tests/asan_str_test.cc b/lib/asan/tests/asan_str_test.cc
index 89b0d3d..dd75587 100644
--- a/lib/asan/tests/asan_str_test.cc
+++ b/lib/asan/tests/asan_str_test.cc
@@ -20,10 +20,41 @@
 static char global_string[] = "global";
 static size_t global_string_length = 6;
 
+const char kStackReadUnderflow[] =
+#if !GTEST_USES_SIMPLE_RE
+    ASAN_PCRE_DOTALL
+    "READ.*"
+#endif
+    "underflows this variable";
+const char kStackReadOverflow[] =
+#if !GTEST_USES_SIMPLE_RE
+    ASAN_PCRE_DOTALL
+    "READ.*"
+#endif
+    "overflows this variable";
+
+namespace {
+enum class OOBKind {
+  Heap,
+  Stack,
+  Global,
+};
+
+string LeftOOBReadMessage(OOBKind oob_kind, int oob_distance) {
+  return oob_kind == OOBKind::Stack ? kStackReadUnderflow
+                                    : ::LeftOOBReadMessage(oob_distance);
+}
+
+string RightOOBReadMessage(OOBKind oob_kind, int oob_distance) {
+  return oob_kind == OOBKind::Stack ? kStackReadOverflow
+                                    : ::RightOOBReadMessage(oob_distance);
+}
+}  // namespace
+
 // Input to a test is a zero-terminated string str with given length
 // Accesses to the bytes to the left and to the right of str
 // are presumed to produce OOB errors
-void StrLenOOBTestTemplate(char *str, size_t length, bool is_global) {
+void StrLenOOBTestTemplate(char *str, size_t length, OOBKind oob_kind) {
   // Normal strlen calls
   EXPECT_EQ(strlen(str), length);
   if (length > 0) {
@@ -31,17 +62,18 @@
     EXPECT_EQ(0U, strlen(str + length));
   }
   // Arg of strlen is not malloced, OOB access
-  if (!is_global) {
+  if (oob_kind != OOBKind::Global) {
     // We don't insert RedZones to the left of global variables
-    EXPECT_DEATH(Ident(strlen(str - 1)), LeftOOBReadMessage(1));
-    EXPECT_DEATH(Ident(strlen(str - 5)), LeftOOBReadMessage(5));
+    EXPECT_DEATH(Ident(strlen(str - 1)), LeftOOBReadMessage(oob_kind, 1));
+    EXPECT_DEATH(Ident(strlen(str - 5)), LeftOOBReadMessage(oob_kind, 5));
   }
-  EXPECT_DEATH(Ident(strlen(str + length + 1)), RightOOBReadMessage(0));
+  EXPECT_DEATH(Ident(strlen(str + length + 1)),
+               RightOOBReadMessage(oob_kind, 0));
   // Overwrite terminator
   str[length] = 'a';
   // String is not zero-terminated, strlen will lead to OOB access
-  EXPECT_DEATH(Ident(strlen(str)), RightOOBReadMessage(0));
-  EXPECT_DEATH(Ident(strlen(str + length)), RightOOBReadMessage(0));
+  EXPECT_DEATH(Ident(strlen(str)), RightOOBReadMessage(oob_kind, 0));
+  EXPECT_DEATH(Ident(strlen(str + length)), RightOOBReadMessage(oob_kind, 0));
   // Restore terminator
   str[length] = 0;
 }
@@ -57,11 +89,9 @@
   }
   heap_string[length] = 0;
   stack_string[length] = 0;
-  StrLenOOBTestTemplate(heap_string, length, false);
-  // TODO(samsonov): Fix expected messages in StrLenOOBTestTemplate to
-  //      make test for stack_string work. Or move it to output tests.
-  // StrLenOOBTestTemplate(stack_string, length, false);
-  StrLenOOBTestTemplate(global_string, global_string_length, true);
+  StrLenOOBTestTemplate(heap_string, length, OOBKind::Heap);
+  StrLenOOBTestTemplate(stack_string, length, OOBKind::Stack);
+  StrLenOOBTestTemplate(global_string, global_string_length, OOBKind::Global);
   free(heap_string);
 }
 
@@ -186,23 +216,8 @@
 typedef char*(*PointerToStrChr1)(const char*, int);
 typedef char*(*PointerToStrChr2)(char*, int);
 
-UNUSED static void RunStrChrTest(PointerToStrChr1 StrChr) {
-  size_t size = Ident(100);
-  char *str = MallocAndMemsetString(size);
-  str[10] = 'q';
-  str[11] = '\0';
-  EXPECT_EQ(str, StrChr(str, 'z'));
-  EXPECT_EQ(str + 10, StrChr(str, 'q'));
-  EXPECT_EQ(NULL, StrChr(str, 'a'));
-  // StrChr argument points to not allocated memory.
-  EXPECT_DEATH(Ident(StrChr(str - 1, 'z')), LeftOOBReadMessage(1));
-  EXPECT_DEATH(Ident(StrChr(str + size, 'z')), RightOOBReadMessage(0));
-  // Overwrite the terminator and hit not allocated memory.
-  str[11] = 'z';
-  EXPECT_DEATH(Ident(StrChr(str, 'a')), RightOOBReadMessage(0));
-  free(str);
-}
-UNUSED static void RunStrChrTest(PointerToStrChr2 StrChr) {
+template<typename StrChrFn>
+static void RunStrChrTestImpl(StrChrFn *StrChr) {
   size_t size = Ident(100);
   char *str = MallocAndMemsetString(size);
   str[10] = 'q';
@@ -219,11 +234,19 @@
   free(str);
 }
 
+// Prefer to use the standard signature if both are available.
+UNUSED static void RunStrChrTest(PointerToStrChr1 StrChr, ...) {
+  RunStrChrTestImpl(StrChr);
+}
+UNUSED static void RunStrChrTest(PointerToStrChr2 StrChr, int) {
+  RunStrChrTestImpl(StrChr);
+}
+
 TEST(AddressSanitizer, StrChrAndIndexOOBTest) {
-  RunStrChrTest(&strchr);
+  RunStrChrTest(&strchr, 0);
 // No index() on Windows and on Android L.
 #if !defined(_WIN32) && !defined(__ANDROID__)
-  RunStrChrTest(&index);
+  RunStrChrTest(&index, 0);
 #endif
 }
 
diff --git a/lib/asan/tests/asan_test.cc b/lib/asan/tests/asan_test.cc
index 71fb27a..6a95c3f 100644
--- a/lib/asan/tests/asan_test.cc
+++ b/lib/asan/tests/asan_test.cc
@@ -300,6 +300,7 @@
   }
 }
 
+#if !GTEST_USES_SIMPLE_RE
 TEST(AddressSanitizer, HugeMallocTest) {
   if (SANITIZER_WORDSIZE != 64 || ASAN_AVOID_EXPENSIVE_TESTS) return;
   size_t n_megs = 4100;
@@ -307,6 +308,7 @@
                "is located 1 bytes to the left|"
                "AddressSanitizer failed to allocate");
 }
+#endif
 
 #if SANITIZER_TEST_HAS_MEMALIGN
 void MemalignRun(size_t align, size_t size, int idx) {
@@ -595,9 +597,8 @@
 }
 
 #if !defined(__ANDROID__) && !defined(__arm__) && \
-    !defined(__powerpc64__) && !defined(__powerpc__) && \
     !defined(__aarch64__) && !defined(__mips__) && \
-    !defined(__mips64)
+    !defined(__mips64) && !defined(__s390__)
 NOINLINE void BuiltinLongJmpFunc1(jmp_buf buf) {
   // create three red zones for these two stack objects.
   int a;
@@ -609,7 +610,7 @@
   __builtin_longjmp((void**)buf, 1);
 }
 
-// Does not work on Power and ARM:
+// Does not work on ARM:
 // https://github.com/google/sanitizers/issues/185
 TEST(AddressSanitizer, BuiltinLongJmpTest) {
   static jmp_buf buf;
@@ -619,9 +620,9 @@
     TouchStackFunc();
   }
 }
-#endif  // !defined(__ANDROID__) && !defined(__powerpc64__) &&
-        // !defined(__powerpc__) && !defined(__arm__) &&
-        // !defined(__mips__) && !defined(__mips64)
+#endif  // !defined(__ANDROID__) && !defined(__arm__) &&
+        // !defined(__aarch64__) && !defined(__mips__)
+        // !defined(__mips64) && !defined(__s390__)
 
 TEST(AddressSanitizer, UnderscopeLongJmpTest) {
   static jmp_buf buf;
@@ -809,9 +810,6 @@
   free(s);
 }
 
-// TODO(samsonov): Add a test with malloc(0)
-// TODO(samsonov): Add tests for str* and mem* functions.
-
 NOINLINE static int LargeFunction(bool do_bad_access) {
   int *x = new int[100];
   x[0]++;
@@ -941,6 +939,8 @@
 #else
 # if defined(__powerpc64__)
   char *addr = (char*)0x024000800000;
+# elif defined(__s390x__)
+  char *addr = (char*)0x11000000000000;
 # else
   char *addr = (char*)0x0000100000080000;
 # endif
@@ -1166,15 +1166,21 @@
   return string("AddressSanitizer: alloc-dealloc-mismatch \\(") + str;
 }
 
+static string MismatchOrNewDeleteTypeStr(const string &mismatch_str) {
+  return "(" + MismatchStr(mismatch_str) +
+         ")|(AddressSanitizer: new-delete-type-mismatch)";
+}
+
 TEST(AddressSanitizer, AllocDeallocMismatch) {
   EXPECT_DEATH(free(Ident(new int)),
                MismatchStr("operator new vs free"));
   EXPECT_DEATH(free(Ident(new int[2])),
                MismatchStr("operator new \\[\\] vs free"));
-  EXPECT_DEATH(delete (Ident(new int[2])),
-               MismatchStr("operator new \\[\\] vs operator delete"));
-  EXPECT_DEATH(delete (Ident((int*)malloc(2 * sizeof(int)))),
-               MismatchStr("malloc vs operator delete"));
+  EXPECT_DEATH(
+      delete (Ident(new int[2])),
+      MismatchOrNewDeleteTypeStr("operator new \\[\\] vs operator delete"));
+  EXPECT_DEATH(delete (Ident((int *)malloc(2 * sizeof(int)))),
+               MismatchOrNewDeleteTypeStr("malloc vs operator delete"));
   EXPECT_DEATH(delete [] (Ident(new int)),
                MismatchStr("operator new vs operator delete \\[\\]"));
   EXPECT_DEATH(delete [] (Ident((int*)malloc(2 * sizeof(int)))),
diff --git a/lib/asan/tests/asan_test_main.cc b/lib/asan/tests/asan_test_main.cc
index cdaf801..d4d6de7 100644
--- a/lib/asan/tests/asan_test_main.cc
+++ b/lib/asan/tests/asan_test_main.cc
@@ -26,6 +26,12 @@
 #endif
 }
 
+namespace __sanitizer {
+bool ReexecDisabled() {
+  return true;
+}
+}
+
 int main(int argc, char **argv) {
   testing::GTEST_FLAG(death_test_style) = "threadsafe";
   testing::InitGoogleTest(&argc, argv);
diff --git a/lib/builtins/CMakeLists.txt b/lib/builtins/CMakeLists.txt
index 5ffad1d..f631c35 100644
--- a/lib/builtins/CMakeLists.txt
+++ b/lib/builtins/CMakeLists.txt
@@ -2,9 +2,27 @@
 # generic implementations of the core runtime library along with optimized
 # architecture-specific code in various subdirectories.
 
+if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
+  cmake_minimum_required(VERSION 3.4.3)
+
+  project(CompilerRTBuiltins C ASM)
+  set(COMPILER_RT_STANDALONE_BUILD TRUE)
+  set(COMPILER_RT_BUILTINS_STANDALONE_BUILD TRUE)
+  list(INSERT CMAKE_MODULE_PATH 0
+    "${CMAKE_SOURCE_DIR}/../../cmake"
+    "${CMAKE_SOURCE_DIR}/../../cmake/Modules")
+  include(base-config-ix)
+  include(CompilerRTUtils)
+  if(APPLE)
+    include(CompilerRTDarwinUtils)
+  endif()
+  include(AddCompilerRT)
+endif()
+
+include(builtin-config-ix)
+
 # TODO: Need to add a mechanism for logging errors when builtin source files are
 # added to a sub-directory and not this CMakeLists file.
-
 set(GENERIC_SOURCES
   absvdi2.c
   absvsi2.c
@@ -143,6 +161,15 @@
   umodsi3.c
   umodti3.c)
 
+set(MSVC_SOURCES
+ divsc3.c
+ divdc3.c
+ divxc3.c
+ mulsc3.c
+ muldc3.c
+ mulxc3.c)
+
+
 if(APPLE)
   set(GENERIC_SOURCES
     ${GENERIC_SOURCES}
@@ -216,14 +243,15 @@
       ${i386_SOURCES})
 else () # MSVC
   # Use C versions of functions when building on MSVC
-  # MSVC's assembler takes Intel syntax, not AT&T syntax
+  # MSVC's assembler takes Intel syntax, not AT&T syntax.
+  # Also use only MSVC compilable builtin implementations.
   set(x86_64_SOURCES
       x86_64/floatdidf.c
       x86_64/floatdisf.c
       x86_64/floatdixf.c
-      ${GENERIC_SOURCES})
+      ${MSVC_SOURCES})
   set(x86_64h_SOURCES ${x86_64_SOURCES})
-  set(i386_SOURCES ${GENERIC_SOURCES})
+  set(i386_SOURCES ${MSVC_SOURCES})
   set(i686_SOURCES ${i386_SOURCES})
 endif () # if (NOT MSVC)
 
@@ -341,6 +369,7 @@
 set(armhf_SOURCES ${arm_SOURCES})
 set(armv7_SOURCES ${arm_SOURCES})
 set(armv7s_SOURCES ${arm_SOURCES})
+set(armv7k_SOURCES ${arm_SOURCES})
 set(arm64_SOURCES ${aarch64_SOURCES})
 
 # macho_embedded archs
@@ -353,13 +382,17 @@
 set(mips64_SOURCES ${mips_SOURCES})
 set(mips64el_SOURCES ${mips_SOURCES})
 
+set(wasm32_SOURCES ${GENERIC_SOURCES})
+set(wasm64_SOURCES ${GENERIC_SOURCES})
+
 add_custom_target(builtins)
+set_target_properties(builtins PROPERTIES FOLDER "Compiler-RT Misc")
 
 if (APPLE)
   add_subdirectory(Darwin-excludes)
   add_subdirectory(macho_embedded)
   darwin_add_builtin_libraries(${BUILTIN_SUPPORTED_OS})
-elseif (NOT WIN32 OR MINGW)
+else ()
   append_string_if(COMPILER_RT_HAS_STD_C99_FLAG -std=c99 maybe_stdc99)
 
   foreach (arch ${BUILTIN_SUPPORTED_ARCH})
diff --git a/lib/builtins/Darwin-excludes/10.4-x86_64.txt b/lib/builtins/Darwin-excludes/10.4-x86_64.txt
deleted file mode 100644
index f2ee7fe..0000000
--- a/lib/builtins/Darwin-excludes/10.4-x86_64.txt
+++ /dev/null
@@ -1,35 +0,0 @@
-absvti2
-addvti3
-ashlti3
-ashrti3
-clzti2
-cmpti2
-ctzti2
-divti3
-ffsti2
-fixdfti
-fixsfti
-fixunsdfti
-fixunssfti
-fixunsxfti
-fixxfti
-floattidf
-floattisf
-floattixf
-floatuntidf
-floatuntisf
-floatuntixf
-lshrti3
-modti3
-muloti4
-multi3
-mulvti3
-negti2
-negvti2
-parityti2
-popcountti2
-subvti3
-ucmpti2
-udivmodti4
-udivti3
-umodti3
diff --git a/lib/builtins/Darwin-excludes/10.4.txt b/lib/builtins/Darwin-excludes/10.4.txt
index 70d3644..603c0b3 100644
--- a/lib/builtins/Darwin-excludes/10.4.txt
+++ b/lib/builtins/Darwin-excludes/10.4.txt
@@ -1,18 +1,34 @@
-apple_versioning
 absvdi2
 absvsi2
+absvti2
 adddf3
 addsf3
+addtf3
 addvdi3
 addvsi3
+addvti3
+apple_versioning
 ashldi3
+ashlti3
 ashrdi3
+ashrti3
+atomic_flag_clear
+atomic_flag_clear_explicit
+atomic_flag_test_and_set
+atomic_flag_test_and_set_explicit
+atomic_signal_fence
+atomic_thread_fence
 clear_cache
 clzdi2
 clzsi2
+clzti2
 cmpdi2
+cmpti2
+comparedf2
+comparesf2
 ctzdi2
 ctzsi2
+ctzti2
 divdc3
 divdf3
 divdi3
@@ -21,76 +37,101 @@
 divsc3
 divsf3
 divsi3
+divtf3
+divti3
 divxc3
 enable_execute_stack
-comparedf2
-comparesf2
 extendhfsf2
 extendsfdf2
 ffsdi2
+ffsti2
 fixdfdi
 fixdfsi
+fixdfti
 fixsfdi
 fixsfsi
+fixsfti
 fixunsdfdi
 fixunsdfsi
+fixunsdfti
 fixunssfdi
 fixunssfsi
+fixunssfti
 fixunsxfdi
 fixunsxfsi
+fixunsxfti
 fixxfdi
+fixxfti
 floatdidf
 floatdisf
 floatdixf
 floatsidf
 floatsisf
+floattidf
+floattisf
+floattixf
 floatunsidf
 floatunsisf
+floatuntidf
+floatuntisf
+floatuntixf
 gcc_personality_v0
 gnu_f2h_ieee
 gnu_h2f_ieee
 lshrdi3
+lshrti3
 moddi3
 modsi3
+modti3
 muldc3
 muldf3
 muldi3
 mulodi4
 mulosi4
+muloti4
 mulsc3
 mulsf3
+multf3
+multi3
 mulvdi3
 mulvsi3
+mulvti3
 mulxc3
 negdf2
 negdi2
 negsf2
+negti2
 negvdi2
 negvsi2
+negvti2
 paritydi2
 paritysi2
+parityti2
 popcountdi2
 popcountsi2
+popcountti2
 powidf2
 powisf2
+powitf2
 powixf2
 subdf3
 subsf3
+subtf3
 subvdi3
 subvsi3
+subvti3
+trampoline_setup
 truncdfhf2
 truncdfsf2
 truncsfhf2
 ucmpdi2
+ucmpti2
 udivdi3
 udivmoddi4
 udivmodsi4
+udivmodti4
 udivsi3
+udivti3
 umoddi3
 umodsi3
-atomic_flag_clear
-atomic_flag_clear_explicit
-atomic_flag_test_and_set
-atomic_flag_test_and_set_explicit
-atomic_signal_fence
-atomic_thread_fence
\ No newline at end of file
+umodti3
diff --git a/lib/builtins/Darwin-excludes/osx-i386.txt b/lib/builtins/Darwin-excludes/osx-i386.txt
index 60c0e2d..f2ee7fe 100644
--- a/lib/builtins/Darwin-excludes/osx-i386.txt
+++ b/lib/builtins/Darwin-excludes/osx-i386.txt
@@ -1,5 +1,4 @@
 absvti2
-addtf3
 addvti3
 ashlti3
 ashrti3
@@ -7,7 +6,6 @@
 cmpti2
 ctzti2
 divti3
-divtf3
 ffsti2
 fixdfti
 fixsfti
@@ -25,57 +23,12 @@
 modti3
 muloti4
 multi3
-multf3
 mulvti3
 negti2
 negvti2
 parityti2
 popcountti2
-powitf2
 subvti3
-subtf3
-trampoline_setup
-ucmpti2
-udivmodti4
-udivti3
-umodti3
-absvti2
-addtf3
-addvti3
-ashlti3
-ashrti3
-clzti2
-cmpti2
-ctzti2
-divti3
-divtf3
-ffsti2
-fixdfti
-fixsfti
-fixunsdfti
-fixunssfti
-fixunsxfti
-fixxfti
-floattidf
-floattisf
-floattixf
-floatuntidf
-floatuntisf
-floatuntixf
-lshrti3
-modti3
-muloti4
-multi3
-multf3
-mulvti3
-negti2
-negvti2
-parityti2
-popcountti2
-powitf2
-subvti3
-subtf3
-trampoline_setup
 ucmpti2
 udivmodti4
 udivti3
diff --git a/lib/builtins/Darwin-excludes/osx-x86_64.txt b/lib/builtins/Darwin-excludes/osx-x86_64.txt
deleted file mode 100644
index de1574e..0000000
--- a/lib/builtins/Darwin-excludes/osx-x86_64.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-addtf3
-divtf3
-multf3
-powitf2
-subtf3
-trampoline_setup
-addtf3
-divtf3
-multf3
-powitf2
-subtf3
-trampoline_setup
diff --git a/lib/builtins/Darwin-excludes/osx.txt b/lib/builtins/Darwin-excludes/osx.txt
index 5db2400..6f9d0a7 100644
--- a/lib/builtins/Darwin-excludes/osx.txt
+++ b/lib/builtins/Darwin-excludes/osx.txt
@@ -1 +1,7 @@
 apple_versioning
+addtf3
+divtf3
+multf3
+powitf2
+subtf3
+trampoline_setup
diff --git a/lib/builtins/arm/adddf3vfp.S b/lib/builtins/arm/adddf3vfp.S
index 2825ae9..f4c00a0 100644
--- a/lib/builtins/arm/adddf3vfp.S
+++ b/lib/builtins/arm/adddf3vfp.S
@@ -24,3 +24,6 @@
 	vmov	r0, r1, d6		// move result back to r0/r1 pair
 	bx	lr
 END_COMPILERRT_FUNCTION(__adddf3vfp)
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/addsf3vfp.S b/lib/builtins/arm/addsf3vfp.S
index bff5a7e..af40c1c 100644
--- a/lib/builtins/arm/addsf3vfp.S
+++ b/lib/builtins/arm/addsf3vfp.S
@@ -24,3 +24,6 @@
 	vmov	r0, s14		// move result back to r0
 	bx	lr
 END_COMPILERRT_FUNCTION(__addsf3vfp)
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/aeabi_cdcmp.S b/lib/builtins/arm/aeabi_cdcmp.S
index 036a6f5..8008f5f 100644
--- a/lib/builtins/arm/aeabi_cdcmp.S
+++ b/lib/builtins/arm/aeabi_cdcmp.S
@@ -94,3 +94,5 @@
         b __aeabi_cdcmple
 END_COMPILERRT_FUNCTION(__aeabi_cdrcmple)
 
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/aeabi_cfcmp.S b/lib/builtins/arm/aeabi_cfcmp.S
index 43594e5..274baf7 100644
--- a/lib/builtins/arm/aeabi_cfcmp.S
+++ b/lib/builtins/arm/aeabi_cfcmp.S
@@ -89,3 +89,5 @@
         b __aeabi_cfcmple
 END_COMPILERRT_FUNCTION(__aeabi_cfrcmple)
 
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/aeabi_dcmp.S b/lib/builtins/arm/aeabi_dcmp.S
index 310c35b..43e4392 100644
--- a/lib/builtins/arm/aeabi_dcmp.S
+++ b/lib/builtins/arm/aeabi_dcmp.S
@@ -38,3 +38,6 @@
 DEFINE_AEABI_DCMP(le)
 DEFINE_AEABI_DCMP(ge)
 DEFINE_AEABI_DCMP(gt)
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/aeabi_fcmp.S b/lib/builtins/arm/aeabi_fcmp.S
index 55f49a2..0a1d92a 100644
--- a/lib/builtins/arm/aeabi_fcmp.S
+++ b/lib/builtins/arm/aeabi_fcmp.S
@@ -38,3 +38,6 @@
 DEFINE_AEABI_FCMP(le)
 DEFINE_AEABI_FCMP(ge)
 DEFINE_AEABI_FCMP(gt)
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/aeabi_idivmod.S b/lib/builtins/arm/aeabi_idivmod.S
index 384add3..2fcad86 100644
--- a/lib/builtins/arm/aeabi_idivmod.S
+++ b/lib/builtins/arm/aeabi_idivmod.S
@@ -26,3 +26,6 @@
         add     sp, sp, #4
         pop     { pc }
 END_COMPILERRT_FUNCTION(__aeabi_idivmod)
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/aeabi_ldivmod.S b/lib/builtins/arm/aeabi_ldivmod.S
index ad06f1d..9f161f3 100644
--- a/lib/builtins/arm/aeabi_ldivmod.S
+++ b/lib/builtins/arm/aeabi_ldivmod.S
@@ -29,3 +29,6 @@
         add     sp, sp, #16
         pop     {r11, pc}
 END_COMPILERRT_FUNCTION(__aeabi_ldivmod)
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/aeabi_memcmp.S b/lib/builtins/arm/aeabi_memcmp.S
index 051ce43..33ea548 100644
--- a/lib/builtins/arm/aeabi_memcmp.S
+++ b/lib/builtins/arm/aeabi_memcmp.S
@@ -11,6 +11,7 @@
 
 //  void __aeabi_memcmp(void *dest, void *src, size_t n) { memcmp(dest, src, n); }
 
+        .syntax unified
         .p2align 2
 DEFINE_COMPILERRT_FUNCTION(__aeabi_memcmp)
         b       memcmp
@@ -18,3 +19,6 @@
 
 DEFINE_AEABI_FUNCTION_ALIAS(__aeabi_memcmp4, __aeabi_memcmp)
 DEFINE_AEABI_FUNCTION_ALIAS(__aeabi_memcmp8, __aeabi_memcmp)
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/aeabi_memcpy.S b/lib/builtins/arm/aeabi_memcpy.S
index cf02332..eabfa49 100644
--- a/lib/builtins/arm/aeabi_memcpy.S
+++ b/lib/builtins/arm/aeabi_memcpy.S
@@ -11,6 +11,7 @@
 
 //  void __aeabi_memcpy(void *dest, void *src, size_t n) { memcpy(dest, src, n); }
 
+        .syntax unified
         .p2align 2
 DEFINE_COMPILERRT_FUNCTION(__aeabi_memcpy)
         b       memcpy
@@ -18,3 +19,6 @@
 
 DEFINE_AEABI_FUNCTION_ALIAS(__aeabi_memcpy4, __aeabi_memcpy)
 DEFINE_AEABI_FUNCTION_ALIAS(__aeabi_memcpy8, __aeabi_memcpy)
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/aeabi_memmove.S b/lib/builtins/arm/aeabi_memmove.S
index 4dda06f..1bf08c0 100644
--- a/lib/builtins/arm/aeabi_memmove.S
+++ b/lib/builtins/arm/aeabi_memmove.S
@@ -18,3 +18,6 @@
 
 DEFINE_AEABI_FUNCTION_ALIAS(__aeabi_memmove4, __aeabi_memmove)
 DEFINE_AEABI_FUNCTION_ALIAS(__aeabi_memmove8, __aeabi_memmove)
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/aeabi_memset.S b/lib/builtins/arm/aeabi_memset.S
index c8b49c7..48edd89 100644
--- a/lib/builtins/arm/aeabi_memset.S
+++ b/lib/builtins/arm/aeabi_memset.S
@@ -12,6 +12,7 @@
 //  void __aeabi_memset(void *dest, size_t n, int c) { memset(dest, c, n); }
 //  void __aeabi_memclr(void *dest, size_t n) { __aeabi_memset(dest, n, 0); }
 
+        .syntax unified
         .p2align 2
 DEFINE_COMPILERRT_FUNCTION(__aeabi_memset)
         mov     r3, r1
@@ -32,3 +33,5 @@
 DEFINE_AEABI_FUNCTION_ALIAS(__aeabi_memclr4, __aeabi_memclr)
 DEFINE_AEABI_FUNCTION_ALIAS(__aeabi_memclr8, __aeabi_memclr)
 
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/aeabi_uidivmod.S b/lib/builtins/arm/aeabi_uidivmod.S
index 8ea474d..e1e12d9 100644
--- a/lib/builtins/arm/aeabi_uidivmod.S
+++ b/lib/builtins/arm/aeabi_uidivmod.S
@@ -27,3 +27,6 @@
         add     sp, sp, #4
         pop     { pc }
 END_COMPILERRT_FUNCTION(__aeabi_uidivmod)
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/aeabi_uldivmod.S b/lib/builtins/arm/aeabi_uldivmod.S
index 4e1f8e2..e8aaef2 100644
--- a/lib/builtins/arm/aeabi_uldivmod.S
+++ b/lib/builtins/arm/aeabi_uldivmod.S
@@ -29,3 +29,6 @@
         add	sp, sp, #16
         pop	{r11, pc}
 END_COMPILERRT_FUNCTION(__aeabi_uldivmod)
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/bswapdi2.S b/lib/builtins/arm/bswapdi2.S
index 86f3bba..fb226ce 100644
--- a/lib/builtins/arm/bswapdi2.S
+++ b/lib/builtins/arm/bswapdi2.S
@@ -45,3 +45,6 @@
     mov r1, r2  // r1 = r2 = rev(r0)
     JMP(lr)
 END_COMPILERRT_FUNCTION(__bswapdi2)
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/bswapsi2.S b/lib/builtins/arm/bswapsi2.S
index 59ba815..553c3c2 100644
--- a/lib/builtins/arm/bswapsi2.S
+++ b/lib/builtins/arm/bswapsi2.S
@@ -37,3 +37,6 @@
 #endif
     JMP(lr)
 END_COMPILERRT_FUNCTION(__bswapsi2)
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/clzdi2.S b/lib/builtins/arm/clzdi2.S
index a55abac..6068c17 100644
--- a/lib/builtins/arm/clzdi2.S
+++ b/lib/builtins/arm/clzdi2.S
@@ -95,3 +95,6 @@
 	JMP(lr)
 #endif // __ARM_FEATURE_CLZ
 END_COMPILERRT_FUNCTION(__clzdi2)
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/clzsi2.S b/lib/builtins/arm/clzsi2.S
index 1cd379b..c2ba3a8 100644
--- a/lib/builtins/arm/clzsi2.S
+++ b/lib/builtins/arm/clzsi2.S
@@ -74,3 +74,6 @@
 	JMP(lr)
 #endif // __ARM_FEATURE_CLZ
 END_COMPILERRT_FUNCTION(__clzsi2)
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/comparesf2.S b/lib/builtins/arm/comparesf2.S
index cf71d36..52597b6 100644
--- a/lib/builtins/arm/comparesf2.S
+++ b/lib/builtins/arm/comparesf2.S
@@ -146,3 +146,6 @@
 END_COMPILERRT_FUNCTION(__unordsf2)
 
 DEFINE_AEABI_FUNCTION_ALIAS(__aeabi_fcmpun, __unordsf2)
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/divdf3vfp.S b/lib/builtins/arm/divdf3vfp.S
index 6eebef1..928f538 100644
--- a/lib/builtins/arm/divdf3vfp.S
+++ b/lib/builtins/arm/divdf3vfp.S
@@ -24,3 +24,6 @@
 	vmov	r0, r1, d5		// move result back to r0/r1 pair
 	bx	lr
 END_COMPILERRT_FUNCTION(__divdf3vfp)
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/divmodsi4.S b/lib/builtins/arm/divmodsi4.S
index 646b9ab..999c310 100644
--- a/lib/builtins/arm/divmodsi4.S
+++ b/lib/builtins/arm/divmodsi4.S
@@ -72,3 +72,6 @@
     CLEAR_FRAME_AND_RETURN
 #endif
 END_COMPILERRT_FUNCTION(__divmodsi4)
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/divsf3vfp.S b/lib/builtins/arm/divsf3vfp.S
index fdbaebc..a2e297f 100644
--- a/lib/builtins/arm/divsf3vfp.S
+++ b/lib/builtins/arm/divsf3vfp.S
@@ -24,3 +24,6 @@
 	vmov	r0, s13		// move result back to r0
 	bx	lr
 END_COMPILERRT_FUNCTION(__divsf3vfp)
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/divsi3.S b/lib/builtins/arm/divsi3.S
index adf8f94..7e23ba4 100644
--- a/lib/builtins/arm/divsi3.S
+++ b/lib/builtins/arm/divsi3.S
@@ -63,3 +63,6 @@
     CLEAR_FRAME_AND_RETURN
 #endif
 END_COMPILERRT_FUNCTION(__divsi3)
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/eqdf2vfp.S b/lib/builtins/arm/eqdf2vfp.S
index 7f2fbc3..95e6bb3 100644
--- a/lib/builtins/arm/eqdf2vfp.S
+++ b/lib/builtins/arm/eqdf2vfp.S
@@ -27,3 +27,6 @@
 	movne	r0, #0
 	bx	lr
 END_COMPILERRT_FUNCTION(__eqdf2vfp)
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/eqsf2vfp.S b/lib/builtins/arm/eqsf2vfp.S
index a318b33..fbac139 100644
--- a/lib/builtins/arm/eqsf2vfp.S
+++ b/lib/builtins/arm/eqsf2vfp.S
@@ -27,3 +27,6 @@
 	movne	r0, #0
 	bx	lr
 END_COMPILERRT_FUNCTION(__eqsf2vfp)
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/extendsfdf2vfp.S b/lib/builtins/arm/extendsfdf2vfp.S
index b998e58..563bf92 100644
--- a/lib/builtins/arm/extendsfdf2vfp.S
+++ b/lib/builtins/arm/extendsfdf2vfp.S
@@ -24,3 +24,6 @@
 	vmov	r0, r1, d7   // return result in r0/r1 pair
 	bx	lr
 END_COMPILERRT_FUNCTION(__extendsfdf2vfp)
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/fixdfsivfp.S b/lib/builtins/arm/fixdfsivfp.S
index e3bd8e0..8263ff9 100644
--- a/lib/builtins/arm/fixdfsivfp.S
+++ b/lib/builtins/arm/fixdfsivfp.S
@@ -24,3 +24,6 @@
 	vmov	r0, s15	      // move s15 to result register
 	bx	lr
 END_COMPILERRT_FUNCTION(__fixdfsivfp)
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/fixsfsivfp.S b/lib/builtins/arm/fixsfsivfp.S
index 3d0d0f5..c7c3b81 100644
--- a/lib/builtins/arm/fixsfsivfp.S
+++ b/lib/builtins/arm/fixsfsivfp.S
@@ -24,3 +24,6 @@
 	vmov	r0, s15	       // move s15 to result register
 	bx	lr
 END_COMPILERRT_FUNCTION(__fixsfsivfp)
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/fixunsdfsivfp.S b/lib/builtins/arm/fixunsdfsivfp.S
index 35dda5b..9cc1e62 100644
--- a/lib/builtins/arm/fixunsdfsivfp.S
+++ b/lib/builtins/arm/fixunsdfsivfp.S
@@ -25,3 +25,6 @@
 	vmov	r0, s15	      // move s15 to result register
 	bx	lr
 END_COMPILERRT_FUNCTION(__fixunsdfsivfp)
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/fixunssfsivfp.S b/lib/builtins/arm/fixunssfsivfp.S
index 5c3a7d9..79d7082 100644
--- a/lib/builtins/arm/fixunssfsivfp.S
+++ b/lib/builtins/arm/fixunssfsivfp.S
@@ -25,3 +25,6 @@
 	vmov	r0, s15	       // move s15 to result register
 	bx	lr
 END_COMPILERRT_FUNCTION(__fixunssfsivfp)
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/floatsidfvfp.S b/lib/builtins/arm/floatsidfvfp.S
index d691849..7623f26 100644
--- a/lib/builtins/arm/floatsidfvfp.S
+++ b/lib/builtins/arm/floatsidfvfp.S
@@ -24,3 +24,6 @@
 	vmov	r0, r1, d7     // move d7 to result register pair r0/r1
 	bx	lr
 END_COMPILERRT_FUNCTION(__floatsidfvfp)
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/floatsisfvfp.S b/lib/builtins/arm/floatsisfvfp.S
index 4a0cb39..c73dfac 100644
--- a/lib/builtins/arm/floatsisfvfp.S
+++ b/lib/builtins/arm/floatsisfvfp.S
@@ -24,3 +24,6 @@
 	vmov	r0, s15        // move s15 to result register
 	bx	lr
 END_COMPILERRT_FUNCTION(__floatsisfvfp)
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/floatunssidfvfp.S b/lib/builtins/arm/floatunssidfvfp.S
index d92969e..2a59fdb 100644
--- a/lib/builtins/arm/floatunssidfvfp.S
+++ b/lib/builtins/arm/floatunssidfvfp.S
@@ -24,3 +24,6 @@
 	vmov	r0, r1, d7     // move d7 to result register pair r0/r1
 	bx	lr
 END_COMPILERRT_FUNCTION(__floatunssidfvfp)
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/floatunssisfvfp.S b/lib/builtins/arm/floatunssisfvfp.S
index f6aeba5..c096263 100644
--- a/lib/builtins/arm/floatunssisfvfp.S
+++ b/lib/builtins/arm/floatunssisfvfp.S
@@ -24,3 +24,6 @@
 	vmov	r0, s15        // move s15 to result register
 	bx	lr
 END_COMPILERRT_FUNCTION(__floatunssisfvfp)
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/gedf2vfp.S b/lib/builtins/arm/gedf2vfp.S
index 9e23527..72f13ef 100644
--- a/lib/builtins/arm/gedf2vfp.S
+++ b/lib/builtins/arm/gedf2vfp.S
@@ -27,3 +27,6 @@
 	movlt	r0, #0
 	bx	lr
 END_COMPILERRT_FUNCTION(__gedf2vfp)
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/gesf2vfp.S b/lib/builtins/arm/gesf2vfp.S
index 0ff6084..c9ee52c 100644
--- a/lib/builtins/arm/gesf2vfp.S
+++ b/lib/builtins/arm/gesf2vfp.S
@@ -27,3 +27,6 @@
 	movlt	r0, #0
 	bx	lr
 END_COMPILERRT_FUNCTION(__gesf2vfp)
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/gtdf2vfp.S b/lib/builtins/arm/gtdf2vfp.S
index 3dc5d5b..c7f2775 100644
--- a/lib/builtins/arm/gtdf2vfp.S
+++ b/lib/builtins/arm/gtdf2vfp.S
@@ -27,3 +27,6 @@
 	movle	r0, #0
 	bx	lr
 END_COMPILERRT_FUNCTION(__gtdf2vfp)
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/gtsf2vfp.S b/lib/builtins/arm/gtsf2vfp.S
index ddd843a..7d49e45 100644
--- a/lib/builtins/arm/gtsf2vfp.S
+++ b/lib/builtins/arm/gtsf2vfp.S
@@ -27,3 +27,6 @@
 	movle	r0, #0
 	bx	lr
 END_COMPILERRT_FUNCTION(__gtsf2vfp)
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/ledf2vfp.S b/lib/builtins/arm/ledf2vfp.S
index b06ff6d..ca5b553 100644
--- a/lib/builtins/arm/ledf2vfp.S
+++ b/lib/builtins/arm/ledf2vfp.S
@@ -27,3 +27,6 @@
 	movhi	r0, #0
 	bx	lr
 END_COMPILERRT_FUNCTION(__ledf2vfp)
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/lesf2vfp.S b/lib/builtins/arm/lesf2vfp.S
index 9b33c0c..f25422e 100644
--- a/lib/builtins/arm/lesf2vfp.S
+++ b/lib/builtins/arm/lesf2vfp.S
@@ -27,3 +27,6 @@
 	movhi	r0, #0
 	bx	lr
 END_COMPILERRT_FUNCTION(__lesf2vfp)
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/ltdf2vfp.S b/lib/builtins/arm/ltdf2vfp.S
index 9f794b0..6e2c099 100644
--- a/lib/builtins/arm/ltdf2vfp.S
+++ b/lib/builtins/arm/ltdf2vfp.S
@@ -27,3 +27,6 @@
 	movpl	r0, #0
 	bx	lr
 END_COMPILERRT_FUNCTION(__ltdf2vfp)
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/ltsf2vfp.S b/lib/builtins/arm/ltsf2vfp.S
index ba190d9..95febb6 100644
--- a/lib/builtins/arm/ltsf2vfp.S
+++ b/lib/builtins/arm/ltsf2vfp.S
@@ -27,3 +27,6 @@
 	movpl	r0, #0
 	bx	lr
 END_COMPILERRT_FUNCTION(__ltsf2vfp)
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/modsi3.S b/lib/builtins/arm/modsi3.S
index 295a227..1d302ed 100644
--- a/lib/builtins/arm/modsi3.S
+++ b/lib/builtins/arm/modsi3.S
@@ -61,3 +61,6 @@
     CLEAR_FRAME_AND_RETURN
 #endif
 END_COMPILERRT_FUNCTION(__modsi3)
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/muldf3vfp.S b/lib/builtins/arm/muldf3vfp.S
index 636cc71..f638de1 100644
--- a/lib/builtins/arm/muldf3vfp.S
+++ b/lib/builtins/arm/muldf3vfp.S
@@ -24,3 +24,6 @@
 	vmov 	r0, r1, d6         // move result back to r0/r1 pair
 	bx	lr
 END_COMPILERRT_FUNCTION(__muldf3vfp)
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/mulsf3vfp.S b/lib/builtins/arm/mulsf3vfp.S
index 7f40082..bef58d3 100644
--- a/lib/builtins/arm/mulsf3vfp.S
+++ b/lib/builtins/arm/mulsf3vfp.S
@@ -24,3 +24,6 @@
 	vmov	r0, s13		// move result back to r0
 	bx	lr
 END_COMPILERRT_FUNCTION(__mulsf3vfp)
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/nedf2vfp.S b/lib/builtins/arm/nedf2vfp.S
index 7ab2f55..78cf529 100644
--- a/lib/builtins/arm/nedf2vfp.S
+++ b/lib/builtins/arm/nedf2vfp.S
@@ -27,3 +27,6 @@
 	moveq	r0, #0
 	bx	lr
 END_COMPILERRT_FUNCTION(__nedf2vfp)
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/negdf2vfp.S b/lib/builtins/arm/negdf2vfp.S
index 56d73c6..01c8ba6 100644
--- a/lib/builtins/arm/negdf2vfp.S
+++ b/lib/builtins/arm/negdf2vfp.S
@@ -21,3 +21,6 @@
 	eor	r1, r1, #-2147483648	// flip sign bit on double in r0/r1 pair
 	bx	lr
 END_COMPILERRT_FUNCTION(__negdf2vfp)
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/negsf2vfp.S b/lib/builtins/arm/negsf2vfp.S
index a6e32e1..797abb3 100644
--- a/lib/builtins/arm/negsf2vfp.S
+++ b/lib/builtins/arm/negsf2vfp.S
@@ -21,3 +21,6 @@
 	eor	r0, r0, #-2147483648	// flip sign bit on float in r0
 	bx	lr
 END_COMPILERRT_FUNCTION(__negsf2vfp)
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/nesf2vfp.S b/lib/builtins/arm/nesf2vfp.S
index 9fe8ecd..554d3e4 100644
--- a/lib/builtins/arm/nesf2vfp.S
+++ b/lib/builtins/arm/nesf2vfp.S
@@ -27,3 +27,6 @@
 	moveq	r0, #0
 	bx	lr
 END_COMPILERRT_FUNCTION(__nesf2vfp)
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/restore_vfp_d8_d15_regs.S b/lib/builtins/arm/restore_vfp_d8_d15_regs.S
index 0f6ea51..0692cf3 100644
--- a/lib/builtins/arm/restore_vfp_d8_d15_regs.S
+++ b/lib/builtins/arm/restore_vfp_d8_d15_regs.S
@@ -31,3 +31,5 @@
 	bx      lr                      // return to prolog
 END_COMPILERRT_FUNCTION(__restore_vfp_d8_d15_regs)
 
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/save_vfp_d8_d15_regs.S b/lib/builtins/arm/save_vfp_d8_d15_regs.S
index f1d90e7..544dd54 100644
--- a/lib/builtins/arm/save_vfp_d8_d15_regs.S
+++ b/lib/builtins/arm/save_vfp_d8_d15_regs.S
@@ -31,3 +31,5 @@
 	bx      lr                      // return to prolog
 END_COMPILERRT_FUNCTION(__save_vfp_d8_d15_regs)
 
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/subdf3vfp.S b/lib/builtins/arm/subdf3vfp.S
index 5f3c0f7..1fc7d18 100644
--- a/lib/builtins/arm/subdf3vfp.S
+++ b/lib/builtins/arm/subdf3vfp.S
@@ -24,3 +24,6 @@
 	vmov 	r0, r1, d6         // move result back to r0/r1 pair
 	bx	lr
 END_COMPILERRT_FUNCTION(__subdf3vfp)
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/subsf3vfp.S b/lib/builtins/arm/subsf3vfp.S
index d6e06df..11fe386 100644
--- a/lib/builtins/arm/subsf3vfp.S
+++ b/lib/builtins/arm/subsf3vfp.S
@@ -25,3 +25,6 @@
 	vmov	r0, s14		// move result back to r0
 	bx	lr
 END_COMPILERRT_FUNCTION(__subsf3vfp)
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/switch16.S b/lib/builtins/arm/switch16.S
index 3c3a6b1..df9e38e 100644
--- a/lib/builtins/arm/switch16.S
+++ b/lib/builtins/arm/switch16.S
@@ -42,3 +42,5 @@
 	bx      ip                      // jump to computed label
 END_COMPILERRT_FUNCTION(__switch16)
 
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/switch32.S b/lib/builtins/arm/switch32.S
index b38cd2b..d97b536 100644
--- a/lib/builtins/arm/switch32.S
+++ b/lib/builtins/arm/switch32.S
@@ -42,3 +42,5 @@
 	bx      ip                       // jump to computed label
 END_COMPILERRT_FUNCTION(__switch32)
 
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/switch8.S b/lib/builtins/arm/switch8.S
index d7c2042..4d9e0ea 100644
--- a/lib/builtins/arm/switch8.S
+++ b/lib/builtins/arm/switch8.S
@@ -40,3 +40,5 @@
 	bx      ip                      // jump to computed label
 END_COMPILERRT_FUNCTION(__switch8)
 
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/switchu8.S b/lib/builtins/arm/switchu8.S
index 1844f11..4ffe35f 100644
--- a/lib/builtins/arm/switchu8.S
+++ b/lib/builtins/arm/switchu8.S
@@ -40,3 +40,5 @@
 	bx      ip                      // jump to computed label
 END_COMPILERRT_FUNCTION(__switchu8)
 
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/sync_fetch_and_add_4.S b/lib/builtins/arm/sync_fetch_and_add_4.S
index 54c33e2..7877d6c 100644
--- a/lib/builtins/arm/sync_fetch_and_add_4.S
+++ b/lib/builtins/arm/sync_fetch_and_add_4.S
@@ -19,3 +19,5 @@
 
 SYNC_OP_4(add_4)
 
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/sync_fetch_and_add_8.S b/lib/builtins/arm/sync_fetch_and_add_8.S
index 5724bb1..1df07a3 100644
--- a/lib/builtins/arm/sync_fetch_and_add_8.S
+++ b/lib/builtins/arm/sync_fetch_and_add_8.S
@@ -22,3 +22,5 @@
 SYNC_OP_8(add_8)
 #endif
 
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/sync_fetch_and_and_4.S b/lib/builtins/arm/sync_fetch_and_and_4.S
index e2b77a1..720ff02 100644
--- a/lib/builtins/arm/sync_fetch_and_and_4.S
+++ b/lib/builtins/arm/sync_fetch_and_and_4.S
@@ -17,3 +17,6 @@
 #define and_4(rD, rN, rM)  and rD, rN, rM
 
 SYNC_OP_4(and_4)
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/sync_fetch_and_and_8.S b/lib/builtins/arm/sync_fetch_and_and_8.S
index a74163a..4f7b5ca 100644
--- a/lib/builtins/arm/sync_fetch_and_and_8.S
+++ b/lib/builtins/arm/sync_fetch_and_and_8.S
@@ -21,3 +21,6 @@
 
 SYNC_OP_8(and_8)
 #endif
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/sync_fetch_and_max_4.S b/lib/builtins/arm/sync_fetch_and_max_4.S
index 01e4f44..43da9c7 100644
--- a/lib/builtins/arm/sync_fetch_and_max_4.S
+++ b/lib/builtins/arm/sync_fetch_and_max_4.S
@@ -18,3 +18,5 @@
 
 SYNC_OP_4(max_4)
 
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/sync_fetch_and_max_8.S b/lib/builtins/arm/sync_fetch_and_max_8.S
index 1eef2b2..898fc62 100644
--- a/lib/builtins/arm/sync_fetch_and_max_8.S
+++ b/lib/builtins/arm/sync_fetch_and_max_8.S
@@ -19,3 +19,6 @@
 
 SYNC_OP_8(max_8)
 #endif
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/sync_fetch_and_min_4.S b/lib/builtins/arm/sync_fetch_and_min_4.S
index 015626b..bba31a0 100644
--- a/lib/builtins/arm/sync_fetch_and_min_4.S
+++ b/lib/builtins/arm/sync_fetch_and_min_4.S
@@ -18,3 +18,5 @@
 
 SYNC_OP_4(min_4)
 
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/sync_fetch_and_min_8.S b/lib/builtins/arm/sync_fetch_and_min_8.S
index ad5cce0..e7ccf9f 100644
--- a/lib/builtins/arm/sync_fetch_and_min_8.S
+++ b/lib/builtins/arm/sync_fetch_and_min_8.S
@@ -19,3 +19,6 @@
 
 SYNC_OP_8(min_8)
 #endif
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/sync_fetch_and_nand_4.S b/lib/builtins/arm/sync_fetch_and_nand_4.S
index b32a314..c13dd39 100644
--- a/lib/builtins/arm/sync_fetch_and_nand_4.S
+++ b/lib/builtins/arm/sync_fetch_and_nand_4.S
@@ -18,3 +18,5 @@
 
 SYNC_OP_4(nand_4)
 
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/sync_fetch_and_nand_8.S b/lib/builtins/arm/sync_fetch_and_nand_8.S
index a2c17c0..e8107ab 100644
--- a/lib/builtins/arm/sync_fetch_and_nand_8.S
+++ b/lib/builtins/arm/sync_fetch_and_nand_8.S
@@ -22,3 +22,5 @@
 SYNC_OP_8(nand_8)
 #endif
 
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/sync_fetch_and_or_4.S b/lib/builtins/arm/sync_fetch_and_or_4.S
index f2e0857..6726571 100644
--- a/lib/builtins/arm/sync_fetch_and_or_4.S
+++ b/lib/builtins/arm/sync_fetch_and_or_4.S
@@ -18,3 +18,5 @@
 
 SYNC_OP_4(or_4)
 
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/sync_fetch_and_or_8.S b/lib/builtins/arm/sync_fetch_and_or_8.S
index 87b940b..f7f162c 100644
--- a/lib/builtins/arm/sync_fetch_and_or_8.S
+++ b/lib/builtins/arm/sync_fetch_and_or_8.S
@@ -22,3 +22,5 @@
 SYNC_OP_8(or_8)
 #endif
 
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/sync_fetch_and_sub_4.S b/lib/builtins/arm/sync_fetch_and_sub_4.S
index 460b2bc..b9326b1 100644
--- a/lib/builtins/arm/sync_fetch_and_sub_4.S
+++ b/lib/builtins/arm/sync_fetch_and_sub_4.S
@@ -19,3 +19,5 @@
 
 SYNC_OP_4(sub_4)
 
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/sync_fetch_and_sub_8.S b/lib/builtins/arm/sync_fetch_and_sub_8.S
index a8035a2..6ce743e 100644
--- a/lib/builtins/arm/sync_fetch_and_sub_8.S
+++ b/lib/builtins/arm/sync_fetch_and_sub_8.S
@@ -22,3 +22,5 @@
 SYNC_OP_8(sub_8)
 #endif
 
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/sync_fetch_and_umax_4.S b/lib/builtins/arm/sync_fetch_and_umax_4.S
index c591530..b8d19ff 100644
--- a/lib/builtins/arm/sync_fetch_and_umax_4.S
+++ b/lib/builtins/arm/sync_fetch_and_umax_4.S
@@ -18,3 +18,5 @@
 
 SYNC_OP_4(umax_4)
 
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/sync_fetch_and_umax_8.S b/lib/builtins/arm/sync_fetch_and_umax_8.S
index d9b7965..34442fd 100644
--- a/lib/builtins/arm/sync_fetch_and_umax_8.S
+++ b/lib/builtins/arm/sync_fetch_and_umax_8.S
@@ -19,3 +19,6 @@
 
 SYNC_OP_8(umax_8)
 #endif
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/sync_fetch_and_umin_4.S b/lib/builtins/arm/sync_fetch_and_umin_4.S
index 9f3896f..0998e3e 100644
--- a/lib/builtins/arm/sync_fetch_and_umin_4.S
+++ b/lib/builtins/arm/sync_fetch_and_umin_4.S
@@ -18,3 +18,5 @@
 
 SYNC_OP_4(umin_4)
 
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/sync_fetch_and_umin_8.S b/lib/builtins/arm/sync_fetch_and_umin_8.S
index 7bf5e23..558f913 100644
--- a/lib/builtins/arm/sync_fetch_and_umin_8.S
+++ b/lib/builtins/arm/sync_fetch_and_umin_8.S
@@ -19,3 +19,6 @@
 
 SYNC_OP_8(umin_8)
 #endif
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/sync_fetch_and_xor_4.S b/lib/builtins/arm/sync_fetch_and_xor_4.S
index 7e7c90c..824f491 100644
--- a/lib/builtins/arm/sync_fetch_and_xor_4.S
+++ b/lib/builtins/arm/sync_fetch_and_xor_4.S
@@ -18,3 +18,5 @@
 
 SYNC_OP_4(xor_4)
 
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/sync_fetch_and_xor_8.S b/lib/builtins/arm/sync_fetch_and_xor_8.S
index ea9aa6d..073fb9c 100644
--- a/lib/builtins/arm/sync_fetch_and_xor_8.S
+++ b/lib/builtins/arm/sync_fetch_and_xor_8.S
@@ -22,3 +22,5 @@
 SYNC_OP_8(xor_8)
 #endif
 
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/sync_synchronize.S b/lib/builtins/arm/sync_synchronize.S
index 178f245..61d1db9 100644
--- a/lib/builtins/arm/sync_synchronize.S
+++ b/lib/builtins/arm/sync_synchronize.S
@@ -33,3 +33,6 @@
 	.subsections_via_symbols
 		
 #endif
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/truncdfsf2vfp.S b/lib/builtins/arm/truncdfsf2vfp.S
index fa4362c..04287ad 100644
--- a/lib/builtins/arm/truncdfsf2vfp.S
+++ b/lib/builtins/arm/truncdfsf2vfp.S
@@ -24,3 +24,6 @@
 	vmov 	r0, s15      // return result in r0
 	bx	lr
 END_COMPILERRT_FUNCTION(__truncdfsf2vfp)
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/udivmodsi4.S b/lib/builtins/arm/udivmodsi4.S
index 85b8493..1ad8ee3 100644
--- a/lib/builtins/arm/udivmodsi4.S
+++ b/lib/builtins/arm/udivmodsi4.S
@@ -182,3 +182,6 @@
 #endif
 
 END_COMPILERRT_FUNCTION(__udivmodsi4)
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/udivsi3.S b/lib/builtins/arm/udivsi3.S
index 165b2b5..085f8fb 100644
--- a/lib/builtins/arm/udivsi3.S
+++ b/lib/builtins/arm/udivsi3.S
@@ -168,3 +168,6 @@
 #endif
 
 END_COMPILERRT_FUNCTION(__udivsi3)
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/umodsi3.S b/lib/builtins/arm/umodsi3.S
index 9e7a148..672487e 100644
--- a/lib/builtins/arm/umodsi3.S
+++ b/lib/builtins/arm/umodsi3.S
@@ -159,3 +159,6 @@
 #endif
 
 END_COMPILERRT_FUNCTION(__umodsi3)
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/unorddf2vfp.S b/lib/builtins/arm/unorddf2vfp.S
index c4bea2d..022dd7a 100644
--- a/lib/builtins/arm/unorddf2vfp.S
+++ b/lib/builtins/arm/unorddf2vfp.S
@@ -27,3 +27,6 @@
 	movvc	r0, #0
 	bx	lr
 END_COMPILERRT_FUNCTION(__unorddf2vfp)
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm/unordsf2vfp.S b/lib/builtins/arm/unordsf2vfp.S
index 886e965..5ebdd3d 100644
--- a/lib/builtins/arm/unordsf2vfp.S
+++ b/lib/builtins/arm/unordsf2vfp.S
@@ -27,3 +27,6 @@
 	movvc	r0, #0
 	bx	lr
 END_COMPILERRT_FUNCTION(__unordsf2vfp)
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/arm64/dummy.c b/lib/builtins/arm64/dummy.c
deleted file mode 100644
index 76bec30..0000000
--- a/lib/builtins/arm64/dummy.c
+++ /dev/null
@@ -1,13 +0,0 @@
-/* ===---------- dummy.c - Implements dummy function, for bringup -----------===
- *
- *                     The LLVM Compiler Infrastructure
- *
- * This file is dual licensed under the MIT and the University of Illinois Open
- * Source Licenses. See LICENSE.TXT for details.
- *
- * ===----------------------------------------------------------------------===
- */
-
-static void dummy(void) __attribute__((used));
-
-static void dummy(void) {}
diff --git a/lib/builtins/assembly.h b/lib/builtins/assembly.h
index c289705..5fc74f6 100644
--- a/lib/builtins/assembly.h
+++ b/lib/builtins/assembly.h
@@ -30,6 +30,8 @@
 #define SYMBOL_IS_FUNC(name)
 #define CONST_SECTION .const
 
+#define NO_EXEC_STACK_DIRECTIVE
+
 #elif defined(__ELF__)
 
 #define HIDDEN(name) .hidden name
@@ -42,6 +44,12 @@
 #endif
 #define CONST_SECTION .section .rodata
 
+#if defined(__GNU__) || defined(__ANDROID__) || defined(__FreeBSD__)
+#define NO_EXEC_STACK_DIRECTIVE .section .note.GNU-stack,"",%progbits
+#else
+#define NO_EXEC_STACK_DIRECTIVE
+#endif
+
 #else // !__APPLE__ && !__ELF__
 
 #define HIDDEN(name)
@@ -54,6 +62,8 @@
   .endef
 #define CONST_SECTION .section .rdata,"rd"
 
+#define NO_EXEC_STACK_DIRECTIVE
+
 #endif
 
 #if defined(__arm__)
diff --git a/lib/builtins/clear_cache.c b/lib/builtins/clear_cache.c
index ede7659..55bbdd3 100644
--- a/lib/builtins/clear_cache.c
+++ b/lib/builtins/clear_cache.c
@@ -14,6 +14,15 @@
 #if __APPLE__
   #include <libkern/OSCacheControl.h>
 #endif
+
+#if defined(_WIN32)
+/* Forward declare Win32 APIs since the GCC mode driver does not handle the
+   newer SDKs as well as needed.  */
+uint32_t FlushInstructionCache(uintptr_t hProcess, void *lpBaseAddress,
+                               uintptr_t dwSize);
+uintptr_t GetCurrentProcess(void);
+#endif
+
 #if (defined(__FreeBSD__) || defined(__Bitrig__)) && defined(__arm__)
   #include <sys/types.h>
   #include <machine/sysarch.h>
@@ -73,7 +82,7 @@
   #endif
 #endif
 
-#if defined(__ANDROID__) && defined(__arm__)
+#if defined(__linux__) && defined(__arm__)
   #include <asm/unistd.h>
 #endif
 
@@ -98,16 +107,18 @@
         arg.len = (uintptr_t)end - (uintptr_t)start;
 
         sysarch(ARM_SYNC_ICACHE, &arg);
-    #elif defined(__ANDROID__)
+    #elif defined(__linux__)
          register int start_reg __asm("r0") = (int) (intptr_t) start;
          const register int end_reg __asm("r1") = (int) (intptr_t) end;
-         const register int flags __asm("r2") = 0;
          const register int syscall_nr __asm("r7") = __ARM_NR_cacheflush;
-        __asm __volatile("svc 0x0" : "=r"(start_reg)
-            : "r"(syscall_nr), "r"(start_reg), "r"(end_reg), "r"(flags) : "r0");
+         __asm __volatile("svc 0x0"
+                          : "=r"(start_reg)
+                          : "r"(syscall_nr), "r"(start_reg), "r"(end_reg));
          if (start_reg != 0) {
              compilerrt_abort();
          }
+    #elif defined(_WIN32)
+        FlushInstructionCache(GetCurrentProcess(), start, end - start);
     #else
         compilerrt_abort();
     #endif
diff --git a/lib/builtins/emutls.c b/lib/builtins/emutls.c
index 09e7956..eccbf53 100644
--- a/lib/builtins/emutls.c
+++ b/lib/builtins/emutls.c
@@ -27,9 +27,14 @@
  * If xyz has non-zero initial value, __emutls_v.xyz's "value"
  * will point to __emutls_t.xyz, which has the initial value.
  */
+typedef unsigned int gcc_word __attribute__((mode(word)));
 typedef struct __emutls_control {
-    size_t size;  /* size of the object in bytes */
-    size_t align;  /* alignment of the object in bytes */
+    /* Must use gcc_word here, instead of size_t, to match GCC.  When
+       gcc_word is larger than size_t, the upper extra bits are all
+       zeros.  We can use variables of size_t to operate on size and
+       align.  */
+    gcc_word size;  /* size of the object in bytes */
+    gcc_word align;  /* alignment of the object in bytes */
     union {
         uintptr_t index;  /* data[index-1] is the object address */
         void* address;  /* object address, when in single thread env */
@@ -67,21 +72,20 @@
 /* Emulated TLS objects are always allocated at run-time. */
 static __inline void *emutls_allocate_object(__emutls_control *control) {
     /* Use standard C types, check with gcc's emutls.o. */
-    typedef unsigned int gcc_word __attribute__((mode(word)));
     typedef unsigned int gcc_pointer __attribute__((mode(pointer)));
-    COMPILE_TIME_ASSERT(sizeof(size_t) == sizeof(gcc_word));
     COMPILE_TIME_ASSERT(sizeof(uintptr_t) == sizeof(gcc_pointer));
     COMPILE_TIME_ASSERT(sizeof(uintptr_t) == sizeof(void*));
 
     size_t size = control->size;
     size_t align = control->align;
+    void* base;
     if (align < sizeof(void*))
         align = sizeof(void*);
     /* Make sure that align is power of 2. */
     if ((align & (align - 1)) != 0)
         abort();
 
-    void* base = emutls_memalign_alloc(align, size);
+    base = emutls_memalign_alloc(align, size);
     if (control->value)
         memcpy(base, control->value, size);
     else
@@ -160,12 +164,14 @@
     emutls_address_array* array = pthread_getspecific(emutls_pthread_key);
     if (array == NULL) {
         uintptr_t new_size = emutls_new_data_array_size(index);
-        array = calloc(new_size + 1, sizeof(void*));
+        array = malloc(new_size * sizeof(void *) + sizeof(emutls_address_array));
+        if (array)
+            memset(array->data, 0, new_size * sizeof(void*));
         emutls_check_array_set_size(array, new_size);
     } else if (index > array->size) {
         uintptr_t orig_size = array->size;
         uintptr_t new_size = emutls_new_data_array_size(index);
-        array = realloc(array, (new_size + 1) * sizeof(void*));
+        array = realloc(array, new_size * sizeof(void *) + sizeof(emutls_address_array));
         if (array)
             memset(array->data + orig_size, 0,
                    (new_size - orig_size) * sizeof(void*));
diff --git a/lib/builtins/floatdidf.c b/lib/builtins/floatdidf.c
index a300c9f..2b023ad 100644
--- a/lib/builtins/floatdidf.c
+++ b/lib/builtins/floatdidf.c
@@ -16,7 +16,7 @@
 
 /* Returns: convert a to a double, rounding toward even. */
 
-/* Assumption: double is a IEEE 64 bit floating point type 
+/* Assumption: double is a IEEE 64 bit floating point type
  *             di_int is a 64 bit integral type
  */
 
@@ -32,16 +32,16 @@
 COMPILER_RT_ABI double
 __floatdidf(di_int a)
 {
-	static const double twop52 = 4503599627370496.0; // 0x1.0p52
-	static const double twop32 = 4294967296.0; // 0x1.0p32
-	
-	union { int64_t x; double d; } low = { .d = twop52 };
-	
-	const double high = (int32_t)(a >> 32) * twop32;
-	low.x |= a & INT64_C(0x00000000ffffffff);
-	
-	const double result = (high - twop52) + low.d;
-	return result;
+    static const double twop52 = 4503599627370496.0; // 0x1.0p52
+    static const double twop32 = 4294967296.0; // 0x1.0p32
+
+    union { int64_t x; double d; } low = { .d = twop52 };
+
+    const double high = (int32_t)(a >> 32) * twop32;
+    low.x |= a & INT64_C(0x00000000ffffffff);
+
+    const double result = (high - twop52) + low.d;
+    return result;
 }
 
 #else
@@ -98,10 +98,10 @@
         /* a is now rounded to DBL_MANT_DIG bits */
     }
     double_bits fb;
-    fb.u.high = ((su_int)s & 0x80000000) |        /* sign */
-                ((e + 1023) << 20)      |        /* exponent */
-                ((su_int)(a >> 32) & 0x000FFFFF); /* mantissa-high */
-    fb.u.low = (su_int)a;                         /* mantissa-low */
+    fb.u.s.high = ((su_int)s & 0x80000000) |        /* sign */
+                  ((e + 1023) << 20)       |        /* exponent */
+                  ((su_int)(a >> 32) & 0x000FFFFF); /* mantissa-high */
+    fb.u.s.low = (su_int)a;                         /* mantissa-low */
     return fb.f;
 }
 #endif
diff --git a/lib/builtins/floattidf.c b/lib/builtins/floattidf.c
index 6331ba5..2702a3c 100644
--- a/lib/builtins/floattidf.c
+++ b/lib/builtins/floattidf.c
@@ -10,7 +10,7 @@
  * This file implements __floattidf for the compiler_rt library.
  *
  * ===----------------------------------------------------------------------===
- */ 
+ */
 
 #include "int_lib.h"
 
@@ -18,11 +18,11 @@
 
 /* Returns: convert a to a double, rounding toward even.*/
 
-/* Assumption: double is a IEEE 64 bit floating point type 
+/* Assumption: double is an IEEE 64 bit floating point type
  *            ti_int is a 128 bit integral type
  */
 
-/* seee eeee eeee mmmm mmmm mmmm mmmm mmmm | mmmm mmmm mmmm mmmm mmmm mmmm mmmm mmmm */ 
+/* seee eeee eeee mmmm mmmm mmmm mmmm mmmm | mmmm mmmm mmmm mmmm mmmm mmmm mmmm mmmm */
 
 COMPILER_RT_ABI double
 __floattidf(ti_int a)
diff --git a/lib/builtins/floatundidf.c b/lib/builtins/floatundidf.c
index 67aa86e..cfd3a7a 100644
--- a/lib/builtins/floatundidf.c
+++ b/lib/builtins/floatundidf.c
@@ -14,7 +14,7 @@
 
 /* Returns: convert a to a double, rounding toward even. */
 
-/* Assumption: double is a IEEE 64 bit floating point type 
+/* Assumption: double is an IEEE 64 bit floating point type
  *             du_int is a 64 bit integral type
  */
 
@@ -32,24 +32,24 @@
 COMPILER_RT_ABI double
 __floatundidf(du_int a)
 {
-	static const double twop52 = 4503599627370496.0; // 0x1.0p52
-	static const double twop84 = 19342813113834066795298816.0; // 0x1.0p84
-	static const double twop84_plus_twop52 = 19342813118337666422669312.0; // 0x1.00000001p84
-	
-	union { uint64_t x; double d; } high = { .d = twop84 };
-	union { uint64_t x; double d; } low = { .d = twop52 };
-	
-	high.x |= a >> 32;
-	low.x |= a & UINT64_C(0x00000000ffffffff);
-	
-	const double result = (high.d - twop84_plus_twop52) + low.d;
-	return result;
+    static const double twop52 = 4503599627370496.0; // 0x1.0p52
+    static const double twop84 = 19342813113834066795298816.0; // 0x1.0p84
+    static const double twop84_plus_twop52 = 19342813118337666422669312.0; // 0x1.00000001p84
+
+    union { uint64_t x; double d; } high = { .d = twop84 };
+    union { uint64_t x; double d; } low = { .d = twop52 };
+
+    high.x |= a >> 32;
+    low.x |= a & UINT64_C(0x00000000ffffffff);
+
+    const double result = (high.d - twop84_plus_twop52) + low.d;
+    return result;
 }
 
 #else
 /* Support for systems that don't have hardware floating-point; there are no flags to
  * set, and we don't want to code-gen to an unknown soft-float implementation.
- */ 
+ */
 
 COMPILER_RT_ABI double
 __floatundidf(du_int a)
@@ -98,9 +98,9 @@
         /* a is now rounded to DBL_MANT_DIG bits */
     }
     double_bits fb;
-    fb.u.high = ((e + 1023) << 20)      |        /* exponent */
-                ((su_int)(a >> 32) & 0x000FFFFF); /* mantissa-high */
-    fb.u.low = (su_int)a;                         /* mantissa-low  */
+    fb.u.s.high = ((e + 1023) << 20)       |        /* exponent */
+                  ((su_int)(a >> 32) & 0x000FFFFF); /* mantissa-high */
+    fb.u.s.low = (su_int)a;                         /* mantissa-low  */
     return fb.f;
 }
 #endif
diff --git a/lib/builtins/floatuntidf.c b/lib/builtins/floatuntidf.c
index 06202d9..960265d 100644
--- a/lib/builtins/floatuntidf.c
+++ b/lib/builtins/floatuntidf.c
@@ -18,7 +18,7 @@
 
 /* Returns: convert a to a double, rounding toward even. */
 
-/* Assumption: double is a IEEE 64 bit floating point type 
+/* Assumption: double is an IEEE 64 bit floating point type
  *             tu_int is a 128 bit integral type
  */
 
diff --git a/lib/builtins/gcc_personality_v0.c b/lib/builtins/gcc_personality_v0.c
index 331dc2b..29e5be3 100644
--- a/lib/builtins/gcc_personality_v0.c
+++ b/lib/builtins/gcc_personality_v0.c
@@ -131,6 +131,26 @@
     return result;
 }
 
+#if defined(__arm__) && !defined(__USING_SJLJ_EXCEPTIONS__) &&                 \
+    !defined(__ARM_DWARF_EH__)
+#define USING_ARM_EHABI 1
+_Unwind_Reason_Code __gnu_unwind_frame(struct _Unwind_Exception *,
+                                       struct _Unwind_Context *);
+#endif
+
+static inline _Unwind_Reason_Code
+continueUnwind(struct _Unwind_Exception *exceptionObject,
+               struct _Unwind_Context *context) {
+#if USING_ARM_EHABI
+    /*
+     * On ARM EHABI the personality routine is responsible for actually
+     * unwinding a single stack frame before returning (ARM EHABI Sec. 6.1).
+     */
+    if (__gnu_unwind_frame(exceptionObject, context) != _URC_OK)
+        return _URC_FAILURE;
+#endif
+    return _URC_CONTINUE_UNWIND;
+}
 
 /*
  * The C compiler makes references to __gcc_personality_v0 in
@@ -141,11 +161,17 @@
  * throw through a C function compiled with -fexceptions.
  */
 #if __USING_SJLJ_EXCEPTIONS__
-// the setjump-longjump based exceptions personality routine has a different name
+/* the setjump-longjump based exceptions personality routine has a
+ * different name */
 COMPILER_RT_ABI _Unwind_Reason_Code
 __gcc_personality_sj0(int version, _Unwind_Action actions,
          uint64_t exceptionClass, struct _Unwind_Exception* exceptionObject,
          struct _Unwind_Context *context)
+#elif USING_ARM_EHABI
+/* The ARM EHABI personality routine has a different signature. */
+COMPILER_RT_ABI _Unwind_Reason_Code __gcc_personality_v0(
+         _Unwind_State state, struct _Unwind_Exception *exceptionObject,
+         struct _Unwind_Context *context)
 #else
 COMPILER_RT_ABI _Unwind_Reason_Code
 __gcc_personality_v0(int version, _Unwind_Action actions,
@@ -155,13 +181,19 @@
 {
     /* Since C does not have catch clauses, there is nothing to do during */
     /* phase 1 (the search phase). */
-    if ( actions & _UA_SEARCH_PHASE ) 
-        return _URC_CONTINUE_UNWIND;
-        
+#if USING_ARM_EHABI
+    /* After resuming from a cleanup we should also continue on to the next
+     * frame straight away. */
+    if ((state & _US_ACTION_MASK) != _US_UNWIND_FRAME_STARTING)
+#else
+    if ( actions & _UA_SEARCH_PHASE )
+#endif
+        return continueUnwind(exceptionObject, context);
+
     /* There is nothing to do if there is no LSDA for this frame. */
     const uint8_t* lsda = (uint8_t*)_Unwind_GetLanguageSpecificData(context);
     if ( lsda == (uint8_t*) 0 )
-        return _URC_CONTINUE_UNWIND;
+        return continueUnwind(exceptionObject, context);
 
     uintptr_t pc = _Unwind_GetIP(context)-1;
     uintptr_t funcStart = _Unwind_GetRegionStart(context);
@@ -204,6 +236,6 @@
     }
 
     /* No landing pad found, continue unwinding. */
-    return _URC_CONTINUE_UNWIND;
+    return continueUnwind(exceptionObject, context);
 }
 
diff --git a/lib/builtins/i386/ashldi3.S b/lib/builtins/i386/ashldi3.S
index 3fbd739..6f05dcf 100644
--- a/lib/builtins/i386/ashldi3.S
+++ b/lib/builtins/i386/ashldi3.S
@@ -56,3 +56,6 @@
 
 #endif // __SSE2__
 #endif // __i386__
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/i386/ashrdi3.S b/lib/builtins/i386/ashrdi3.S
index 8f47424..206369f 100644
--- a/lib/builtins/i386/ashrdi3.S
+++ b/lib/builtins/i386/ashrdi3.S
@@ -67,3 +67,6 @@
 
 #endif // __SSE2__
 #endif // __i386__
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/i386/divdi3.S b/lib/builtins/i386/divdi3.S
index 2cb0ddd..2fb4bdc 100644
--- a/lib/builtins/i386/divdi3.S
+++ b/lib/builtins/i386/divdi3.S
@@ -160,3 +160,6 @@
 END_COMPILERRT_FUNCTION(__divdi3)
 
 #endif // __i386__
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/i386/floatdidf.S b/lib/builtins/i386/floatdidf.S
index dcc32f8..d75dfe6 100644
--- a/lib/builtins/i386/floatdidf.S
+++ b/lib/builtins/i386/floatdidf.S
@@ -37,3 +37,6 @@
 END_COMPILERRT_FUNCTION(__floatdidf)
 
 #endif // __i386__
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/i386/floatdisf.S b/lib/builtins/i386/floatdisf.S
index f642767..0874eaa 100644
--- a/lib/builtins/i386/floatdisf.S
+++ b/lib/builtins/i386/floatdisf.S
@@ -30,3 +30,6 @@
 END_COMPILERRT_FUNCTION(__floatdisf)
 
 #endif // __i386__
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/i386/floatdixf.S b/lib/builtins/i386/floatdixf.S
index 839b043..1044ef5 100644
--- a/lib/builtins/i386/floatdixf.S
+++ b/lib/builtins/i386/floatdixf.S
@@ -28,3 +28,6 @@
 END_COMPILERRT_FUNCTION(__floatdixf)
 
 #endif // __i386__
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/i386/floatundidf.S b/lib/builtins/i386/floatundidf.S
index 8058c2a..fe03234 100644
--- a/lib/builtins/i386/floatundidf.S
+++ b/lib/builtins/i386/floatundidf.S
@@ -50,3 +50,6 @@
 END_COMPILERRT_FUNCTION(__floatundidf)
 
 #endif // __i386__
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/i386/floatundisf.S b/lib/builtins/i386/floatundisf.S
index 94c97e2..16000b5 100644
--- a/lib/builtins/i386/floatundisf.S
+++ b/lib/builtins/i386/floatundisf.S
@@ -103,3 +103,6 @@
 END_COMPILERRT_FUNCTION(__floatundisf)
 
 #endif // __i386__
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/i386/floatundixf.S b/lib/builtins/i386/floatundixf.S
index 814b52f..c935670 100644
--- a/lib/builtins/i386/floatundixf.S
+++ b/lib/builtins/i386/floatundixf.S
@@ -41,3 +41,6 @@
 END_COMPILERRT_FUNCTION(__floatundixf)
 
 #endif // __i386__
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/i386/lshrdi3.S b/lib/builtins/i386/lshrdi3.S
index b80f11a..53e95cf 100644
--- a/lib/builtins/i386/lshrdi3.S
+++ b/lib/builtins/i386/lshrdi3.S
@@ -57,3 +57,6 @@
 
 #endif // __SSE2__
 #endif // __i386__
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/i386/moddi3.S b/lib/builtins/i386/moddi3.S
index b9cee9d..a5bf9ce 100644
--- a/lib/builtins/i386/moddi3.S
+++ b/lib/builtins/i386/moddi3.S
@@ -164,3 +164,6 @@
 END_COMPILERRT_FUNCTION(__moddi3)
 
 #endif // __i386__
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/i386/muldi3.S b/lib/builtins/i386/muldi3.S
index 15b6b49..1239460 100644
--- a/lib/builtins/i386/muldi3.S
+++ b/lib/builtins/i386/muldi3.S
@@ -28,3 +28,6 @@
 END_COMPILERRT_FUNCTION(__muldi3)
 
 #endif // __i386__
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/i386/udivdi3.S b/lib/builtins/i386/udivdi3.S
index 41b2edf..7276136 100644
--- a/lib/builtins/i386/udivdi3.S
+++ b/lib/builtins/i386/udivdi3.S
@@ -113,3 +113,6 @@
 END_COMPILERRT_FUNCTION(__udivdi3)
 
 #endif // __i386__
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/i386/umoddi3.S b/lib/builtins/i386/umoddi3.S
index a190a7d..763e821 100644
--- a/lib/builtins/i386/umoddi3.S
+++ b/lib/builtins/i386/umoddi3.S
@@ -124,3 +124,6 @@
 END_COMPILERRT_FUNCTION(__umoddi3)
 
 #endif // __i386__
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/int_lib.h b/lib/builtins/int_lib.h
index 272f9d9..8dfe567 100644
--- a/lib/builtins/int_lib.h
+++ b/lib/builtins/int_lib.h
@@ -35,11 +35,7 @@
 # define COMPILER_RT_ABI __attribute__((pcs("aapcs")))
 #else
 # define ARM_EABI_FNALIAS(aeabi_name, name)
-# if defined(__arm__) && defined(_WIN32) && (!defined(_MSC_VER) || defined(__clang__))
-#   define COMPILER_RT_ABI __attribute__((pcs("aapcs")))
-# else
-#   define COMPILER_RT_ABI
-# endif
+# define COMPILER_RT_ABI
 #endif
 
 #ifdef _MSC_VER
@@ -128,6 +124,6 @@
 #endif
 
 #define __builtin_clzl __builtin_clzll
-#endif // defined(_MSC_VER) && !defined(__clang__)
+#endif /* defined(_MSC_VER) && !defined(__clang__) */
 
 #endif /* INT_LIB_H */
diff --git a/lib/builtins/int_types.h b/lib/builtins/int_types.h
index 2dad43b..660385e 100644
--- a/lib/builtins/int_types.h
+++ b/lib/builtins/int_types.h
@@ -61,7 +61,8 @@
 } udwords;
 
 /* MIPS64 issue: PR 20098 */
-#if defined(__LP64__) && !(defined(__mips__) && defined(__clang__))
+#if (defined(__LP64__) || defined(__wasm__)) && \
+    !(defined(__mips__) && defined(__clang__))
 #define CRT_HAS_128BIT
 #endif
 
diff --git a/lib/builtins/ppc/restFP.S b/lib/builtins/ppc/restFP.S
index 9503289..507e756 100644
--- a/lib/builtins/ppc/restFP.S
+++ b/lib/builtins/ppc/restFP.S
@@ -41,3 +41,6 @@
         lwz     r0,8(r1)
         mtlr	r0
         blr
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/ppc/saveFP.S b/lib/builtins/ppc/saveFP.S
index 72bd459..20b06ff 100644
--- a/lib/builtins/ppc/saveFP.S
+++ b/lib/builtins/ppc/saveFP.S
@@ -38,3 +38,6 @@
         stfd    f31,-8(r1)
         stw      r0,8(r1)
         blr
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/x86_64/floatundidf.S b/lib/builtins/x86_64/floatundidf.S
index 3cd5d02..094a68d 100644
--- a/lib/builtins/x86_64/floatundidf.S
+++ b/lib/builtins/x86_64/floatundidf.S
@@ -47,3 +47,6 @@
 END_COMPILERRT_FUNCTION(__floatundidf)
 
 #endif // __x86_64__
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/x86_64/floatundisf.S b/lib/builtins/x86_64/floatundisf.S
index 61952f4..7c9f75e 100644
--- a/lib/builtins/x86_64/floatundisf.S
+++ b/lib/builtins/x86_64/floatundisf.S
@@ -33,3 +33,6 @@
 END_COMPILERRT_FUNCTION(__floatundisf)
 
 #endif // __x86_64__
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/builtins/x86_64/floatundixf.S b/lib/builtins/x86_64/floatundixf.S
index 92961c8..28a096b 100644
--- a/lib/builtins/x86_64/floatundixf.S
+++ b/lib/builtins/x86_64/floatundixf.S
@@ -66,3 +66,6 @@
 #endif // __x86_64__
 
 */
+
+NO_EXEC_STACK_DIRECTIVE
+
diff --git a/lib/cfi/CMakeLists.txt b/lib/cfi/CMakeLists.txt
index 24e5181..56ef882 100644
--- a/lib/cfi/CMakeLists.txt
+++ b/lib/cfi/CMakeLists.txt
@@ -1,4 +1,5 @@
 add_custom_target(cfi)
+set_target_properties(cfi PROPERTIES FOLDER "Compiler-RT Misc")
 
 set(CFI_SOURCES cfi.cc)
 
@@ -30,11 +31,9 @@
                 RTSanitizerCommon
                 RTSanitizerCommonLibc
 		RTUbsan
-		RTUbsan_cxx
     CFLAGS ${CFI_CFLAGS} ${CFI_DIAG_CFLAGS}
     PARENT_TARGET cfi)
 endforeach()
 
-add_compiler_rt_resource_file(cfi_blacklist cfi_blacklist.txt)
-add_dependencies(cfi cfi_blacklist)
+add_compiler_rt_resource_file(cfi_blacklist cfi_blacklist.txt cfi)
 add_dependencies(compiler-rt cfi)
diff --git a/lib/cfi/cfi.cc b/lib/cfi/cfi.cc
index e6249e6..ca2cf8f 100644
--- a/lib/cfi/cfi.cc
+++ b/lib/cfi/cfi.cc
@@ -11,16 +11,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-// FIXME: Intercept dlopen/dlclose.
-// FIXME: Support diagnostic mode.
-// FIXME: Harden:
-//  * mprotect shadow, use mremap for updates
-//  * something else equally important
-
 #include <assert.h>
 #include <elf.h>
 #include <link.h>
 #include <string.h>
+#include <sys/mman.h>
 
 typedef ElfW(Phdr) Elf_Phdr;
 typedef ElfW(Ehdr) Elf_Ehdr;
@@ -31,19 +26,55 @@
 #include "ubsan/ubsan_init.h"
 #include "ubsan/ubsan_flags.h"
 
-static uptr __cfi_shadow;
+#ifdef CFI_ENABLE_DIAG
+#include "ubsan/ubsan_handlers.h"
+#endif
+
+namespace __cfi {
+
+#define kCfiShadowLimitsStorageSize 4096 // 1 page
+// Let's hope that the data segment is mapped with 4K pages.
+// The pointer to the cfi shadow region is stored at the start of this page.
+// The rest of the page is unused and re-mapped read-only.
+static union {
+  char space[kCfiShadowLimitsStorageSize];
+  struct {
+    uptr start;
+    uptr size;
+  } limits;
+} cfi_shadow_limits_storage
+    __attribute__((aligned(kCfiShadowLimitsStorageSize)));
 static constexpr uptr kShadowGranularity = 12;
 static constexpr uptr kShadowAlign = 1UL << kShadowGranularity; // 4096
 
 static constexpr uint16_t kInvalidShadow = 0;
 static constexpr uint16_t kUncheckedShadow = 0xFFFFU;
 
-static uint16_t *mem_to_shadow(uptr x) {
-  return (uint16_t *)(__cfi_shadow + ((x >> kShadowGranularity) << 1));
+// Get the start address of the CFI shadow region.
+uptr GetShadow() {
+  return cfi_shadow_limits_storage.limits.start;
 }
 
-typedef int (*CFICheckFn)(uptr, void *);
+uptr GetShadowSize() {
+  return cfi_shadow_limits_storage.limits.size;
+}
 
+// This will only work while the shadow is not allocated.
+void SetShadowSize(uptr size) {
+  cfi_shadow_limits_storage.limits.size = size;
+}
+
+uptr MemToShadowOffset(uptr x) {
+  return (x >> kShadowGranularity) << 1;
+}
+
+uint16_t *MemToShadow(uptr x, uptr shadow_base) {
+  return (uint16_t *)(shadow_base + MemToShadowOffset(x));
+}
+
+typedef int (*CFICheckFn)(u64, void *, void *);
+
+// This class reads and decodes the shadow contents.
 class ShadowValue {
   uptr addr;
   uint16_t v;
@@ -61,49 +92,91 @@
     return reinterpret_cast<CFICheckFn>(p);
   }
 
-  // Load a shadow valud for the given application memory address.
+  // Load a shadow value for the given application memory address.
   static const ShadowValue load(uptr addr) {
-    return ShadowValue(addr, *mem_to_shadow(addr));
+    uptr shadow_base = GetShadow();
+    uptr shadow_offset = MemToShadowOffset(addr);
+    if (shadow_offset > GetShadowSize())
+      return ShadowValue(addr, kInvalidShadow);
+    else
+      return ShadowValue(
+          addr, *reinterpret_cast<uint16_t *>(shadow_base + shadow_offset));
   }
 };
 
-static void fill_shadow_constant(uptr begin, uptr end, uint16_t v) {
-  assert(v == kInvalidShadow || v == kUncheckedShadow);
-  uint16_t *shadow_begin = mem_to_shadow(begin);
-  uint16_t *shadow_end = mem_to_shadow(end - 1) + 1;
-  memset(shadow_begin, v, (shadow_end - shadow_begin) * sizeof(*shadow_begin));
+class ShadowBuilder {
+  uptr shadow_;
+
+public:
+  // Allocate a new empty shadow (for the entire address space) on the side.
+  void Start();
+  // Mark the given address range as unchecked.
+  // This is used for uninstrumented libraries like libc.
+  // Any CFI check with a target in that range will pass.
+  void AddUnchecked(uptr begin, uptr end);
+  // Mark the given address range as belonging to a library with the given
+  // cfi_check function.
+  void Add(uptr begin, uptr end, uptr cfi_check);
+  // Finish shadow construction. Atomically switch the current active shadow
+  // region with the newly constructed one and deallocate the former.
+  void Install();
+};
+
+void ShadowBuilder::Start() {
+  shadow_ = (uptr)MmapNoReserveOrDie(GetShadowSize(), "CFI shadow");
+  VReport(1, "CFI: shadow at %zx .. %zx\n", shadow_, shadow_ + GetShadowSize());
 }
 
-static void fill_shadow(uptr begin, uptr end, uptr cfi_check) {
+void ShadowBuilder::AddUnchecked(uptr begin, uptr end) {
+  uint16_t *shadow_begin = MemToShadow(begin, shadow_);
+  uint16_t *shadow_end = MemToShadow(end - 1, shadow_) + 1;
+  memset(shadow_begin, kUncheckedShadow,
+         (shadow_end - shadow_begin) * sizeof(*shadow_begin));
+}
+
+void ShadowBuilder::Add(uptr begin, uptr end, uptr cfi_check) {
   assert((cfi_check & (kShadowAlign - 1)) == 0);
 
   // Don't fill anything below cfi_check. We can not represent those addresses
   // in the shadow, and must make sure at codegen to place all valid call
   // targets above cfi_check.
-  uptr p = Max(begin, cfi_check);
-  uint16_t *s = mem_to_shadow(p);
-  uint16_t *s_end = mem_to_shadow(end - 1) + 1;
-  uint16_t sv = ((p - cfi_check) >> kShadowGranularity) + 1;
+  begin = Max(begin, cfi_check);
+  uint16_t *s = MemToShadow(begin, shadow_);
+  uint16_t *s_end = MemToShadow(end - 1, shadow_) + 1;
+  uint16_t sv = ((begin - cfi_check) >> kShadowGranularity) + 1;
   for (; s < s_end; s++, sv++)
     *s = sv;
+}
 
-  // Sanity checks.
-  uptr q = p & ~(kShadowAlign - 1);
-  for (; q < end; q += kShadowAlign) {
-    assert((uptr)ShadowValue::load(q).get_cfi_check() == cfi_check);
-    assert((uptr)ShadowValue::load(q + kShadowAlign / 2).get_cfi_check() ==
-           cfi_check);
-    assert((uptr)ShadowValue::load(q + kShadowAlign - 1).get_cfi_check() ==
-           cfi_check);
+#if SANITIZER_LINUX
+void ShadowBuilder::Install() {
+  MprotectReadOnly(shadow_, GetShadowSize());
+  uptr main_shadow = GetShadow();
+  if (main_shadow) {
+    // Update.
+    void *res = mremap((void *)shadow_, GetShadowSize(), GetShadowSize(),
+                       MREMAP_MAYMOVE | MREMAP_FIXED, (void *)main_shadow);
+    CHECK(res != MAP_FAILED);
+  } else {
+    // Initial setup.
+    CHECK_EQ(kCfiShadowLimitsStorageSize, GetPageSizeCached());
+    CHECK_EQ(0, GetShadow());
+    cfi_shadow_limits_storage.limits.start = shadow_;
+    MprotectReadOnly((uptr)&cfi_shadow_limits_storage,
+                     sizeof(cfi_shadow_limits_storage));
+    CHECK_EQ(shadow_, GetShadow());
   }
 }
+#else
+#error not implemented
+#endif
 
 // This is a workaround for a glibc bug:
 // https://sourceware.org/bugzilla/show_bug.cgi?id=15199
 // Other platforms can, hopefully, just do
 //    dlopen(RTLD_NOLOAD | RTLD_LAZY)
 //    dlsym("__cfi_check").
-static uptr find_cfi_check_in_dso(dl_phdr_info *info) {
+uptr find_cfi_check_in_dso(dl_phdr_info *info) {
   const ElfW(Dyn) *dynamic = nullptr;
   for (int i = 0; i < info->dlpi_phnum; ++i) {
     if (info->dlpi_phdr[i].p_type == PT_DYNAMIC) {
@@ -157,11 +230,13 @@
   return 0;
 }
 
-static int dl_iterate_phdr_cb(dl_phdr_info *info, size_t size, void *data) {
+int dl_iterate_phdr_cb(dl_phdr_info *info, size_t size, void *data) {
   uptr cfi_check = find_cfi_check_in_dso(info);
   if (cfi_check)
     VReport(1, "Module '%s' __cfi_check %zx\n", info->dlpi_name, cfi_check);
 
+  ShadowBuilder *b = reinterpret_cast<ShadowBuilder *>(data);
+
   for (int i = 0; i < info->dlpi_phnum; i++) {
     const Elf_Phdr *phdr = &info->dlpi_phdr[i];
     if (phdr->p_type == PT_LOAD) {
@@ -174,28 +249,69 @@
       uptr cur_end = cur_beg + phdr->p_memsz;
       if (cfi_check) {
         VReport(1, "   %zx .. %zx\n", cur_beg, cur_end);
-        fill_shadow(cur_beg, cur_end, cfi_check ? cfi_check : (uptr)(-1));
+        b->Add(cur_beg, cur_end, cfi_check);
       } else {
-        fill_shadow_constant(cur_beg, cur_end, kInvalidShadow);
+        b->AddUnchecked(cur_beg, cur_end);
       }
     }
   }
   return 0;
 }
 
-// Fill shadow for the initial libraries.
-static void init_shadow() {
-  dl_iterate_phdr(dl_iterate_phdr_cb, nullptr);
+// Init or update shadow for the current set of loaded libraries.
+void UpdateShadow() {
+  ShadowBuilder b;
+  b.Start();
+  dl_iterate_phdr(dl_iterate_phdr_cb, &b);
+  b.Install();
 }
 
-extern "C" SANITIZER_INTERFACE_ATTRIBUTE
-void __cfi_slowpath(uptr CallSiteTypeId, void *Ptr) {
+void InitShadow() {
+  CHECK_EQ(0, GetShadow());
+  CHECK_EQ(0, GetShadowSize());
+
+  uptr vma = GetMaxVirtualAddress();
+  // Shadow is 2 -> 2**kShadowGranularity.
+  SetShadowSize((vma >> (kShadowGranularity - 1)) + 1);
+  VReport(1, "CFI: VMA size %zx, shadow size %zx\n", vma, GetShadowSize());
+
+  UpdateShadow();
+}
+
+THREADLOCAL int in_loader;
+BlockingMutex shadow_update_lock(LINKER_INITIALIZED);
+
+void EnterLoader() {
+  if (in_loader == 0) {
+    shadow_update_lock.Lock();
+  }
+  ++in_loader;
+}
+
+void ExitLoader() {
+  CHECK(in_loader > 0);
+  --in_loader;
+  UpdateShadow();
+  if (in_loader == 0) {
+    shadow_update_lock.Unlock();
+  }
+}
+
+ALWAYS_INLINE void CfiSlowPathCommon(u64 CallSiteTypeId, void *Ptr,
+                                     void *DiagData) {
   uptr Addr = (uptr)Ptr;
-  VReport(3, "__cfi_slowpath: %zx, %p\n", CallSiteTypeId, Ptr);
+  VReport(3, "__cfi_slowpath: %llx, %p\n", CallSiteTypeId, Ptr);
   ShadowValue sv = ShadowValue::load(Addr);
   if (sv.is_invalid()) {
-    VReport(2, "CFI: invalid memory region for a function pointer (shadow==0): %p\n", Ptr);
-    Die();
+    VReport(1, "CFI: invalid memory region for a check target: %p\n", Ptr);
+#ifdef CFI_ENABLE_DIAG
+    if (DiagData) {
+      __ubsan_handle_cfi_check_fail(
+          reinterpret_cast<__ubsan::CFICheckFailData *>(DiagData), Addr, false);
+      return;
+    }
+#endif
+    Trap();
   }
   if (sv.is_unchecked()) {
     VReport(2, "CFI: unchecked call (shadow=FFFF): %p\n", Ptr);
@@ -203,10 +319,10 @@
   }
   CFICheckFn cfi_check = sv.get_cfi_check();
   VReport(2, "__cfi_check at %p\n", cfi_check);
-  cfi_check(CallSiteTypeId, Ptr);
+  cfi_check(CallSiteTypeId, Ptr, DiagData);
 }
 
-static void InitializeFlags() {
+void InitializeFlags() {
   SetCommonFlagsDefaults();
 #ifdef CFI_ENABLE_DIAG
   __ubsan::Flags *uf = __ubsan::flags();
@@ -227,15 +343,54 @@
   ubsan_parser.ParseString(GetEnv("UBSAN_OPTIONS"));
 #endif
 
-  SetVerbosity(common_flags()->verbosity);
+  InitializeCommonFlags();
 
-  if (Verbosity()) ReportUnrecognizedFlags();
+  if (Verbosity())
+    ReportUnrecognizedFlags();
 
   if (common_flags()->help) {
     cfi_parser.PrintFlagDescriptions();
   }
 }
 
+} // namespace __cfi
+
+using namespace __cfi;
+
+extern "C" SANITIZER_INTERFACE_ATTRIBUTE void
+__cfi_slowpath(u64 CallSiteTypeId, void *Ptr) {
+  CfiSlowPathCommon(CallSiteTypeId, Ptr, nullptr);
+}
+
+#ifdef CFI_ENABLE_DIAG
+extern "C" SANITIZER_INTERFACE_ATTRIBUTE void
+__cfi_slowpath_diag(u64 CallSiteTypeId, void *Ptr, void *DiagData) {
+  CfiSlowPathCommon(CallSiteTypeId, Ptr, DiagData);
+}
+#endif
+
+// Setup shadow for dlopen()ed libraries.
+// The actual shadow setup happens after dlopen() returns, which means that
+// a library can not be a target of any CFI checks while its constructors are
+// running. It's unclear how to fix this without some extra help from libc.
+// In glibc, mmap inside dlopen is not interceptable.
+// Maybe a seccomp-bpf filter?
+// We could insert a high-priority constructor into the library, but that would
+// not help with the uninstrumented libraries.
+INTERCEPTOR(void*, dlopen, const char *filename, int flag) {
+  EnterLoader();
+  void *handle = REAL(dlopen)(filename, flag);
+  ExitLoader();
+  return handle;
+}
+
+INTERCEPTOR(int, dlclose, void *handle) {
+  EnterLoader();
+  int res = REAL(dlclose)(handle);
+  ExitLoader();
+  return res;
+}
+
 extern "C" SANITIZER_INTERFACE_ATTRIBUTE
 #if !SANITIZER_CAN_USE_PREINIT_ARRAY
 // On ELF platforms, the constructor is invoked using .preinit_array (see below)
@@ -244,16 +399,10 @@
 void __cfi_init() {
   SanitizerToolName = "CFI";
   InitializeFlags();
+  InitShadow();
 
-  uptr vma = GetMaxVirtualAddress();
-  // Shadow is 2 -> 2**kShadowGranularity.
-  uptr shadow_size = (vma >> (kShadowGranularity - 1)) + 1;
-  VReport(1, "CFI: VMA size %zx, shadow size %zx\n", vma, shadow_size);
-  void *shadow = MmapNoReserveOrDie(shadow_size, "CFI shadow");
-  VReport(1, "CFI: shadow at %zx .. %zx\n", shadow,
-          reinterpret_cast<uptr>(shadow) + shadow_size);
-  __cfi_shadow = (uptr)shadow;
-  init_shadow();
+  INTERCEPT_FUNCTION(dlopen);
+  INTERCEPT_FUNCTION(dlclose);
 
 #ifdef CFI_ENABLE_DIAG
   __ubsan::InitAsPlugin();
diff --git a/lib/dfsan/CMakeLists.txt b/lib/dfsan/CMakeLists.txt
index 19a7909..eca402d 100644
--- a/lib/dfsan/CMakeLists.txt
+++ b/lib/dfsan/CMakeLists.txt
@@ -6,12 +6,14 @@
   dfsan_custom.cc
   dfsan_interceptors.cc)
 set(DFSAN_COMMON_CFLAGS ${SANITIZER_COMMON_CFLAGS})
-append_no_rtti_flag(DFSAN_COMMON_CFLAGS)
+append_rtti_flag(OFF DFSAN_COMMON_CFLAGS)
 # Prevent clang from generating libc calls.
 append_list_if(COMPILER_RT_HAS_FFREESTANDING_FLAG -ffreestanding DFSAN_COMMON_CFLAGS)
 
 # Static runtime library.
 add_custom_target(dfsan)
+set_target_properties(dfsan PROPERTIES FOLDER "Compiler-RT Misc")
+
 foreach(arch ${DFSAN_SUPPORTED_ARCH})
   set(DFSAN_CFLAGS ${DFSAN_COMMON_CFLAGS})
   append_list_if(COMPILER_RT_HAS_FPIE_FLAG -fPIE DFSAN_CFLAGS)
diff --git a/lib/dfsan/dfsan.cc b/lib/dfsan/dfsan.cc
index 7285f20..4156000 100644
--- a/lib/dfsan/dfsan.cc
+++ b/lib/dfsan/dfsan.cc
@@ -362,12 +362,13 @@
   RegisterCommonFlags(&parser);
   RegisterDfsanFlags(&parser, &flags());
   parser.ParseString(GetEnv("DFSAN_OPTIONS"));
-  SetVerbosity(common_flags()->verbosity);
+  InitializeCommonFlags();
   if (Verbosity()) ReportUnrecognizedFlags();
   if (common_flags()->help) parser.PrintFlagDescriptions();
 }
 
 static void InitializePlatformEarly() {
+  AvoidCVE_2016_2143();
 #ifdef DFSAN_RUNTIME_VMA
   __dfsan::vmaSize =
     (MostSignificantSetBitIndex(GET_CURRENT_FRAME()) + 1);
@@ -411,7 +412,7 @@
   // case by disabling memory protection when ASLR is disabled.
   uptr init_addr = (uptr)&dfsan_init;
   if (!(init_addr >= UnusedAddr() && init_addr < AppAddr()))
-    MmapNoAccess(UnusedAddr(), AppAddr() - UnusedAddr());
+    MmapFixedNoAccess(UnusedAddr(), AppAddr() - UnusedAddr());
 
   InitializeInterceptors();
 
diff --git a/lib/esan/CMakeLists.txt b/lib/esan/CMakeLists.txt
new file mode 100644
index 0000000..2a0a71b
--- /dev/null
+++ b/lib/esan/CMakeLists.txt
@@ -0,0 +1,43 @@
+# Build for the EfficiencySanitizer runtime support library.
+
+add_custom_target(esan)
+set_target_properties(esan PROPERTIES FOLDER "Compiler-RT Misc")
+
+set(ESAN_RTL_CFLAGS ${SANITIZER_COMMON_CFLAGS})
+append_rtti_flag(OFF ESAN_RTL_CFLAGS)
+
+include_directories(..)
+
+set(ESAN_SOURCES
+  esan.cpp
+  esan_flags.cpp
+  esan_interface.cpp
+  esan_interceptors.cpp
+  esan_linux.cpp
+  esan_sideline_linux.cpp
+  cache_frag.cpp
+  working_set.cpp
+  working_set_posix.cpp)
+
+foreach (arch ${ESAN_SUPPORTED_ARCH})
+  add_compiler_rt_runtime(clang_rt.esan
+    STATIC
+    ARCHS ${arch}
+    SOURCES ${ESAN_SOURCES}
+            $<TARGET_OBJECTS:RTInterception.${arch}>
+            $<TARGET_OBJECTS:RTSanitizerCommon.${arch}>
+            $<TARGET_OBJECTS:RTSanitizerCommonLibc.${arch}>
+    CFLAGS ${ESAN_RTL_CFLAGS})
+  add_sanitizer_rt_symbols(clang_rt.esan
+    ARCHS ${arch}
+    EXTRA esan.syms.extra)
+  add_dependencies(esan
+    clang_rt.esan-${arch}
+    clang_rt.esan-${arch}-symbols)
+endforeach()
+
+add_dependencies(compiler-rt esan)
+
+if (COMPILER_RT_INCLUDE_TESTS)
+  # TODO(bruening): add tests via add_subdirectory(tests)
+endif()
diff --git a/lib/esan/cache_frag.cpp b/lib/esan/cache_frag.cpp
new file mode 100644
index 0000000..a3e612d
--- /dev/null
+++ b/lib/esan/cache_frag.cpp
@@ -0,0 +1,208 @@
+//===-- cache_frag.cpp ----------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of EfficiencySanitizer, a family of performance tuners.
+//
+// This file contains cache fragmentation-specific code.
+//===----------------------------------------------------------------------===//
+
+#include "esan.h"
+#include "esan_flags.h"
+#include "sanitizer_common/sanitizer_addrhashmap.h"
+#include "sanitizer_common/sanitizer_common.h"
+#include "sanitizer_common/sanitizer_placement_new.h"
+#include <string.h>
+
+namespace __esan {
+
+//===-- Struct field access counter runtime -------------------------------===//
+
+// This should be kept consistent with LLVM's EfficiencySanitizer StructInfo.
+struct StructInfo {
+  const char *StructName;
+  u32 Size;
+  u32 NumFields;
+  u32 *FieldOffset;           // auxiliary struct field info.
+  u32 *FieldSize;             // auxiliary struct field info.
+  const char **FieldTypeName; // auxiliary struct field info.
+  u64 *FieldCounters;
+  u64 *ArrayCounter;
+  bool hasAuxFieldInfo() { return FieldOffset != nullptr; }
+};
+
+// This should be kept consistent with LLVM's EfficiencySanitizer CacheFragInfo.
+// The tool-specific information per compilation unit (module).
+struct CacheFragInfo {
+  const char *UnitName;
+  u32 NumStructs;
+  StructInfo *Structs;
+};
+
+struct StructCounter {
+  StructInfo *Struct;
+  u64 Count; // The total access count of the struct.
+  u64 Ratio; // Difference ratio for the struct layout access.
+};
+
+// We use StructHashMap to keep track of a unique copy of StructCounter.
+typedef AddrHashMap<StructCounter, 31051> StructHashMap;
+struct Context {
+  StructHashMap StructMap;
+  u32 NumStructs;
+  u64 TotalCount; // The total access count of all structs.
+};
+static Context *Ctx;
+
+static void reportStructSummary() {
+  // FIXME: provide a better struct field access summary report.
+  Report("%s: total struct field access count = %llu\n", SanitizerToolName,
+         Ctx->TotalCount);
+}
+
+// FIXME: we are still exploring proper ways to evaluate the difference between
+// struct field counts.  Currently, we use a simple formula to calculate the
+// difference ratio: V1/V2.
+static inline u64 computeDifferenceRatio(u64 Val1, u64 Val2) {
+  if (Val2 > Val1) {
+    Swap(Val1, Val2);
+  }
+  if (Val2 == 0)
+    Val2 = 1;
+  return (Val1 / Val2);
+}
+
+static void reportStructCounter(StructHashMap::Handle &Handle) {
+  const u32 TypePrintLimit = 512;
+  const char *type, *start, *end;
+  StructInfo *Struct = Handle->Struct;
+  // Union field address calculation is done via bitcast instead of GEP,
+  // so the count for union is always 0.
+  // We skip the union report to avoid confusion.
+  if (strncmp(Struct->StructName, "union.", 6) == 0)
+    return;
+  // Remove the '.' after class/struct during print.
+  if (strncmp(Struct->StructName, "class.", 6) == 0) {
+    type = "class";
+    start = &Struct->StructName[6];
+  } else {
+    type = "struct";
+    start = &Struct->StructName[7];
+  }
+  // Remove the suffixes with '#' during print.
+  end = strchr(start, '#');
+  CHECK(end != nullptr);
+  Report("  %s %.*s\n", type, end - start, start);
+  Report("   size = %u, count = %llu, ratio = %llu, array access = %llu\n",
+         Struct->Size, Handle->Count, Handle->Ratio, *Struct->ArrayCounter);
+  if (Struct->hasAuxFieldInfo()) {
+    for (u32 i = 0; i < Struct->NumFields; ++i) {
+      Report("   #%2u: offset = %u,\t size = %u,"
+             "\t count = %llu,\t type = %.*s\n",
+             i, Struct->FieldOffset[i], Struct->FieldSize[i],
+             Struct->FieldCounters[i], TypePrintLimit, Struct->FieldTypeName[i]);
+    }
+  } else {
+    for (u32 i = 0; i < Struct->NumFields; ++i) {
+      Report("   #%2u: count = %llu\n", i, Struct->FieldCounters[i]);
+    }
+  }
+}
+
+static void computeStructRatio(StructHashMap::Handle &Handle) {
+  Handle->Ratio = 0;
+  Handle->Count = Handle->Struct->FieldCounters[0];
+  for (u32 i = 1; i < Handle->Struct->NumFields; ++i) {
+    Handle->Count += Handle->Struct->FieldCounters[i];
+    Handle->Ratio += computeDifferenceRatio(
+        Handle->Struct->FieldCounters[i - 1], Handle->Struct->FieldCounters[i]);
+  }
+  Ctx->TotalCount += Handle->Count;
+  if (Handle->Ratio >= (u64)getFlags()->report_threshold ||
+      (Verbosity() >= 1 && Handle->Count > 0))
+    reportStructCounter(Handle);
+}
+
+static void registerStructInfo(CacheFragInfo *CacheFrag) {
+  for (u32 i = 0; i < CacheFrag->NumStructs; ++i) {
+    StructInfo *Struct = &CacheFrag->Structs[i];
+    StructHashMap::Handle H(&Ctx->StructMap, (uptr)Struct->FieldCounters);
+    if (H.created()) {
+      VPrintf(2, " Register %s: %u fields\n", Struct->StructName,
+              Struct->NumFields);
+      H->Struct = Struct;
+      ++Ctx->NumStructs;
+    } else {
+      VPrintf(2, " Duplicated %s: %u fields\n", Struct->StructName,
+              Struct->NumFields);
+    }
+  }
+}
+
+static void unregisterStructInfo(CacheFragInfo *CacheFrag) {
+  // FIXME: if the library is unloaded before finalizeCacheFrag, we should
+  // collect the result for later report.
+  for (u32 i = 0; i < CacheFrag->NumStructs; ++i) {
+    StructInfo *Struct = &CacheFrag->Structs[i];
+    StructHashMap::Handle H(&Ctx->StructMap, (uptr)Struct->FieldCounters, true);
+    if (H.exists()) {
+      VPrintf(2, " Unregister %s: %u fields\n", Struct->StructName,
+              Struct->NumFields);
+      // FIXME: we should move this call to finalizeCacheFrag once we can
+      // iterate over the hash map there.
+      computeStructRatio(H);
+      --Ctx->NumStructs;
+    } else {
+      VPrintf(2, " Duplicated %s: %u fields\n", Struct->StructName,
+              Struct->NumFields);
+    }
+  }
+  static bool Reported = false;
+  if (Ctx->NumStructs == 0 && !Reported) {
+    Reported = true;
+    reportStructSummary();
+  }
+}
+
+//===-- Init/exit functions -----------------------------------------------===//
+
+void processCacheFragCompilationUnitInit(void *Ptr) {
+  CacheFragInfo *CacheFrag = (CacheFragInfo *)Ptr;
+  VPrintf(2, "in esan::%s: %s with %u class(es)/struct(s)\n", __FUNCTION__,
+          CacheFrag->UnitName, CacheFrag->NumStructs);
+  registerStructInfo(CacheFrag);
+}
+
+void processCacheFragCompilationUnitExit(void *Ptr) {
+  CacheFragInfo *CacheFrag = (CacheFragInfo *)Ptr;
+  VPrintf(2, "in esan::%s: %s with %u class(es)/struct(s)\n", __FUNCTION__,
+          CacheFrag->UnitName, CacheFrag->NumStructs);
+  unregisterStructInfo(CacheFrag);
+}
+
+void initializeCacheFrag() {
+  VPrintf(2, "in esan::%s\n", __FUNCTION__);
+  // We use placement new to initialize Ctx before C++ static initialization.
+  // We make CtxMem 8-byte aligned for atomic operations in AddrHashMap.
+  static u64 CtxMem[sizeof(Context) / sizeof(u64) + 1];
+  Ctx = new (CtxMem) Context();
+  Ctx->NumStructs = 0;
+}
+
+int finalizeCacheFrag() {
+  VPrintf(2, "in esan::%s\n", __FUNCTION__);
+  return 0;
+}
+
+void reportCacheFrag() {
+  VPrintf(2, "in esan::%s\n", __FUNCTION__);
+  // FIXME: Not yet implemented.  We need to iterate over all of the
+  // compilation unit data.
+}
+
+} // namespace __esan
diff --git a/lib/esan/cache_frag.h b/lib/esan/cache_frag.h
new file mode 100644
index 0000000..646d3f8
--- /dev/null
+++ b/lib/esan/cache_frag.h
@@ -0,0 +1,29 @@
+//===-- cache_frag.h --------------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of EfficiencySanitizer, a family of performance tuners.
+//
+// Header for cache-fragmentation-specific code.
+//===----------------------------------------------------------------------===//
+
+#ifndef CACHE_FRAG_H
+#define CACHE_FRAG_H
+
+namespace __esan {
+
+void processCacheFragCompilationUnitInit(void *Ptr);
+void processCacheFragCompilationUnitExit(void *Ptr);
+
+void initializeCacheFrag();
+int finalizeCacheFrag();
+void reportCacheFrag();
+
+} // namespace __esan
+
+#endif  // CACHE_FRAG_H
diff --git a/lib/esan/esan.cpp b/lib/esan/esan.cpp
new file mode 100644
index 0000000..3c69b4e
--- /dev/null
+++ b/lib/esan/esan.cpp
@@ -0,0 +1,262 @@
+//===-- esan.cpp ----------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of EfficiencySanitizer, a family of performance tuners.
+//
+// Main file (entry points) for the Esan run-time.
+//===----------------------------------------------------------------------===//
+
+#include "esan.h"
+#include "esan_flags.h"
+#include "esan_interface_internal.h"
+#include "esan_shadow.h"
+#include "cache_frag.h"
+#include "sanitizer_common/sanitizer_common.h"
+#include "sanitizer_common/sanitizer_flag_parser.h"
+#include "sanitizer_common/sanitizer_flags.h"
+#include "working_set.h"
+
+// See comment below.
+extern "C" {
+extern void __cxa_atexit(void (*function)(void));
+}
+
+namespace __esan {
+
+bool EsanIsInitialized;
+bool EsanDuringInit;
+ShadowMapping Mapping;
+
+// Different tools use different scales within the same shadow mapping scheme.
+// The scale used here must match that used by the compiler instrumentation.
+// This array is indexed by the ToolType enum.
+static const uptr ShadowScale[] = {
+  0, // ESAN_None.
+  2, // ESAN_CacheFrag: 4B:1B, so 4 to 1 == >>2.
+  6, // ESAN_WorkingSet: 64B:1B, so 64 to 1 == >>6.
+};
+
+// We are combining multiple performance tuning tools under the umbrella of
+// one EfficiencySanitizer super-tool.  Most of our tools have very similar
+// memory access instrumentation, shadow memory mapping, libc interception,
+// etc., and there is typically more shared code than distinct code.
+//
+// We are not willing to dispatch on tool dynamically in our fastpath
+// instrumentation: thus, which tool to use is a static option selected
+// at compile time and passed to __esan_init().
+//
+// We are willing to pay the overhead of tool dispatch in the slowpath to more
+// easily share code.  We expect to only come here rarely.
+// If this becomes a performance hit, we can add separate interface
+// routines for each subtool (e.g., __esan_cache_frag_aligned_load_4).
+// But for libc interceptors, we'll have to do one of the following:
+// A) Add multiple-include support to sanitizer_common_interceptors.inc,
+//    instantiate it separately for each tool, and call the selected
+//    tool's intercept setup code.
+// B) Build separate static runtime libraries, one for each tool.
+// C) Completely split the tools into separate sanitizers.
+
+void processRangeAccess(uptr PC, uptr Addr, int Size, bool IsWrite) {
+  VPrintf(3, "in esan::%s %p: %c %p %d\n", __FUNCTION__, PC,
+          IsWrite ? 'w' : 'r', Addr, Size);
+  if (__esan_which_tool == ESAN_CacheFrag) {
+    // TODO(bruening): add shadow mapping and update shadow bits here.
+    // We'll move this to cache_frag.cpp once we have something.
+  } else if (__esan_which_tool == ESAN_WorkingSet) {
+    processRangeAccessWorkingSet(PC, Addr, Size, IsWrite);
+  }
+}
+
+bool processSignal(int SigNum, void (*Handler)(int), void (**Result)(int)) {
+  if (__esan_which_tool == ESAN_WorkingSet)
+    return processWorkingSetSignal(SigNum, Handler, Result);
+  return true;
+}
+
+bool processSigaction(int SigNum, const void *Act, void *OldAct) {
+  if (__esan_which_tool == ESAN_WorkingSet)
+    return processWorkingSetSigaction(SigNum, Act, OldAct);
+  return true;
+}
+
+bool processSigprocmask(int How, void *Set, void *OldSet) {
+  if (__esan_which_tool == ESAN_WorkingSet)
+    return processWorkingSetSigprocmask(How, Set, OldSet);
+  return true;
+}
+
+#if SANITIZER_DEBUG
+static bool verifyShadowScheme() {
+  // Sanity checks for our shadow mapping scheme.
+  uptr AppStart, AppEnd;
+  if (Verbosity() >= 3) {
+    for (int i = 0; getAppRegion(i, &AppStart, &AppEnd); ++i) {
+      VPrintf(3, "App #%d: [%zx-%zx) (%zuGB)\n", i, AppStart, AppEnd,
+              (AppEnd - AppStart) >> 30);
+    }
+  }
+  for (int Scale = 0; Scale < 8; ++Scale) {
+    Mapping.initialize(Scale);
+    if (Verbosity() >= 3) {
+      VPrintf(3, "\nChecking scale %d\n", Scale);
+      uptr ShadowStart, ShadowEnd;
+      for (int i = 0; getShadowRegion(i, &ShadowStart, &ShadowEnd); ++i) {
+        VPrintf(3, "Shadow #%d: [%zx-%zx) (%zuGB)\n", i, ShadowStart,
+                ShadowEnd, (ShadowEnd - ShadowStart) >> 30);
+      }
+      for (int i = 0; getShadowRegion(i, &ShadowStart, &ShadowEnd); ++i) {
+        VPrintf(3, "Shadow(Shadow) #%d: [%zx-%zx)\n", i,
+                appToShadow(ShadowStart), appToShadow(ShadowEnd - 1)+1);
+      }
+    }
+    for (int i = 0; getAppRegion(i, &AppStart, &AppEnd); ++i) {
+      DCHECK(isAppMem(AppStart));
+      DCHECK(!isAppMem(AppStart - 1));
+      DCHECK(isAppMem(AppEnd - 1));
+      DCHECK(!isAppMem(AppEnd));
+      DCHECK(!isShadowMem(AppStart));
+      DCHECK(!isShadowMem(AppEnd - 1));
+      DCHECK(isShadowMem(appToShadow(AppStart)));
+      DCHECK(isShadowMem(appToShadow(AppEnd - 1)));
+      // Double-shadow checks.
+      DCHECK(!isShadowMem(appToShadow(appToShadow(AppStart))));
+      DCHECK(!isShadowMem(appToShadow(appToShadow(AppEnd - 1))));
+    }
+    // Ensure no shadow regions overlap each other.
+    uptr ShadowAStart, ShadowBStart, ShadowAEnd, ShadowBEnd;
+    for (int i = 0; getShadowRegion(i, &ShadowAStart, &ShadowAEnd); ++i) {
+      for (int j = 0; getShadowRegion(j, &ShadowBStart, &ShadowBEnd); ++j) {
+        DCHECK(i == j || ShadowAStart >= ShadowBEnd ||
+               ShadowAEnd <= ShadowBStart);
+      }
+    }
+  }
+  return true;
+}
+#endif
+
+static void initializeShadow() {
+  verifyAddressSpace();
+
+  DCHECK(verifyShadowScheme());
+
+  Mapping.initialize(ShadowScale[__esan_which_tool]);
+
+  VPrintf(1, "Shadow scale=%d offset=%p\n", Mapping.Scale, Mapping.Offset);
+
+  uptr ShadowStart, ShadowEnd;
+  for (int i = 0; getShadowRegion(i, &ShadowStart, &ShadowEnd); ++i) {
+    VPrintf(1, "Shadow #%d: [%zx-%zx) (%zuGB)\n", i, ShadowStart, ShadowEnd,
+            (ShadowEnd - ShadowStart) >> 30);
+
+    uptr Map;
+    if (__esan_which_tool == ESAN_WorkingSet) {
+      // We want to identify all shadow pages that are touched so we start
+      // out inaccessible.
+      Map = (uptr)MmapFixedNoAccess(ShadowStart, ShadowEnd- ShadowStart,
+                                    "shadow");
+    } else {
+      Map = (uptr)MmapFixedNoReserve(ShadowStart, ShadowEnd - ShadowStart,
+                                     "shadow");
+    }
+    if (Map != ShadowStart) {
+      Printf("FATAL: EfficiencySanitizer failed to map its shadow memory.\n");
+      Die();
+    }
+
+    if (common_flags()->no_huge_pages_for_shadow)
+      NoHugePagesInRegion(ShadowStart, ShadowEnd - ShadowStart);
+    if (common_flags()->use_madv_dontdump)
+      DontDumpShadowMemory(ShadowStart, ShadowEnd - ShadowStart);
+
+    // TODO: Call MmapNoAccess() on in-between regions.
+  }
+}
+
+void initializeLibrary(ToolType Tool) {
+  // We assume there is only one thread during init, but we need to
+  // guard against double-init when we're (re-)called from an
+  // early interceptor.
+  if (EsanIsInitialized || EsanDuringInit)
+    return;
+  EsanDuringInit = true;
+  CHECK(Tool == __esan_which_tool);
+  SanitizerToolName = "EfficiencySanitizer";
+  CacheBinaryName();
+  initializeFlags();
+
+  // Intercepting libc _exit or exit via COMMON_INTERCEPTOR_ON_EXIT only
+  // finalizes on an explicit exit call by the app.  To handle a normal
+  // exit we register an atexit handler.
+  ::__cxa_atexit((void (*)())finalizeLibrary);
+
+  VPrintf(1, "in esan::%s\n", __FUNCTION__);
+  if (__esan_which_tool <= ESAN_None || __esan_which_tool >= ESAN_Max) {
+    Printf("ERROR: unknown tool %d requested\n", __esan_which_tool);
+    Die();
+  }
+
+  initializeShadow();
+  if (__esan_which_tool == ESAN_WorkingSet)
+    initializeShadowWorkingSet();
+
+  initializeInterceptors();
+
+  if (__esan_which_tool == ESAN_CacheFrag) {
+    initializeCacheFrag();
+  } else if (__esan_which_tool == ESAN_WorkingSet) {
+    initializeWorkingSet();
+  }
+
+  EsanIsInitialized = true;
+  EsanDuringInit = false;
+}
+
+int finalizeLibrary() {
+  VPrintf(1, "in esan::%s\n", __FUNCTION__);
+  if (__esan_which_tool == ESAN_CacheFrag) {
+    return finalizeCacheFrag();
+  } else if (__esan_which_tool == ESAN_WorkingSet) {
+    return finalizeWorkingSet();
+  }
+  return 0;
+}
+
+void reportResults() {
+  VPrintf(1, "in esan::%s\n", __FUNCTION__);
+  if (__esan_which_tool == ESAN_CacheFrag) {
+    return reportCacheFrag();
+  } else if (__esan_which_tool == ESAN_WorkingSet) {
+    return reportWorkingSet();
+  }
+}
+
+void processCompilationUnitInit(void *Ptr) {
+  VPrintf(2, "in esan::%s\n", __FUNCTION__);
+  if (__esan_which_tool == ESAN_CacheFrag) {
+    DCHECK(Ptr != nullptr);
+    processCacheFragCompilationUnitInit(Ptr);
+  } else {
+    DCHECK(Ptr == nullptr);
+  }
+}
+
+// This is called when the containing module is unloaded.
+// For the main executable module, this is called after finalizeLibrary.
+void processCompilationUnitExit(void *Ptr) {
+  VPrintf(2, "in esan::%s\n", __FUNCTION__);
+  if (__esan_which_tool == ESAN_CacheFrag) {
+    DCHECK(Ptr != nullptr);
+    processCacheFragCompilationUnitExit(Ptr);
+  } else {
+    DCHECK(Ptr == nullptr);
+  }
+}
+
+} // namespace __esan
diff --git a/lib/esan/esan.h b/lib/esan/esan.h
new file mode 100644
index 0000000..371810d
--- /dev/null
+++ b/lib/esan/esan.h
@@ -0,0 +1,59 @@
+//===-- esan.h --------------------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of EfficiencySanitizer, a family of performance tuners.
+//
+// Main internal esan header file.
+//
+// Ground rules:
+//   - C++ run-time should not be used (static CTORs, RTTI, exceptions, static
+//     function-scope locals)
+//   - All functions/classes/etc reside in namespace __esan, except for those
+//     declared in esan_interface_internal.h.
+//   - Platform-specific files should be used instead of ifdefs (*).
+//   - No system headers included in header files (*).
+//   - Platform specific headers included only into platform-specific files (*).
+//
+//  (*) Except when inlining is critical for performance.
+//===----------------------------------------------------------------------===//
+
+#ifndef ESAN_H
+#define ESAN_H
+
+#include "interception/interception.h"
+#include "sanitizer_common/sanitizer_common.h"
+#include "esan_interface_internal.h"
+
+namespace __esan {
+
+extern bool EsanIsInitialized;
+extern bool EsanDuringInit;
+
+void initializeLibrary(ToolType Tool);
+int finalizeLibrary();
+void reportResults();
+// Esan creates the variable per tool per compilation unit at compile time
+// and passes its pointer Ptr to the runtime library.
+void processCompilationUnitInit(void *Ptr);
+void processCompilationUnitExit(void *Ptr);
+void processRangeAccess(uptr PC, uptr Addr, int Size, bool IsWrite);
+void initializeInterceptors();
+
+// Platform-dependent routines.
+void verifyAddressSpace();
+bool fixMmapAddr(void **Addr, SIZE_T Size, int Flags);
+uptr checkMmapResult(uptr Addr, SIZE_T Size);
+// The return value indicates whether to call the real version or not.
+bool processSignal(int SigNum, void (*Handler)(int), void (**Result)(int));
+bool processSigaction(int SigNum, const void *Act, void *OldAct);
+bool processSigprocmask(int How, void *Set, void *OldSet);
+
+} // namespace __esan
+
+#endif // ESAN_H
diff --git a/lib/esan/esan.syms.extra b/lib/esan/esan.syms.extra
new file mode 100644
index 0000000..d6397d4
--- /dev/null
+++ b/lib/esan/esan.syms.extra
@@ -0,0 +1,4 @@
+__esan_init
+__esan_exit
+__esan_aligned*
+__esan_unaligned*
diff --git a/lib/esan/esan_circular_buffer.h b/lib/esan/esan_circular_buffer.h
new file mode 100644
index 0000000..9ce102d
--- /dev/null
+++ b/lib/esan/esan_circular_buffer.h
@@ -0,0 +1,96 @@
+//===-- esan_circular_buffer.h ----------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of EfficiencySanitizer, a family of performance tuners.
+//
+// Circular buffer data structure.
+//===----------------------------------------------------------------------===//
+
+#include "sanitizer_common/sanitizer_common.h"
+
+namespace __esan {
+
+// A circular buffer for POD data whose memory is allocated using mmap.
+// There are two usage models: one is to use initialize/free (for global
+// instances) and the other is to use placement new with the
+// constructor and to call the destructor or free (they are equivalent).
+template<typename T>
+class CircularBuffer {
+ public:
+  // To support global instances we cannot initialize any field in the
+  // default constructor.
+  explicit CircularBuffer() {}
+  CircularBuffer(uptr BufferCapacity) {
+    initialize(BufferCapacity);
+    WasConstructed = true;
+  }
+  ~CircularBuffer() {
+    if (WasConstructed) // Else caller will call free() explicitly.
+      free();
+  }
+  void initialize(uptr BufferCapacity) {
+    Capacity = BufferCapacity;
+    // MmapOrDie rounds up to the page size for us.
+    Data = (T *)MmapOrDie(Capacity * sizeof(T), "CircularBuffer");
+    StartIdx = 0;
+    Count = 0;
+    WasConstructed = false;
+  }
+  void free() {
+    UnmapOrDie(Data, Capacity * sizeof(T));
+  }
+  T &operator[](uptr Idx) {
+    CHECK_LT(Idx, Count);
+    uptr ArrayIdx = (StartIdx + Idx) % Capacity;
+    return Data[ArrayIdx];
+  }
+  const T &operator[](uptr Idx) const {
+    CHECK_LT(Idx, Count);
+    uptr ArrayIdx = (StartIdx + Idx) % Capacity;
+    return Data[ArrayIdx];
+  }
+  void push_back(const T &Item) {
+    CHECK_GT(Capacity, 0);
+    uptr ArrayIdx = (StartIdx + Count) % Capacity;
+    Data[ArrayIdx] = Item;
+    if (Count < Capacity)
+      ++Count;
+    else
+      StartIdx = (StartIdx + 1) % Capacity;
+  }
+  T &back() {
+    CHECK_GT(Count, 0);
+    uptr ArrayIdx = (StartIdx + Count - 1) % Capacity;
+    return Data[ArrayIdx];
+  }
+  void pop_back() {
+    CHECK_GT(Count, 0);
+    --Count;
+  }
+  uptr size() const {
+    return Count;
+  }
+  void clear() {
+    StartIdx = 0;
+    Count = 0;
+  }
+  bool empty() const { return size() == 0; }
+
+ private:
+  CircularBuffer(const CircularBuffer&);
+  void operator=(const CircularBuffer&);
+
+  bool WasConstructed;
+  T *Data;
+  uptr Capacity;
+  uptr StartIdx;
+  uptr Count;
+};
+
+} // namespace __esan
diff --git a/lib/esan/esan_flags.cpp b/lib/esan/esan_flags.cpp
new file mode 100644
index 0000000..3b047e2
--- /dev/null
+++ b/lib/esan/esan_flags.cpp
@@ -0,0 +1,58 @@
+//===-- esan_flags.cpp ------------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of EfficiencySanitizer, a family of performance tuners.
+//
+// Esan flag parsing logic.
+//===----------------------------------------------------------------------===//
+
+#include "esan_flags.h"
+#include "sanitizer_common/sanitizer_common.h"
+#include "sanitizer_common/sanitizer_flag_parser.h"
+#include "sanitizer_common/sanitizer_flags.h"
+
+namespace __esan {
+
+static const char EsanOptsEnv[] = "ESAN_OPTIONS";
+
+Flags EsanFlagsDontUseDirectly;
+
+void Flags::setDefaults() {
+#define ESAN_FLAG(Type, Name, DefaultValue, Description) Name = DefaultValue;
+#include "esan_flags.inc"
+#undef ESAN_FLAG
+}
+
+static void registerEsanFlags(FlagParser *Parser, Flags *F) {
+#define ESAN_FLAG(Type, Name, DefaultValue, Description) \
+  RegisterFlag(Parser, #Name, Description, &F->Name);
+#include "esan_flags.inc"
+#undef ESAN_FLAG
+}
+
+void initializeFlags() {
+  SetCommonFlagsDefaults();
+  Flags *F = getFlags();
+  F->setDefaults();
+
+  FlagParser Parser;
+  registerEsanFlags(&Parser, F);
+  RegisterCommonFlags(&Parser);
+  Parser.ParseString(GetEnv(EsanOptsEnv));
+
+  InitializeCommonFlags();
+  if (Verbosity())
+    ReportUnrecognizedFlags();
+  if (common_flags()->help)
+    Parser.PrintFlagDescriptions();
+
+  __sanitizer_set_report_path(common_flags()->log_path);
+}
+
+} // namespace __esan
diff --git a/lib/esan/esan_flags.h b/lib/esan/esan_flags.h
new file mode 100644
index 0000000..c8f4ef5
--- /dev/null
+++ b/lib/esan/esan_flags.h
@@ -0,0 +1,41 @@
+//===-- esan_flags.h --------------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of EfficiencySanitizer, a family of performance tuners.
+//
+// Esan runtime flags.
+//===----------------------------------------------------------------------===//
+
+#ifndef ESAN_FLAGS_H
+#define ESAN_FLAGS_H
+
+#include "sanitizer_common/sanitizer_internal_defs.h"
+#include "sanitizer_common/sanitizer_flag_parser.h"
+
+namespace __esan {
+
+class Flags {
+public:
+#define ESAN_FLAG(Type, Name, DefaultValue, Description) Type Name;
+#include "esan_flags.inc"
+#undef ESAN_FLAG
+
+  void setDefaults();
+};
+
+extern Flags EsanFlagsDontUseDirectly;
+inline Flags *getFlags() {
+  return &EsanFlagsDontUseDirectly;
+}
+
+void initializeFlags();
+
+} // namespace __esan
+
+#endif // ESAN_FLAGS_H
diff --git a/lib/esan/esan_flags.inc b/lib/esan/esan_flags.inc
new file mode 100644
index 0000000..5687cac
--- /dev/null
+++ b/lib/esan/esan_flags.inc
@@ -0,0 +1,56 @@
+//===-- esan_flags.inc ------------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Esan runtime flags.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ESAN_FLAG
+# error "Define ESAN_FLAG prior to including this file!"
+#endif
+
+// ESAN_FLAG(Type, Name, DefaultValue, Description)
+// See COMMON_FLAG in sanitizer_flags.inc for more details.
+
+//===----------------------------------------------------------------------===//
+// Cross-tool options
+//===----------------------------------------------------------------------===//
+
+ESAN_FLAG(int, cache_line_size, 64,
+          "The number of bytes in a cache line.  For the working-set tool, this "
+          "cannot be changed without also changing the compiler "
+          "instrumentation.")
+
+//===----------------------------------------------------------------------===//
+// Working set tool options
+//===----------------------------------------------------------------------===//
+
+ESAN_FLAG(bool, record_snapshots, true,
+          "Working set tool: whether to sample snapshots during a run.")
+
+// Typical profiling uses a 10ms timer.  Our snapshots take some work
+// to scan memory so we reduce to 20ms.
+// To disable samples, turn off record_snapshots.
+ESAN_FLAG(int, sample_freq, 20,
+          "Working set tool: sampling frequency in milliseconds.")
+
+// This controls the difference in frequency between each successive series
+// of snapshots.  There are 8 in total, with number 0 using sample_freq.
+// Number N samples number N-1 every (1 << snapshot_step) instance of N-1.
+ESAN_FLAG(int, snapshot_step, 2, "Working set tool: the log of the sampling "
+          "performed for the next-higher-frequency snapshot series.")
+
+//===----------------------------------------------------------------------===//
+// Cache Fragmentation tool options
+//===----------------------------------------------------------------------===//
+
+// The difference information of a struct is reported if the struct's difference
+// score is greater than the report_threshold.
+ESAN_FLAG(int, report_threshold, 1<<10, "Cache-frag tool: the struct difference"
+          " score threshold for reporting.")
diff --git a/lib/esan/esan_interceptors.cpp b/lib/esan/esan_interceptors.cpp
new file mode 100644
index 0000000..647f010
--- /dev/null
+++ b/lib/esan/esan_interceptors.cpp
@@ -0,0 +1,547 @@
+//===-- esan_interceptors.cpp ---------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of EfficiencySanitizer, a family of performance tuners.
+//
+// Interception routines for the esan run-time.
+//===----------------------------------------------------------------------===//
+
+#include "esan.h"
+#include "esan_shadow.h"
+#include "interception/interception.h"
+#include "sanitizer_common/sanitizer_common.h"
+#include "sanitizer_common/sanitizer_libc.h"
+#include "sanitizer_common/sanitizer_linux.h"
+#include "sanitizer_common/sanitizer_stacktrace.h"
+
+using namespace __esan; // NOLINT
+
+#define CUR_PC() (StackTrace::GetCurrentPc())
+
+//===----------------------------------------------------------------------===//
+// Interception via sanitizer common interceptors
+//===----------------------------------------------------------------------===//
+
+// Get the per-platform defines for what is possible to intercept
+#include "sanitizer_common/sanitizer_platform_interceptors.h"
+
+// TODO(bruening): tsan disables several interceptors (getpwent, etc.) claiming
+// that interception is a perf hit: should we do the same?
+
+// We have no need to intercept:
+#undef SANITIZER_INTERCEPT_TLS_GET_ADDR
+
+// TODO(bruening): the common realpath interceptor assumes malloc is
+// intercepted!  We should try to parametrize that, though we'll
+// intercept malloc soon ourselves and can then remove this undef.
+#undef SANITIZER_INTERCEPT_REALPATH
+
+// We provide our own version:
+#undef SANITIZER_INTERCEPT_SIGPROCMASK
+
+#define COMMON_INTERCEPTOR_NOTHING_IS_INITIALIZED (!EsanIsInitialized)
+
+#define COMMON_INTERCEPT_FUNCTION(name) INTERCEPT_FUNCTION(name)
+#define COMMON_INTERCEPT_FUNCTION_VER(name, ver)                          \
+  INTERCEPT_FUNCTION_VER(name, ver)
+
+// We must initialize during early interceptors, to support tcmalloc.
+// This means that for some apps we fully initialize prior to
+// __esan_init() being called.
+// We currently do not use ctx.
+#define COMMON_INTERCEPTOR_ENTER(ctx, func, ...)                               \
+  do {                                                                         \
+    if (UNLIKELY(COMMON_INTERCEPTOR_NOTHING_IS_INITIALIZED)) {                 \
+      if (!UNLIKELY(EsanDuringInit))                                           \
+        initializeLibrary(__esan_which_tool);                                  \
+      return REAL(func)(__VA_ARGS__);                                          \
+    }                                                                          \
+    ctx = nullptr;                                                             \
+    (void)ctx;                                                                 \
+  } while (false)
+
+#define COMMON_INTERCEPTOR_ENTER_NOIGNORE(ctx, func, ...)                      \
+  COMMON_INTERCEPTOR_ENTER(ctx, func, __VA_ARGS__)
+
+#define COMMON_INTERCEPTOR_WRITE_RANGE(ctx, ptr, size)                         \
+  processRangeAccess(CUR_PC(), (uptr)ptr, size, true)
+
+#define COMMON_INTERCEPTOR_READ_RANGE(ctx, ptr, size)                          \
+  processRangeAccess(CUR_PC(), (uptr)ptr, size, false)
+
+// This is only called if the app explicitly calls exit(), not on
+// a normal exit.
+#define COMMON_INTERCEPTOR_ON_EXIT(ctx) finalizeLibrary()
+
+#define COMMON_INTERCEPTOR_FILE_OPEN(ctx, file, path)                          \
+  do {                                                                         \
+    (void)(ctx);                                                               \
+    (void)(file);                                                              \
+    (void)(path);                                                              \
+  } while (false)
+#define COMMON_INTERCEPTOR_FILE_CLOSE(ctx, file)                               \
+  do {                                                                         \
+    (void)(ctx);                                                               \
+    (void)(file);                                                              \
+  } while (false)
+#define COMMON_INTERCEPTOR_LIBRARY_LOADED(filename, handle)                    \
+  do {                                                                         \
+    (void)(filename);                                                          \
+    (void)(handle);                                                            \
+  } while (false)
+#define COMMON_INTERCEPTOR_LIBRARY_UNLOADED()                                  \
+  do {                                                                         \
+  } while (false)
+#define COMMON_INTERCEPTOR_ACQUIRE(ctx, u)                                     \
+  do {                                                                         \
+    (void)(ctx);                                                               \
+    (void)(u);                                                                 \
+  } while (false)
+#define COMMON_INTERCEPTOR_RELEASE(ctx, u)                                     \
+  do {                                                                         \
+    (void)(ctx);                                                               \
+    (void)(u);                                                                 \
+  } while (false)
+#define COMMON_INTERCEPTOR_DIR_ACQUIRE(ctx, path)                              \
+  do {                                                                         \
+    (void)(ctx);                                                               \
+    (void)(path);                                                              \
+  } while (false)
+#define COMMON_INTERCEPTOR_FD_ACQUIRE(ctx, fd)                                 \
+  do {                                                                         \
+    (void)(ctx);                                                               \
+    (void)(fd);                                                                \
+  } while (false)
+#define COMMON_INTERCEPTOR_FD_RELEASE(ctx, fd)                                 \
+  do {                                                                         \
+    (void)(ctx);                                                               \
+    (void)(fd);                                                                \
+  } while (false)
+#define COMMON_INTERCEPTOR_FD_ACCESS(ctx, fd)                                  \
+  do {                                                                         \
+    (void)(ctx);                                                               \
+    (void)(fd);                                                                \
+  } while (false)
+#define COMMON_INTERCEPTOR_FD_SOCKET_ACCEPT(ctx, fd, newfd)                    \
+  do {                                                                         \
+    (void)(ctx);                                                               \
+    (void)(fd);                                                                \
+    (void)(newfd);                                                             \
+  } while (false)
+#define COMMON_INTERCEPTOR_SET_THREAD_NAME(ctx, name)                          \
+  do {                                                                         \
+    (void)(ctx);                                                               \
+    (void)(name);                                                              \
+  } while (false)
+#define COMMON_INTERCEPTOR_SET_PTHREAD_NAME(ctx, thread, name)                 \
+  do {                                                                         \
+    (void)(ctx);                                                               \
+    (void)(thread);                                                            \
+    (void)(name);                                                              \
+  } while (false)
+#define COMMON_INTERCEPTOR_BLOCK_REAL(name) REAL(name)
+#define COMMON_INTERCEPTOR_MUTEX_LOCK(ctx, m)                                  \
+  do {                                                                         \
+    (void)(ctx);                                                               \
+    (void)(m);                                                                 \
+  } while (false)
+#define COMMON_INTERCEPTOR_MUTEX_UNLOCK(ctx, m)                                \
+  do {                                                                         \
+    (void)(ctx);                                                               \
+    (void)(m);                                                                 \
+  } while (false)
+#define COMMON_INTERCEPTOR_MUTEX_REPAIR(ctx, m)                                \
+  do {                                                                         \
+    (void)(ctx);                                                               \
+    (void)(m);                                                                 \
+  } while (false)
+#define COMMON_INTERCEPTOR_HANDLE_RECVMSG(ctx, msg)                            \
+  do {                                                                         \
+    (void)(ctx);                                                               \
+    (void)(msg);                                                               \
+  } while (false)
+#define COMMON_INTERCEPTOR_USER_CALLBACK_START()                               \
+  do {                                                                         \
+  } while (false)
+#define COMMON_INTERCEPTOR_USER_CALLBACK_END()                                 \
+  do {                                                                         \
+  } while (false)
+
+#include "sanitizer_common/sanitizer_common_interceptors.inc"
+
+//===----------------------------------------------------------------------===//
+// Syscall interception
+//===----------------------------------------------------------------------===//
+
+// We want the caller's PC b/c unlike the other function interceptors these
+// are separate pre and post functions called around the app's syscall().
+
+#define COMMON_SYSCALL_PRE_READ_RANGE(ptr, size)                               \
+  processRangeAccess(GET_CALLER_PC(), (uptr)ptr, size, false)
+
+#define COMMON_SYSCALL_PRE_WRITE_RANGE(ptr, size)                              \
+  do {                                                                         \
+    (void)(ptr);                                                               \
+    (void)(size);                                                              \
+  } while (false)
+
+#define COMMON_SYSCALL_POST_READ_RANGE(ptr, size)                              \
+  do {                                                                         \
+    (void)(ptr);                                                               \
+    (void)(size);                                                              \
+  } while (false)
+
+// The actual amount written is in post, not pre.
+#define COMMON_SYSCALL_POST_WRITE_RANGE(ptr, size)                             \
+  processRangeAccess(GET_CALLER_PC(), (uptr)ptr, size, true)
+
+#define COMMON_SYSCALL_ACQUIRE(addr)                                           \
+  do {                                                                         \
+    (void)(addr);                                                              \
+  } while (false)
+#define COMMON_SYSCALL_RELEASE(addr)                                           \
+  do {                                                                         \
+    (void)(addr);                                                              \
+  } while (false)
+#define COMMON_SYSCALL_FD_CLOSE(fd)                                            \
+  do {                                                                         \
+    (void)(fd);                                                                \
+  } while (false)
+#define COMMON_SYSCALL_FD_ACQUIRE(fd)                                          \
+  do {                                                                         \
+    (void)(fd);                                                                \
+  } while (false)
+#define COMMON_SYSCALL_FD_RELEASE(fd)                                          \
+  do {                                                                         \
+    (void)(fd);                                                                \
+  } while (false)
+#define COMMON_SYSCALL_PRE_FORK()                                              \
+  do {                                                                         \
+  } while (false)
+#define COMMON_SYSCALL_POST_FORK(res)                                          \
+  do {                                                                         \
+    (void)(res);                                                               \
+  } while (false)
+
+#include "sanitizer_common/sanitizer_common_syscalls.inc"
+
+//===----------------------------------------------------------------------===//
+// Custom interceptors
+//===----------------------------------------------------------------------===//
+
+// TODO(bruening): move more of these to the common interception pool as they
+// are shared with tsan and asan.
+// While our other files match LLVM style, here we match sanitizer style as we
+// expect to move these to the common pool.
+
+INTERCEPTOR(char *, strcpy, char *dst, const char *src) { // NOLINT
+  void *ctx;
+  COMMON_INTERCEPTOR_ENTER(ctx, strcpy, dst, src);
+  uptr srclen = internal_strlen(src);
+  COMMON_INTERCEPTOR_WRITE_RANGE(ctx, dst, srclen + 1);
+  COMMON_INTERCEPTOR_READ_RANGE(ctx, src, srclen + 1);
+  return REAL(strcpy)(dst, src); // NOLINT
+}
+
+INTERCEPTOR(char *, strncpy, char *dst, char *src, uptr n) {
+  void *ctx;
+  COMMON_INTERCEPTOR_ENTER(ctx, strncpy, dst, src, n);
+  uptr srclen = internal_strnlen(src, n);
+  uptr copied_size = srclen + 1 > n ? n : srclen + 1;
+  COMMON_INTERCEPTOR_WRITE_RANGE(ctx, dst, copied_size);
+  COMMON_INTERCEPTOR_READ_RANGE(ctx, src, copied_size);
+  return REAL(strncpy)(dst, src, n);
+}
+
+INTERCEPTOR(int, open, const char *name, int flags, int mode) {
+  void *ctx;
+  COMMON_INTERCEPTOR_ENTER(ctx, open, name, flags, mode);
+  COMMON_INTERCEPTOR_READ_STRING(ctx, name, 0);
+  return REAL(open)(name, flags, mode);
+}
+
+#if SANITIZER_LINUX
+INTERCEPTOR(int, open64, const char *name, int flags, int mode) {
+  void *ctx;
+  COMMON_INTERCEPTOR_ENTER(ctx, open64, name, flags, mode);
+  COMMON_INTERCEPTOR_READ_STRING(ctx, name, 0);
+  return REAL(open64)(name, flags, mode);
+}
+#define ESAN_MAYBE_INTERCEPT_OPEN64 INTERCEPT_FUNCTION(open64)
+#else
+#define ESAN_MAYBE_INTERCEPT_OPEN64
+#endif
+
+INTERCEPTOR(int, creat, const char *name, int mode) {
+  void *ctx;
+  COMMON_INTERCEPTOR_ENTER(ctx, creat, name, mode);
+  COMMON_INTERCEPTOR_READ_STRING(ctx, name, 0);
+  return REAL(creat)(name, mode);
+}
+
+#if SANITIZER_LINUX
+INTERCEPTOR(int, creat64, const char *name, int mode) {
+  void *ctx;
+  COMMON_INTERCEPTOR_ENTER(ctx, creat64, name, mode);
+  COMMON_INTERCEPTOR_READ_STRING(ctx, name, 0);
+  return REAL(creat64)(name, mode);
+}
+#define ESAN_MAYBE_INTERCEPT_CREAT64 INTERCEPT_FUNCTION(creat64)
+#else
+#define ESAN_MAYBE_INTERCEPT_CREAT64
+#endif
+
+INTERCEPTOR(int, unlink, char *path) {
+  void *ctx;
+  COMMON_INTERCEPTOR_ENTER(ctx, unlink, path);
+  COMMON_INTERCEPTOR_READ_STRING(ctx, path, 0);
+  return REAL(unlink)(path);
+}
+
+INTERCEPTOR(uptr, fread, void *ptr, uptr size, uptr nmemb, void *f) {
+  void *ctx;
+  COMMON_INTERCEPTOR_ENTER(ctx, fread, ptr, size, nmemb, f);
+  COMMON_INTERCEPTOR_WRITE_RANGE(ctx, ptr, size * nmemb);
+  return REAL(fread)(ptr, size, nmemb, f);
+}
+
+INTERCEPTOR(uptr, fwrite, const void *p, uptr size, uptr nmemb, void *f) {
+  void *ctx;
+  COMMON_INTERCEPTOR_ENTER(ctx, fwrite, p, size, nmemb, f);
+  COMMON_INTERCEPTOR_READ_RANGE(ctx, p, size * nmemb);
+  return REAL(fwrite)(p, size, nmemb, f);
+}
+
+INTERCEPTOR(int, puts, const char *s) {
+  void *ctx;
+  COMMON_INTERCEPTOR_ENTER(ctx, puts, s);
+  COMMON_INTERCEPTOR_READ_RANGE(ctx, s, internal_strlen(s));
+  return REAL(puts)(s);
+}
+
+INTERCEPTOR(int, rmdir, char *path) {
+  void *ctx;
+  COMMON_INTERCEPTOR_ENTER(ctx, rmdir, path);
+  COMMON_INTERCEPTOR_READ_STRING(ctx, path, 0);
+  return REAL(rmdir)(path);
+}
+
+//===----------------------------------------------------------------------===//
+// Shadow-related interceptors
+//===----------------------------------------------------------------------===//
+
+// These are candidates for sharing with all sanitizers if shadow memory
+// support is also standardized.
+
+INTERCEPTOR(void *, mmap, void *addr, SIZE_T sz, int prot, int flags,
+                 int fd, OFF_T off) {
+  if (UNLIKELY(REAL(mmap) == nullptr)) {
+    // With esan init during interceptor init and a static libc preventing
+    // our early-calloc from triggering, we can end up here before our
+    // REAL pointer is set up.
+    return (void *)internal_mmap(addr, sz, prot, flags, fd, off);
+  }
+  void *ctx;
+  COMMON_INTERCEPTOR_ENTER(ctx, mmap, addr, sz, prot, flags, fd, off);
+  if (!fixMmapAddr(&addr, sz, flags))
+    return (void *)-1;
+  void *result = REAL(mmap)(addr, sz, prot, flags, fd, off);
+  return (void *)checkMmapResult((uptr)result, sz);
+}
+
+#if SANITIZER_LINUX
+INTERCEPTOR(void *, mmap64, void *addr, SIZE_T sz, int prot, int flags,
+                 int fd, OFF64_T off) {
+  void *ctx;
+  COMMON_INTERCEPTOR_ENTER(ctx, mmap64, addr, sz, prot, flags, fd, off);
+  if (!fixMmapAddr(&addr, sz, flags))
+    return (void *)-1;
+  void *result = REAL(mmap64)(addr, sz, prot, flags, fd, off);
+  return (void *)checkMmapResult((uptr)result, sz);
+}
+#define ESAN_MAYBE_INTERCEPT_MMAP64 INTERCEPT_FUNCTION(mmap64)
+#else
+#define ESAN_MAYBE_INTERCEPT_MMAP64
+#endif
+
+//===----------------------------------------------------------------------===//
+// Signal-related interceptors
+//===----------------------------------------------------------------------===//
+
+#if SANITIZER_LINUX
+typedef void (*signal_handler_t)(int);
+INTERCEPTOR(signal_handler_t, signal, int signum, signal_handler_t handler) {
+  void *ctx;
+  COMMON_INTERCEPTOR_ENTER(ctx, signal, signum, handler);
+  signal_handler_t result;
+  if (!processSignal(signum, handler, &result))
+    return result;
+  else
+    return REAL(signal)(signum, handler);
+}
+#define ESAN_MAYBE_INTERCEPT_SIGNAL INTERCEPT_FUNCTION(signal)
+#else
+#error Platform not supported
+#define ESAN_MAYBE_INTERCEPT_SIGNAL
+#endif
+
+#if SANITIZER_LINUX
+DECLARE_REAL(int, sigaction, int signum, const struct sigaction *act,
+             struct sigaction *oldact)
+INTERCEPTOR(int, sigaction, int signum, const struct sigaction *act,
+            struct sigaction *oldact) {
+  void *ctx;
+  COMMON_INTERCEPTOR_ENTER(ctx, sigaction, signum, act, oldact);
+  if (!processSigaction(signum, act, oldact))
+    return 0;
+  else
+    return REAL(sigaction)(signum, act, oldact);
+}
+
+// This is required to properly use internal_sigaction.
+namespace __sanitizer {
+int real_sigaction(int signum, const void *act, void *oldact) {
+  if (REAL(sigaction) == nullptr) {
+    // With an instrumented allocator, this is called during interceptor init
+    // and we need a raw syscall solution.
+    return internal_sigaction_syscall(signum, act, oldact);
+  }
+  return REAL(sigaction)(signum, (const struct sigaction *)act,
+                         (struct sigaction *)oldact);
+}
+} // namespace __sanitizer
+
+#define ESAN_MAYBE_INTERCEPT_SIGACTION INTERCEPT_FUNCTION(sigaction)
+#else
+#error Platform not supported
+#define ESAN_MAYBE_INTERCEPT_SIGACTION
+#endif
+
+#if SANITIZER_LINUX
+INTERCEPTOR(int, sigprocmask, int how, __sanitizer_sigset_t *set,
+            __sanitizer_sigset_t *oldset) {
+  void *ctx;
+  COMMON_INTERCEPTOR_ENTER(ctx, sigprocmask, how, set, oldset);
+  int res = 0;
+  if (processSigprocmask(how, set, oldset))
+    res = REAL(sigprocmask)(how, set, oldset);
+  if (!res && oldset)
+    COMMON_INTERCEPTOR_WRITE_RANGE(ctx, oldset, sizeof(*oldset));
+  return res;
+}
+#define ESAN_MAYBE_INTERCEPT_SIGPROCMASK INTERCEPT_FUNCTION(sigprocmask)
+#else
+#define ESAN_MAYBE_INTERCEPT_SIGPROCMASK
+#endif
+
+#if !SANITIZER_WINDOWS
+INTERCEPTOR(int, pthread_sigmask, int how, __sanitizer_sigset_t *set,
+            __sanitizer_sigset_t *oldset) {
+  void *ctx;
+  COMMON_INTERCEPTOR_ENTER(ctx, pthread_sigmask, how, set, oldset);
+  int res = 0;
+  if (processSigprocmask(how, set, oldset))
+    res = REAL(pthread_sigmask)(how, set, oldset);
+  if (!res && oldset)
+    COMMON_INTERCEPTOR_WRITE_RANGE(ctx, oldset, sizeof(*oldset));
+  return res;
+}
+#define ESAN_MAYBE_INTERCEPT_PTHREAD_SIGMASK INTERCEPT_FUNCTION(pthread_sigmask)
+#else
+#define ESAN_MAYBE_INTERCEPT_PTHREAD_SIGMASK
+#endif
+
+//===----------------------------------------------------------------------===//
+// Malloc interceptors
+//===----------------------------------------------------------------------===//
+
+static char early_alloc_buf[128];
+static bool used_early_alloc_buf;
+
+static void *handleEarlyAlloc(uptr size) {
+  // If esan is initialized during an interceptor (which happens with some
+  // tcmalloc implementations that call pthread_mutex_lock), the call from
+  // dlsym to calloc will deadlock.  There is only one such calloc (dlsym
+  // allocates a single pthread key), so we work around it by using a
+  // static buffer for the calloc request.  The loader currently needs
+  // 32 bytes but we size at 128 to allow for future changes.
+  // This solution will also allow us to deliberately intercept malloc & family
+  // in the future (to perform tool actions on each allocation, without
+  // replacing the allocator), as it also solves the problem of intercepting
+  // calloc when it will itself be called before its REAL pointer is
+  // initialized.
+  CHECK(!used_early_alloc_buf && size < sizeof(early_alloc_buf));
+  // We do not handle multiple threads here.  This only happens at process init
+  // time, and while it's possible for a shared library to create early threads
+  // that race here, we consider that to be a corner case extreme enough that
+  // it's not worth the effort to handle.
+  used_early_alloc_buf = true;
+  return (void *)early_alloc_buf;
+}
+
+INTERCEPTOR(void*, calloc, uptr size, uptr n) {
+  if (EsanDuringInit && REAL(calloc) == nullptr)
+    return handleEarlyAlloc(size * n);
+  void *ctx;
+  COMMON_INTERCEPTOR_ENTER(ctx, calloc, size, n);
+  void *res = REAL(calloc)(size, n);
+  // The memory is zeroed and thus is all written.
+  COMMON_INTERCEPTOR_WRITE_RANGE(nullptr, (uptr)res, size * n);
+  return res;
+}
+
+INTERCEPTOR(void, free, void *p) {
+  void *ctx;
+  COMMON_INTERCEPTOR_ENTER(ctx, free, p);
+  if (p == (void *)early_alloc_buf) {
+    // We expect just a singleton use but we clear this for cleanliness.
+    used_early_alloc_buf = false;
+    return;
+  }
+  REAL(free)(p);
+}
+
+namespace __esan {
+
+void initializeInterceptors() {
+  InitializeCommonInterceptors();
+
+  INTERCEPT_FUNCTION(strcpy); // NOLINT
+  INTERCEPT_FUNCTION(strncpy);
+
+  INTERCEPT_FUNCTION(open);
+  ESAN_MAYBE_INTERCEPT_OPEN64;
+  INTERCEPT_FUNCTION(creat);
+  ESAN_MAYBE_INTERCEPT_CREAT64;
+  INTERCEPT_FUNCTION(unlink);
+  INTERCEPT_FUNCTION(fread);
+  INTERCEPT_FUNCTION(fwrite);
+  INTERCEPT_FUNCTION(puts);
+  INTERCEPT_FUNCTION(rmdir);
+
+  INTERCEPT_FUNCTION(mmap);
+  ESAN_MAYBE_INTERCEPT_MMAP64;
+
+  ESAN_MAYBE_INTERCEPT_SIGNAL;
+  ESAN_MAYBE_INTERCEPT_SIGACTION;
+  ESAN_MAYBE_INTERCEPT_SIGPROCMASK;
+  ESAN_MAYBE_INTERCEPT_PTHREAD_SIGMASK;
+
+  INTERCEPT_FUNCTION(calloc);
+  INTERCEPT_FUNCTION(free);
+
+  // TODO(bruening): intercept routines that other sanitizers intercept that
+  // are not in the common pool or here yet, ideally by adding to the common
+  // pool.  Examples include wcslen and bcopy.
+
+  // TODO(bruening): there are many more libc routines that read or write data
+  // structures that no sanitizer is intercepting: sigaction, strtol, etc.
+}
+
+} // namespace __esan
diff --git a/lib/esan/esan_interface.cpp b/lib/esan/esan_interface.cpp
new file mode 100644
index 0000000..8a64d15
--- /dev/null
+++ b/lib/esan/esan_interface.cpp
@@ -0,0 +1,118 @@
+//===-- esan_interface.cpp ------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of EfficiencySanitizer, a family of performance tuners.
+//
+//===----------------------------------------------------------------------===//
+
+#include "esan_interface_internal.h"
+#include "esan.h"
+#include "sanitizer_common/sanitizer_internal_defs.h"
+
+using namespace __esan; // NOLINT
+
+void __esan_init(ToolType Tool, void *Ptr) {
+  if (Tool != __esan_which_tool) {
+    Printf("ERROR: tool mismatch: %d vs %d\n", Tool, __esan_which_tool);
+    Die();
+  }
+  initializeLibrary(Tool);
+  processCompilationUnitInit(Ptr);
+}
+
+void __esan_exit(void *Ptr) {
+  processCompilationUnitExit(Ptr);
+}
+
+void __esan_aligned_load1(void *Addr) {
+  processRangeAccess(GET_CALLER_PC(), (uptr)Addr, 1, false);
+}
+
+void __esan_aligned_load2(void *Addr) {
+  processRangeAccess(GET_CALLER_PC(), (uptr)Addr, 2, false);
+}
+
+void __esan_aligned_load4(void *Addr) {
+  processRangeAccess(GET_CALLER_PC(), (uptr)Addr, 4, false);
+}
+
+void __esan_aligned_load8(void *Addr) {
+  processRangeAccess(GET_CALLER_PC(), (uptr)Addr, 8, false);
+}
+
+void __esan_aligned_load16(void *Addr) {
+  processRangeAccess(GET_CALLER_PC(), (uptr)Addr, 16, false);
+}
+
+void __esan_aligned_store1(void *Addr) {
+  processRangeAccess(GET_CALLER_PC(), (uptr)Addr, 1, true);
+}
+
+void __esan_aligned_store2(void *Addr) {
+  processRangeAccess(GET_CALLER_PC(), (uptr)Addr, 2, true);
+}
+
+void __esan_aligned_store4(void *Addr) {
+  processRangeAccess(GET_CALLER_PC(), (uptr)Addr, 4, true);
+}
+
+void __esan_aligned_store8(void *Addr) {
+  processRangeAccess(GET_CALLER_PC(), (uptr)Addr, 8, true);
+}
+
+void __esan_aligned_store16(void *Addr) {
+  processRangeAccess(GET_CALLER_PC(), (uptr)Addr, 16, true);
+}
+
+void __esan_unaligned_load2(void *Addr) {
+  processRangeAccess(GET_CALLER_PC(), (uptr)Addr, 2, false);
+}
+
+void __esan_unaligned_load4(void *Addr) {
+  processRangeAccess(GET_CALLER_PC(), (uptr)Addr, 4, false);
+}
+
+void __esan_unaligned_load8(void *Addr) {
+  processRangeAccess(GET_CALLER_PC(), (uptr)Addr, 8, false);
+}
+
+void __esan_unaligned_load16(void *Addr) {
+  processRangeAccess(GET_CALLER_PC(), (uptr)Addr, 16, false);
+}
+
+void __esan_unaligned_store2(void *Addr) {
+  processRangeAccess(GET_CALLER_PC(), (uptr)Addr, 2, true);
+}
+
+void __esan_unaligned_store4(void *Addr) {
+  processRangeAccess(GET_CALLER_PC(), (uptr)Addr, 4, true);
+}
+
+void __esan_unaligned_store8(void *Addr) {
+  processRangeAccess(GET_CALLER_PC(), (uptr)Addr, 8, true);
+}
+
+void __esan_unaligned_store16(void *Addr) {
+  processRangeAccess(GET_CALLER_PC(), (uptr)Addr, 16, true);
+}
+
+void __esan_unaligned_loadN(void *Addr, uptr Size) {
+  processRangeAccess(GET_CALLER_PC(), (uptr)Addr, Size, false);
+}
+
+void __esan_unaligned_storeN(void *Addr, uptr Size) {
+  processRangeAccess(GET_CALLER_PC(), (uptr)Addr, Size, true);
+}
+
+// Public interface:
+extern "C" {
+SANITIZER_INTERFACE_ATTRIBUTE void __esan_report() {
+  reportResults();
+}
+} // extern "C"
diff --git a/lib/esan/esan_interface_internal.h b/lib/esan/esan_interface_internal.h
new file mode 100644
index 0000000..3b915d0
--- /dev/null
+++ b/lib/esan/esan_interface_internal.h
@@ -0,0 +1,80 @@
+//===-- esan_interface_internal.h -------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of EfficiencySanitizer, a family of performance tuners.
+//
+// Calls to the functions declared in this header will be inserted by
+// the instrumentation module.
+//===----------------------------------------------------------------------===//
+
+#ifndef ESAN_INTERFACE_INTERNAL_H
+#define ESAN_INTERFACE_INTERNAL_H
+
+#include <sanitizer_common/sanitizer_internal_defs.h>
+
+// This header should NOT include any other headers.
+// All functions in this header are extern "C" and start with __esan_.
+
+extern "C" {
+
+// This should be kept consistent with LLVM's EfficiencySanitizerOptions.
+// The value is passed as a 32-bit integer by the compiler.
+typedef enum Type : u32 {
+  ESAN_None = 0,
+  ESAN_CacheFrag,
+  ESAN_WorkingSet,
+  ESAN_Max,
+} ToolType;
+
+// To handle interceptors that invoke instrumented code prior to
+// __esan_init() being called, the instrumentation module creates this
+// global variable specifying the tool.
+extern ToolType __esan_which_tool;
+
+// This function should be called at the very beginning of the process,
+// before any instrumented code is executed and before any call to malloc.
+SANITIZER_INTERFACE_ATTRIBUTE void __esan_init(ToolType Tool, void *Ptr);
+SANITIZER_INTERFACE_ATTRIBUTE void __esan_exit(void *Ptr);
+
+// The instrumentation module will insert a call to one of these routines prior
+// to each load and store instruction for which we do not have "fastpath"
+// inlined instrumentation.  These calls constitute the "slowpath" for our
+// tools.  We have separate routines for each type of memory access to enable
+// targeted optimization.
+SANITIZER_INTERFACE_ATTRIBUTE void __esan_aligned_load1(void *Addr);
+SANITIZER_INTERFACE_ATTRIBUTE void __esan_aligned_load2(void *Addr);
+SANITIZER_INTERFACE_ATTRIBUTE void __esan_aligned_load4(void *Addr);
+SANITIZER_INTERFACE_ATTRIBUTE void __esan_aligned_load8(void *Addr);
+SANITIZER_INTERFACE_ATTRIBUTE void __esan_aligned_load16(void *Addr);
+
+SANITIZER_INTERFACE_ATTRIBUTE void __esan_aligned_store1(void *Addr);
+SANITIZER_INTERFACE_ATTRIBUTE void __esan_aligned_store2(void *Addr);
+SANITIZER_INTERFACE_ATTRIBUTE void __esan_aligned_store4(void *Addr);
+SANITIZER_INTERFACE_ATTRIBUTE void __esan_aligned_store8(void *Addr);
+SANITIZER_INTERFACE_ATTRIBUTE void __esan_aligned_store16(void *Addr);
+
+SANITIZER_INTERFACE_ATTRIBUTE void __esan_unaligned_load2(void *Addr);
+SANITIZER_INTERFACE_ATTRIBUTE void __esan_unaligned_load4(void *Addr);
+SANITIZER_INTERFACE_ATTRIBUTE void __esan_unaligned_load8(void *Addr);
+SANITIZER_INTERFACE_ATTRIBUTE void __esan_unaligned_load16(void *Addr);
+
+SANITIZER_INTERFACE_ATTRIBUTE void __esan_unaligned_store2(void *Addr);
+SANITIZER_INTERFACE_ATTRIBUTE void __esan_unaligned_store4(void *Addr);
+SANITIZER_INTERFACE_ATTRIBUTE void __esan_unaligned_store8(void *Addr);
+SANITIZER_INTERFACE_ATTRIBUTE void __esan_unaligned_store16(void *Addr);
+
+// These cover unusually-sized accesses.
+SANITIZER_INTERFACE_ATTRIBUTE
+void __esan_unaligned_loadN(void *Addr, uptr Size);
+SANITIZER_INTERFACE_ATTRIBUTE
+void __esan_unaligned_storeN(void *Addr, uptr Size);
+
+} // extern "C"
+
+#endif // ESAN_INTERFACE_INTERNAL_H
diff --git a/lib/esan/esan_linux.cpp b/lib/esan/esan_linux.cpp
new file mode 100644
index 0000000..aa961b6
--- /dev/null
+++ b/lib/esan/esan_linux.cpp
@@ -0,0 +1,83 @@
+//===-- esan_linux.cpp ----------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of EfficiencySanitizer, a family of performance tuners.
+//
+// Linux-specific code for the Esan run-time.
+//===----------------------------------------------------------------------===//
+
+#include "sanitizer_common/sanitizer_platform.h"
+#if SANITIZER_FREEBSD || SANITIZER_LINUX
+
+#include "esan.h"
+#include "esan_shadow.h"
+#include "interception/interception.h"
+#include "sanitizer_common/sanitizer_common.h"
+#include <sys/mman.h>
+#include <errno.h>
+
+namespace __esan {
+
+void verifyAddressSpace() {
+#if SANITIZER_LINUX && defined(__x86_64__)
+  // The kernel determines its mmap base from the stack size limit.
+  // Our Linux 64-bit shadow mapping assumes the stack limit is less than a
+// terabyte, which keeps the mmap region above 0x7e00'00000000.
+  uptr StackLimit = GetStackSizeLimitInBytes();
+  if (StackSizeIsUnlimited() || StackLimit > MaxStackSize) {
+    VReport(1, "The stack size limit is beyond the maximum supported.\n"
+            "Re-execing with a stack size below 1TB.\n");
+    SetStackSizeLimitInBytes(MaxStackSize);
+    ReExec();
+  }
+#endif
+}
+
+static bool liesWithinSingleAppRegion(uptr Start, SIZE_T Size) {
+  uptr AppStart, AppEnd;
+  for (int i = 0; getAppRegion(i, &AppStart, &AppEnd); ++i) {
+    if (Start >= AppStart && Start + Size - 1 <= AppEnd) {
+      return true;
+    }
+  }
+  return false;
+}
+
+bool fixMmapAddr(void **Addr, SIZE_T Size, int Flags) {
+  if (*Addr) {
+    if (!liesWithinSingleAppRegion((uptr)*Addr, Size)) {
+      VPrintf(1, "mmap conflict: [%p-%p) is not in an app region\n",
+              *Addr, (uptr)*Addr + Size);
+      if (Flags & MAP_FIXED) {
+        errno = EINVAL;
+        return false;
+      } else {
+        *Addr = 0;
+      }
+    }
+  }
+  return true;
+}
+
+uptr checkMmapResult(uptr Addr, SIZE_T Size) {
+  if ((void *)Addr == MAP_FAILED)
+    return Addr;
+  if (!liesWithinSingleAppRegion(Addr, Size)) {
+    // FIXME: attempt to dynamically add this as an app region if it
+    // fits our shadow criteria.
+    // We could also try to remap somewhere else.
+    Printf("ERROR: unsupported mapping at [%p-%p)\n", Addr, Addr+Size);
+    Die();
+  }
+  return Addr;
+}
+
+} // namespace __esan
+
+#endif // SANITIZER_FREEBSD || SANITIZER_LINUX
diff --git a/lib/esan/esan_shadow.h b/lib/esan/esan_shadow.h
new file mode 100644
index 0000000..f8f154e
--- /dev/null
+++ b/lib/esan/esan_shadow.h
@@ -0,0 +1,203 @@
+//===-- esan_shadow.h -------------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of EfficiencySanitizer, a family of performance tuners.
+//
+// Shadow memory mappings for the esan run-time.
+//===----------------------------------------------------------------------===//
+
+#ifndef ESAN_SHADOW_H
+#define ESAN_SHADOW_H
+
+#include <sanitizer_common/sanitizer_platform.h>
+
+#if SANITIZER_WORDSIZE != 64
+#error Only 64-bit is supported
+#endif
+
+namespace __esan {
+
+#if SANITIZER_LINUX && defined(__x86_64__)
+// Linux x86_64
+//
+// Application memory falls into these 5 regions (ignoring the corner case
+// of PIE with a non-zero PT_LOAD base):
+//
+// [0x00000000'00000000, 0x00000100'00000000) non-PIE + heap
+// [0x00005500'00000000, 0x00005700'00000000) PIE
+// [0x00007e00'00000000, 0x00007fff'ff600000) libraries + stack, part 1
+// [0x00007fff'ff601000, 0x00008000'00000000) libraries + stack, part 2
+// [0xffffffff'ff600000, 0xffffffff'ff601000) vsyscall
+//
+// Although we can ignore the vsyscall for the most part as there are few data
+// references there (other sanitizers ignore it), we enforce a gap inside the
+// library region to distinguish the vsyscall's shadow, considering this gap to
+// be an invalid app region.
+// We disallow application memory outside of those 5 regions.
+// Our regions assume that the stack rlimit is less than a terabyte (otherwise
+// the Linux kernel's default mmap region drops below 0x7e00'00000000), which we enforce
+// at init time (we can support larger and unlimited sizes for shadow
+// scaledowns, but it is difficult for 1:1 mappings).
+//
+// Our shadow memory is scaled from a 1:1 mapping and supports a scale
+// specified at library initialization time that can be any power-of-2
+// scaledown (1x, 2x, 4x, 8x, 16x, etc.).
+//
+// We model our shadow memory after Umbra, a library used by the Dr. Memory
+// tool: https://github.com/DynamoRIO/drmemory/blob/master/umbra/umbra_x64.c.
+// We use Umbra's scheme as it was designed to support different
+// offsets, it supports two different shadow mappings (which we may want to
+// use for future tools), and it ensures that the shadow of a shadow will
+// not overlap either shadow memory or application memory.
+//
+// This formula translates from application memory to shadow memory:
+//
+//   shadow(app) = ((app & 0x00000fff'ffffffff) + offset) >> scale
+//
+// Where the offset for 1:1 is 0x00001300'00000000.  For other scales, the
+// offset is shifted left by the scale, except for scales of 1 and 2 where
+// it must be tweaked in order to pass the double-shadow test
+// (see the "shadow(shadow)" comments below):
+//   scale == 0: 0x00001300'00000000
+//   scale == 1: 0x00002200'00000000
+//   scale == 2: 0x00004400'00000000
+//   scale >= 3: (0x00001300'00000000 << scale)
+//
+// Do not pass in the open-ended end value to the formula as it will fail.
+//
+// The resulting shadow memory regions for a 0 scaling are:
+//
+// [0x00001300'00000000, 0x00001400'00000000)
+// [0x00001800'00000000, 0x00001a00'00000000)
+// [0x00002100'00000000, 0x000022ff'ff600000)
+// [0x000022ff'ff601000, 0x00002300'00000000)
+// [0x000022ff'ff600000, 0x000022ff'ff601000]
+//
+// We also want to ensure that a wild access by the application into the shadow
+// regions will not corrupt our own shadow memory.  shadow(shadow) ends up
+// disjoint from shadow(app):
+//
+// [0x00001600'00000000, 0x00001700'00000000)
+// [0x00001b00'00000000, 0x00001d00'00000000)
+// [0x00001400'00000000, 0x000015ff'ff600000]
+// [0x000015ff'ff601000, 0x00001600'00000000]
+// [0x000015ff'ff600000, 0x000015ff'ff601000]
+
+struct ApplicationRegion {
+  uptr Start;
+  uptr End;
+  bool ShadowMergedWithPrev;
+};
+
+static const struct ApplicationRegion AppRegions[] = {
+  {0x0000000000000000ull, 0x0000010000000000u, false},
+  {0x0000550000000000u,   0x0000570000000000u, false},
+  // We make one shadow mapping to hold the shadow regions for all 3 of these
+  // app regions, as the mappings interleave, and the gap between the 3rd and
+  // 4th scales down below a page.
+  {0x00007e0000000000u,   0x00007fffff600000u, false},
+  {0x00007fffff601000u,   0x0000800000000000u, true},
+  {0xffffffffff600000u,   0xffffffffff601000u, true},
+};
+static const u32 NumAppRegions = sizeof(AppRegions)/sizeof(AppRegions[0]);
+
+// See the comment above: we do not currently support a stack size rlimit
+// equal to or larger than 1TB.
+static const uptr MaxStackSize = (1ULL << 40) - 4096;
+
+class ShadowMapping {
+public:
+  static const uptr Mask = 0x00000fffffffffffu;
+  // The scale and offset vary by tool.
+  uptr Scale;
+  uptr Offset;
+  void initialize(uptr ShadowScale) {
+    static const uptr OffsetArray[3] = {
+        0x0000130000000000u,
+        0x0000220000000000u,
+        0x0000440000000000u,
+    };
+    Scale = ShadowScale;
+    if (Scale <= 2)
+      Offset = OffsetArray[Scale];
+    else
+      Offset = OffsetArray[0] << Scale;
+  }
+};
+extern ShadowMapping Mapping;
+#else
+// We'll want to use templatized functions over the ShadowMapping once
+// we support more platforms.
+#error Platform not supported
+#endif
+
+static inline bool getAppRegion(u32 i, uptr *Start, uptr *End) {
+  if (i >= NumAppRegions)
+    return false;
+  *Start = AppRegions[i].Start;
+  *End = AppRegions[i].End;
+  return true;
+}
+
+ALWAYS_INLINE
+bool isAppMem(uptr Mem) {
+  for (u32 i = 0; i < NumAppRegions; ++i) {
+    if (Mem >= AppRegions[i].Start && Mem < AppRegions[i].End)
+      return true;
+  }
+  return false;
+}
+
+ALWAYS_INLINE
+uptr appToShadow(uptr App) {
+  return (((App & ShadowMapping::Mask) + Mapping.Offset) >> Mapping.Scale);
+}
+
+static inline bool getShadowRegion(u32 i, uptr *Start, uptr *End) {
+  if (i >= NumAppRegions)
+    return false;
+  u32 UnmergedShadowCount = 0;
+  u32 AppIdx;
+  for (AppIdx = 0; AppIdx < NumAppRegions; ++AppIdx) {
+    if (!AppRegions[AppIdx].ShadowMergedWithPrev) {
+      if (UnmergedShadowCount == i)
+        break;
+      UnmergedShadowCount++;
+    }
+  }
+  if (AppIdx >= NumAppRegions || UnmergedShadowCount != i)
+    return false;
+  *Start = appToShadow(AppRegions[AppIdx].Start);
+  // The formula fails for the end itself.
+  *End = appToShadow(AppRegions[AppIdx].End - 1) + 1;
+  // Merge with adjacent shadow regions:
+  for (++AppIdx; AppIdx < NumAppRegions; ++AppIdx) {
+    if (!AppRegions[AppIdx].ShadowMergedWithPrev)
+      break;
+    *Start = Min(*Start, appToShadow(AppRegions[AppIdx].Start));
+    *End = Max(*End, appToShadow(AppRegions[AppIdx].End - 1) + 1);
+  }
+  return true;
+}
+
+ALWAYS_INLINE
+bool isShadowMem(uptr Mem) {
+  // We assume this is not used on any critical performance path and so there's
+  // no need to hardcode the mapping results.
+  for (uptr i = 0; i < NumAppRegions; ++i) {
+    if (Mem >= appToShadow(AppRegions[i].Start) &&
+        Mem < appToShadow(AppRegions[i].End - 1) + 1)
+      return true;
+  }
+  return false;
+}
+
+} // namespace __esan
+
+#endif /* ESAN_SHADOW_H */
diff --git a/lib/esan/esan_sideline.h b/lib/esan/esan_sideline.h
new file mode 100644
index 0000000..aa3fae1
--- /dev/null
+++ b/lib/esan/esan_sideline.h
@@ -0,0 +1,61 @@
+//===-- esan_sideline.h -----------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of EfficiencySanitizer, a family of performance tuners.
+//
+// Esan sideline thread support.
+//===----------------------------------------------------------------------===//
+
+#ifndef ESAN_SIDELINE_H
+#define ESAN_SIDELINE_H
+
+#include "sanitizer_common/sanitizer_atomic.h"
+#include "sanitizer_common/sanitizer_internal_defs.h"
+
+namespace __esan {
+
+typedef void (*SidelineFunc)(void *Arg);
+
+// Currently only one sideline thread is supported.
+// It calls the SidelineFunc passed to launchThread once on each sample at the
+// given frequency in real time (i.e., wall clock time).
+class SidelineThread {
+public:
+  // We cannot initialize any fields in the constructor as it will be called
+  // *after* launchThread for a static instance, as esan.module_ctor is called
+  // before static initializers.
+  SidelineThread() {}
+  ~SidelineThread() {}
+
+  // To simplify declaration in sanitizer code where we want to avoid
+  // heap allocations, the constructor and destructor do nothing and
+  // launchThread and joinThread do the real work.
+  // They should each be called just once.
+  bool launchThread(SidelineFunc takeSample, void *Arg, u32 FreqMilliSec);
+  bool joinThread();
+
+  // Must be called from the sideline thread itself.
+  bool adjustTimer(u32 FreqMilliSec);
+
+private:
+  static int runSideline(void *Arg);
+  static void registerSignal(int SigNum);
+  static void handleSidelineSignal(int SigNum, void *SigInfo, void *Ctx);
+
+  char *Stack;
+  SidelineFunc sampleFunc;
+  void *FuncArg;
+  u32 Freq;
+  uptr SidelineId;
+  atomic_uintptr_t SidelineExit;
+};
+
+} // namespace __esan
+
+#endif  // ESAN_SIDELINE_H
diff --git a/lib/esan/esan_sideline_linux.cpp b/lib/esan/esan_sideline_linux.cpp
new file mode 100644
index 0000000..ba4fe62
--- /dev/null
+++ b/lib/esan/esan_sideline_linux.cpp
@@ -0,0 +1,170 @@
+//===-- esan_sideline_linux.cpp ---------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of EfficiencySanitizer, a family of performance tuners.
+//
+// Support for a separate or "sideline" tool thread on Linux.
+//===----------------------------------------------------------------------===//
+
+#include "sanitizer_common/sanitizer_platform.h"
+#if SANITIZER_LINUX
+
+#include "esan_sideline.h"
+#include "sanitizer_common/sanitizer_atomic.h"
+#include "sanitizer_common/sanitizer_common.h"
+#include "sanitizer_common/sanitizer_linux.h"
+#include <errno.h>
+#include <sched.h>
+#include <sys/prctl.h>
+#include <sys/signal.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+
+namespace __esan {
+
+static const int SigAltStackSize = 4*1024;
+static const int SidelineStackSize = 4*1024;
+
+// FIXME: we'll need some kind of TLS (can we trust that a pthread key will
+// work in our non-POSIX thread?) to access our data in our signal handler
+// with multiple sideline threads.  For now we assume there is only one
+// sideline thread and we use a dirty solution of a global var.
+static SidelineThread *TheThread;
+
+// We aren't passing SA_NODEFER so the same signal is blocked while here.
+void SidelineThread::handleSidelineSignal(int SigNum, void *SigInfo,
+                                          void *Ctx) {
+  VPrintf(3, "Sideline signal %d\n", SigNum);
+  CHECK_EQ(SigNum, SIGALRM);
+  // See above about needing TLS to avoid this global var.
+  SidelineThread *Thread = TheThread;
+  if (atomic_load(&Thread->SidelineExit, memory_order_relaxed) != 0)
+    return;
+  Thread->sampleFunc(Thread->FuncArg);
+}
+
+void SidelineThread::registerSignal(int SigNum) {
+  __sanitizer_sigaction SigAct;
+  internal_memset(&SigAct, 0, sizeof(SigAct));
+  SigAct.sigaction = handleSidelineSignal;
+  // We do not pass SA_NODEFER as we want to block the same signal.
+  SigAct.sa_flags = SA_ONSTACK | SA_SIGINFO;
+  int Res = internal_sigaction(SigNum, &SigAct, nullptr);
+  CHECK_EQ(Res, 0);
+}
+
+int SidelineThread::runSideline(void *Arg) {
+  VPrintf(1, "Sideline thread starting\n");
+  SidelineThread *Thread = static_cast<SidelineThread*>(Arg);
+
+  // If the parent dies, we want to exit also.
+  internal_prctl(PR_SET_PDEATHSIG, SIGKILL, 0, 0, 0);
+
+  // Set up a signal handler on an alternate stack for safety.
+  InternalScopedBuffer<char> StackMap(SigAltStackSize);
+  struct sigaltstack SigAltStack;
+  SigAltStack.ss_sp = StackMap.data();
+  SigAltStack.ss_size = SigAltStackSize;
+  SigAltStack.ss_flags = 0;
+  internal_sigaltstack(&SigAltStack, nullptr);
+
+  // We inherit the signal mask from the app thread.  In case
+  // we weren't created at init time, we ensure the mask is empty.
+  __sanitizer_sigset_t SigSet;
+  internal_sigfillset(&SigSet);
+  int Res = internal_sigprocmask(SIG_UNBLOCK, &SigSet, nullptr);
+  CHECK_EQ(Res, 0);
+
+  registerSignal(SIGALRM);
+
+  bool TimerSuccess = Thread->adjustTimer(Thread->Freq);
+  CHECK(TimerSuccess);
+
+  // We loop, doing nothing but handling itimer signals.
+  while (atomic_load(&TheThread->SidelineExit, memory_order_relaxed) == 0)
+    sched_yield();
+
+  if (!Thread->adjustTimer(0))
+    VPrintf(1, "Failed to disable timer\n");
+
+  VPrintf(1, "Sideline thread exiting\n");
+  return 0;
+}
+
+bool SidelineThread::launchThread(SidelineFunc takeSample, void *Arg,
+                                  u32 FreqMilliSec) {
+  // This can only be called once.  However, we can't clear a field in
+  // the constructor and check for that here as the constructor for
+  // a static instance is called *after* our module_ctor and thus after
+  // this routine!  Thus we rely on the TheThread check below.
+  CHECK(TheThread == nullptr); // Only one sideline thread is supported.
+  TheThread = this;
+  sampleFunc = takeSample;
+  FuncArg = Arg;
+  Freq = FreqMilliSec;
+  atomic_store(&SidelineExit, 0, memory_order_relaxed);
+
+  // We do without a guard page.
+  Stack = static_cast<char*>(MmapOrDie(SidelineStackSize, "SidelineStack"));
+  // By omitting CLONE_THREAD, the child is in its own thread group and will not
+  // receive any of the application's signals.
+  SidelineId = internal_clone(
+      runSideline, Stack + SidelineStackSize,
+      CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_UNTRACED,
+      this, nullptr /* parent_tidptr */,
+      nullptr /* newtls */, nullptr /* child_tidptr */);
+  int ErrCode;
+  if (internal_iserror(SidelineId, &ErrCode)) {
+    Printf("FATAL: EfficiencySanitizer failed to spawn a thread (code %d).\n",
+           ErrCode);
+    Die();
+    return false; // Not reached.
+  }
+  return true;
+}
+
+bool SidelineThread::joinThread() {
+  VPrintf(1, "Joining sideline thread\n");
+  bool Res = true;
+  atomic_store(&SidelineExit, 1, memory_order_relaxed);
+  while (true) {
+    uptr Status = internal_waitpid(SidelineId, nullptr, __WALL);
+    int ErrCode;
+    if (!internal_iserror(Status, &ErrCode))
+      break;
+    if (ErrCode == EINTR)
+      continue;
+    VPrintf(1, "Failed to join sideline thread (errno %d)\n", ErrCode);
+    Res = false;
+    break;
+  }
+  UnmapOrDie(Stack, SidelineStackSize);
+  return Res;
+}
+
+// Must be called from the sideline thread itself.
+bool SidelineThread::adjustTimer(u32 FreqMilliSec) {
+  CHECK(internal_getpid() == SidelineId);
+  Freq = FreqMilliSec;
+  struct itimerval TimerVal;
+  TimerVal.it_interval.tv_sec = (time_t) Freq / 1000;
+  TimerVal.it_interval.tv_usec = (time_t) (Freq % 1000) * 1000;
+  TimerVal.it_value.tv_sec = (time_t) Freq / 1000;
+  TimerVal.it_value.tv_usec = (time_t) (Freq % 1000) * 1000;
+  // As we're in a different thread group, we cannot use either
+  // ITIMER_PROF or ITIMER_VIRTUAL without taking up scheduled
+  // time ourselves: thus we must use real time.
+  int Res = setitimer(ITIMER_REAL, &TimerVal, nullptr);
+  return (Res == 0);
+}
+
+} // namespace __esan
+
+#endif // SANITIZER_LINUX
diff --git a/lib/esan/working_set.cpp b/lib/esan/working_set.cpp
new file mode 100644
index 0000000..3fde5a8
--- /dev/null
+++ b/lib/esan/working_set.cpp
@@ -0,0 +1,274 @@
+//===-- working_set.cpp ---------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of EfficiencySanitizer, a family of performance tuners.
+//
+// This file contains working-set-specific code.
+//===----------------------------------------------------------------------===//
+
+#include "working_set.h"
+#include "esan.h"
+#include "esan_circular_buffer.h"
+#include "esan_flags.h"
+#include "esan_shadow.h"
+#include "esan_sideline.h"
+#include "sanitizer_common/sanitizer_procmaps.h"
+
+// We shadow every cache line of app memory with one shadow byte.
+// - The highest bit of each shadow byte indicates whether the corresponding
+//   cache line has ever been accessed.
+// - The lowest bit of each shadow byte indicates whether the corresponding
+//   cache line was accessed since the last sample.
+// - The other bits are used for working set snapshots at successively
+//   lower frequencies, each bit to the left from the lowest bit stepping
+//   down the frequency by 2 to the power of getFlags()->snapshot_step.
+// Thus we have something like this:
+//   Bit 0: Since last sample
+//   Bit 1: Since last 2^2 samples
+//   Bit 2: Since last 2^4 samples
+//   Bit 3: ...
+//   Bit 7: Ever accessed.
+// We live with races in accessing each shadow byte.
+typedef unsigned char byte;
+
+namespace __esan {
+
+// Our shadow memory assumes that the line size is 64.
+static const u32 CacheLineSize = 64;
+
+// See the shadow byte layout description above.
+static const u32 TotalWorkingSetBitIdx = 7;
+// We accumulate to the left until we hit this bit.
+// We don't need to accumulate to the final bit as it's set on each ref
+// by the compiler instrumentation.
+static const u32 MaxAccumBitIdx = 6;
+static const u32 CurWorkingSetBitIdx = 0;
+static const byte ShadowAccessedVal =
+  (1 << TotalWorkingSetBitIdx) | (1 << CurWorkingSetBitIdx);
+
+static SidelineThread Thread;
+// If we use real-time-based timer samples this won't overflow in any realistic
+// scenario, but if we switch to some other unit (such as memory accesses) we
+// may want to consider a 64-bit int.
+static u32 SnapshotNum;
+
+// We store the wset size for each of 8 different sampling frequencies.
+static const u32 NumFreq = 8; // One for each bit of our shadow bytes.
+// We cannot use static objects as the global destructor is called
+// prior to our finalize routine.
+// These are each circular buffers, sized up front.
+CircularBuffer<u32> SizePerFreq[NumFreq];
+// We cannot rely on static initializers (they may run too late) but
+// we record the size here for clarity:
+u32 CircularBufferSizes[NumFreq] = {
+  // These are each mmap-ed so our minimum is one page.
+  32*1024,
+  16*1024,
+  8*1024,
+  4*1024,
+  4*1024,
+  4*1024,
+  4*1024,
+  4*1024,
+};
+
+void processRangeAccessWorkingSet(uptr PC, uptr Addr, SIZE_T Size,
+                                  bool IsWrite) {
+  if (Size == 0)
+    return;
+  SIZE_T I = 0;
+  uptr LineSize = getFlags()->cache_line_size;
+  // As Addr+Size could overflow at the top of a 32-bit address space,
+  // we avoid the simpler formula that rounds the start and end.
+  SIZE_T NumLines = Size / LineSize +
+    // Add any extra at the start or end adding on an extra line:
+    (LineSize - 1 + Addr % LineSize + Size % LineSize) / LineSize;
+  byte *Shadow = (byte *)appToShadow(Addr);
+  // Write shadow bytes until we're word-aligned.
+  while (I < NumLines && (uptr)Shadow % 4 != 0) {
+    if ((*Shadow & ShadowAccessedVal) != ShadowAccessedVal)
+      *Shadow |= ShadowAccessedVal;
+    ++Shadow;
+    ++I;
+  }
+  // Write whole shadow words at a time.
+  // Using a word-stride loop improves the runtime of a microbenchmark of
+  // memset calls by 10%.
+  u32 WordValue = ShadowAccessedVal | ShadowAccessedVal << 8 |
+    ShadowAccessedVal << 16 | ShadowAccessedVal << 24;
+  while (I + 4 <= NumLines) {
+    if ((*(u32*)Shadow & WordValue) != WordValue)
+      *(u32*)Shadow |= WordValue;
+    Shadow += 4;
+    I += 4;
+  }
+  // Write any trailing shadow bytes.
+  while (I < NumLines) {
+    if ((*Shadow & ShadowAccessedVal) != ShadowAccessedVal)
+      *Shadow |= ShadowAccessedVal;
+    ++Shadow;
+    ++I;
+  }
+}
+
+// This routine will word-align ShadowStart and ShadowEnd prior to scanning.
+// It does *not* clear for BitIdx==TotalWorkingSetBitIdx, as that top bit
+// measures the access during the entire execution and should never be cleared.
+static u32 countAndClearShadowValues(u32 BitIdx, uptr ShadowStart,
+                                     uptr ShadowEnd) {
+  u32 WorkingSetSize = 0;
+  u32 ByteValue = 0x1 << BitIdx;
+  u32 WordValue = ByteValue | ByteValue << 8 | ByteValue << 16 |
+    ByteValue << 24;
+  // Get word aligned start.
+  ShadowStart = RoundDownTo(ShadowStart, sizeof(u32));
+  bool Accum = getFlags()->record_snapshots && BitIdx < MaxAccumBitIdx;
+  // Do not clear the bit that measures access during the entire execution.
+  bool Clear = BitIdx < TotalWorkingSetBitIdx;
+  for (u32 *Ptr = (u32 *)ShadowStart; Ptr < (u32 *)ShadowEnd; ++Ptr) {
+    if ((*Ptr & WordValue) != 0) {
+      byte *BytePtr = (byte *)Ptr;
+      for (u32 j = 0; j < sizeof(u32); ++j) {
+        if (BytePtr[j] & ByteValue) {
+          ++WorkingSetSize;
+          if (Accum) {
+            // Accumulate to the lower-frequency bit to the left.
+            BytePtr[j] |= (ByteValue << 1);
+          }
+        }
+      }
+      if (Clear) {
+        // Clear this bit from every shadow byte.
+        *Ptr &= ~WordValue;
+      }
+    }
+  }
+  return WorkingSetSize;
+}
+
+// Scan shadow memory to calculate the number of cache lines being accessed,
+// i.e., the number of non-zero bits indexed by BitIdx in each shadow byte.
+// We also clear the lowest bits (most recent working set snapshot).
+// We do *not* clear for BitIdx==TotalWorkingSetBitIdx, as that top bit
+// measures the access during the entire execution and should never be cleared.
+static u32 computeWorkingSizeAndReset(u32 BitIdx) {
+  u32 WorkingSetSize = 0;
+  MemoryMappingLayout MemIter(true/*cache*/);
+  uptr Start, End, Prot;
+  while (MemIter.Next(&Start, &End, nullptr/*offs*/, nullptr/*file*/,
+                      0/*file size*/, &Prot)) {
+    VPrintf(4, "%s: considering %p-%p app=%d shadow=%d prot=%u\n",
+            __FUNCTION__, Start, End, Prot, isAppMem(Start),
+            isShadowMem(Start));
+    if (isShadowMem(Start) && (Prot & MemoryMappingLayout::kProtectionWrite)) {
+      VPrintf(3, "%s: walking %p-%p\n", __FUNCTION__, Start, End);
+      WorkingSetSize += countAndClearShadowValues(BitIdx, Start, End);
+    }
+  }
+  return WorkingSetSize;
+}
+
+// This is invoked from a signal handler but in a sideline thread doing nothing
+// else so it is a little less fragile than a typical signal handler.
+static void takeSample(void *Arg) {
+  u32 BitIdx = CurWorkingSetBitIdx;
+  u32 Freq = 1;
+  ++SnapshotNum; // Simpler to skip 0 whose mod matches everything.
+  while (BitIdx <= MaxAccumBitIdx && (SnapshotNum % Freq) == 0) {
+    u32 NumLines = computeWorkingSizeAndReset(BitIdx);
+    VReport(1, "%s: snapshot #%5d bit %d freq %4d: %8u\n", SanitizerToolName,
+            SnapshotNum, BitIdx, Freq, NumLines);
+    SizePerFreq[BitIdx].push_back(NumLines);
+    Freq = Freq << getFlags()->snapshot_step;
+    BitIdx++;
+  }
+}
+
+// Initialization that must be done before any instrumented code is executed.
+void initializeShadowWorkingSet() {
+  CHECK(getFlags()->cache_line_size == CacheLineSize);
+  registerMemoryFaultHandler();
+}
+
+void initializeWorkingSet() {
+  if (getFlags()->record_snapshots) {
+    for (u32 i = 0; i < NumFreq; ++i)
+      SizePerFreq[i].initialize(CircularBufferSizes[i]);
+    Thread.launchThread(takeSample, nullptr, getFlags()->sample_freq);
+  }
+}
+
+static u32 getPeriodForPrinting(u32 MilliSec, const char *&Unit) {
+  if (MilliSec > 600000) {
+    Unit = "min";
+    return MilliSec / 60000;
+  } else if (MilliSec > 10000) {
+    Unit = "sec";
+    return MilliSec / 1000;
+  } else {
+    Unit = "ms";
+    return MilliSec;
+  }
+}
+
+static u32 getSizeForPrinting(u32 NumOfCachelines, const char *&Unit) {
+  // We need a constant to avoid software divide support:
+  static const u32 KilobyteCachelines = (0x1 << 10) / CacheLineSize;
+  static const u32 MegabyteCachelines = KilobyteCachelines << 10;
+
+  if (NumOfCachelines > 10 * MegabyteCachelines) {
+    Unit = "MB";
+    return NumOfCachelines / MegabyteCachelines;
+  } else if (NumOfCachelines > 10 * KilobyteCachelines) {
+    Unit = "KB";
+    return NumOfCachelines / KilobyteCachelines;
+  } else {
+    Unit = "Bytes";
+    return NumOfCachelines * CacheLineSize;
+  }
+}
+
+void reportWorkingSet() {
+  const char *Unit;
+  if (getFlags()->record_snapshots) {
+    u32 Freq = 1;
+    Report(" Total number of samples: %u\n", SnapshotNum);
+    for (u32 i = 0; i < NumFreq; ++i) {
+      u32 Time = getPeriodForPrinting(getFlags()->sample_freq*Freq, Unit);
+      Report(" Samples array #%d at period %u %s\n", i, Time, Unit);
+      // FIXME: report whether we wrapped around and thus whether we
+      // have data on the whole run or just the last N samples.
+      for (u32 j = 0; j < SizePerFreq[i].size(); ++j) {
+        u32 Size = getSizeForPrinting(SizePerFreq[i][j], Unit);
+        Report("#%4d: %8u %s (%9u cache lines)\n", j, Size, Unit,
+               SizePerFreq[i][j]);
+      }
+      Freq = Freq << getFlags()->snapshot_step;
+    }
+  }
+
+  // Get the working set size for the entire execution.
+  u32 NumOfCachelines = computeWorkingSizeAndReset(TotalWorkingSetBitIdx);
+  u32 Size = getSizeForPrinting(NumOfCachelines, Unit);
+  Report(" %s: the total working set size: %u %s (%u cache lines)\n",
+         SanitizerToolName, Size, Unit, NumOfCachelines);
+}
+
+int finalizeWorkingSet() {
+  if (getFlags()->record_snapshots)
+    Thread.joinThread();
+  reportWorkingSet();
+  if (getFlags()->record_snapshots) {
+    for (u32 i = 0; i < NumFreq; ++i)
+      SizePerFreq[i].free();
+  }
+  return 0;
+}
+
+} // namespace __esan
diff --git a/lib/esan/working_set.h b/lib/esan/working_set.h
new file mode 100644
index 0000000..38ff063
--- /dev/null
+++ b/lib/esan/working_set.h
@@ -0,0 +1,39 @@
+//===-- working_set.h -------------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of EfficiencySanitizer, a family of performance tuners.
+//
+// Header for working-set-specific code.
+//===----------------------------------------------------------------------===//
+
+#ifndef WORKING_SET_H
+#define WORKING_SET_H
+
+#include "interception/interception.h"
+#include "sanitizer_common/sanitizer_internal_defs.h"
+
+namespace __esan {
+
+void initializeWorkingSet();
+void initializeShadowWorkingSet();
+int finalizeWorkingSet();
+void reportWorkingSet();
+void processRangeAccessWorkingSet(uptr PC, uptr Addr, SIZE_T Size,
+                                  bool IsWrite);
+
+// Platform-dependent.
+void registerMemoryFaultHandler();
+bool processWorkingSetSignal(int SigNum, void (*Handler)(int),
+                             void (**Result)(int));
+bool processWorkingSetSigaction(int SigNum, const void *Act, void *OldAct);
+bool processWorkingSetSigprocmask(int How, void *Set, void *OldSet);
+
+} // namespace __esan
+
+#endif // WORKING_SET_H
diff --git a/lib/esan/working_set_posix.cpp b/lib/esan/working_set_posix.cpp
new file mode 100644
index 0000000..fcfa871
--- /dev/null
+++ b/lib/esan/working_set_posix.cpp
@@ -0,0 +1,133 @@
+//===-- working_set_posix.cpp -----------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of EfficiencySanitizer, a family of performance tuners.
+//
+// POSIX-specific working set tool code.
+//===----------------------------------------------------------------------===//
+
+#include "working_set.h"
+#include "esan_flags.h"
+#include "esan_shadow.h"
+#include "sanitizer_common/sanitizer_common.h"
+#include "sanitizer_common/sanitizer_linux.h"
+#include <signal.h>
+#include <sys/mman.h>
+
+namespace __esan {
+
+// We only support regular POSIX threads with a single signal handler
+// for the whole process == thread group.
+// Thus we only need to store one app signal handler.
+// FIXME: Store and use any alternate stack and signal flags set by
+// the app.  For now we just call the app handler from our handler.
+static __sanitizer_sigaction AppSigAct;
+
+bool processWorkingSetSignal(int SigNum, void (*Handler)(int),
+                             void (**Result)(int)) {
+  VPrintf(2, "%s: %d\n", __FUNCTION__, SigNum);
+  if (SigNum == SIGSEGV) {
+    *Result = AppSigAct.handler;
+    AppSigAct.sigaction = (void (*)(int, void*, void*))Handler;
+    return false; // Skip real call.
+  }
+  return true;
+}
+
+bool processWorkingSetSigaction(int SigNum, const void *ActVoid,
+                                void *OldActVoid) {
+  VPrintf(2, "%s: %d\n", __FUNCTION__, SigNum);
+  if (SigNum == SIGSEGV) {
+    const struct sigaction *Act = (const struct sigaction *) ActVoid;
+    struct sigaction *OldAct = (struct sigaction *) OldActVoid;
+    if (OldAct)
+      internal_memcpy(OldAct, &AppSigAct, sizeof(OldAct));
+    if (Act)
+      internal_memcpy(&AppSigAct, Act, sizeof(AppSigAct));
+    return false; // Skip real call.
+  }
+  return true;
+}
+
+bool processWorkingSetSigprocmask(int How, void *Set, void *OldSet) {
+  VPrintf(2, "%s\n", __FUNCTION__);
+  // All we need to do is ensure that SIGSEGV is not blocked.
+  // FIXME: we are not fully transparent as we do not pretend that
+  // SIGSEGV is still blocked on app queries: that would require
+  // per-thread mask tracking.
+  if (Set && (How == SIG_BLOCK || How == SIG_SETMASK)) {
+    if (internal_sigismember((__sanitizer_sigset_t *)Set, SIGSEGV)) {
+      VPrintf(1, "%s: removing SIGSEGV from the blocked set\n", __FUNCTION__);
+      internal_sigdelset((__sanitizer_sigset_t *)Set, SIGSEGV);
+    }
+  }
+  return true;
+}
+
+static void reinstateDefaultHandler(int SigNum) {
+  __sanitizer_sigaction SigAct;
+  internal_memset(&SigAct, 0, sizeof(SigAct));
+  SigAct.sigaction = (void (*)(int, void*, void*)) SIG_DFL;
+  int Res = internal_sigaction(SigNum, &SigAct, nullptr);
+  CHECK(Res == 0);
+  VPrintf(1, "Unregistered for %d handler\n", SigNum);
+}
+
+// If this is a shadow fault, we handle it here; otherwise, we pass it to the
+// app to handle it just as the app would do without our tool in place.
+static void handleMemoryFault(int SigNum, void *Info, void *Ctx) {
+  if (SigNum == SIGSEGV) {
+    // We rely on si_addr being filled in (thus we do not support old kernels).
+    siginfo_t *SigInfo = (siginfo_t *)Info;
+    uptr Addr = (uptr)SigInfo->si_addr;
+    if (isShadowMem(Addr)) {
+      VPrintf(3, "Shadow fault @%p\n", Addr);
+      uptr PageSize = GetPageSizeCached();
+      int Res = internal_mprotect((void *)RoundDownTo(Addr, PageSize),
+                                  PageSize, PROT_READ|PROT_WRITE);
+      CHECK(Res == 0);
+    } else if (AppSigAct.sigaction) {
+      // FIXME: For simplicity we ignore app options including its signal stack
+      // (we just use ours) and all the delivery flags.
+      AppSigAct.sigaction(SigNum, Info, Ctx);
+    } else {
+      // Crash instead of spinning with infinite faults.
+      reinstateDefaultHandler(SigNum);
+    }
+  } else
+    UNREACHABLE("signal not registered");
+}
+
+void registerMemoryFaultHandler() {
+  // We do not use an alternate signal stack, as doing so would require
+  // setting it up for each app thread.
+  // FIXME: This could result in problems with emulating the app's signal
+  // handling if the app relies on an alternate stack for SIGSEGV.
+
+  // We require that SIGSEGV is not blocked.  We use a sigprocmask
+  // interceptor to ensure that in the future.  Here we ensure it for
+  // the current thread.  We assume there are no other threads at this
+  // point during initialization, or that at least they do not block
+  // SIGSEGV.
+  __sanitizer_sigset_t SigSet;
+  internal_sigemptyset(&SigSet);
+  internal_sigprocmask(SIG_BLOCK, &SigSet, nullptr);
+
+  __sanitizer_sigaction SigAct;
+  internal_memset(&SigAct, 0, sizeof(SigAct));
+  SigAct.sigaction = handleMemoryFault;
+  // We want to handle nested signals b/c we need to handle a
+  // shadow fault in an app signal handler.
+  SigAct.sa_flags = SA_SIGINFO | SA_NODEFER;
+  int Res = internal_sigaction(SIGSEGV, &SigAct, &AppSigAct);
+  CHECK(Res == 0);
+  VPrintf(1, "Registered for SIGSEGV handler\n");
+}
+
+} // namespace __esan
diff --git a/lib/interception/CMakeLists.txt b/lib/interception/CMakeLists.txt
index 16b41c9..18d2594 100644
--- a/lib/interception/CMakeLists.txt
+++ b/lib/interception/CMakeLists.txt
@@ -10,10 +10,14 @@
 include_directories(..)
 
 set(INTERCEPTION_CFLAGS ${SANITIZER_COMMON_CFLAGS})
-append_no_rtti_flag(INTERCEPTION_CFLAGS)
+append_rtti_flag(OFF INTERCEPTION_CFLAGS)
 
 add_compiler_rt_object_libraries(RTInterception
     OS ${SANITIZER_COMMON_SUPPORTED_OS}
     ARCHS ${SANITIZER_COMMON_SUPPORTED_ARCH}
     SOURCES ${INTERCEPTION_SOURCES}
     CFLAGS ${INTERCEPTION_CFLAGS})
+
+if(COMPILER_RT_INCLUDE_TESTS)
+  add_subdirectory(tests)
+endif()
diff --git a/lib/interception/Makefile.mk b/lib/interception/Makefile.mk
deleted file mode 100644
index 88aa6cb..0000000
--- a/lib/interception/Makefile.mk
+++ /dev/null
@@ -1,23 +0,0 @@
-#===- lib/interception/Makefile.mk -------------------------*- Makefile -*--===#
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-#===------------------------------------------------------------------------===#
-
-ModuleName := interception
-SubDirs :=
-
-Sources := $(foreach file,$(wildcard $(Dir)/*.cc),$(notdir $(file)))
-ObjNames := $(Sources:%.cc=%.o)
-
-Implementation := Generic
-
-# FIXME: use automatic dependencies?
-Dependencies := $(wildcard $(Dir)/*.h)
-Dependencies += $(wildcard $(Dir)/../sanitizer_common/*.h)
-
-# Define a convenience variable for all the interception functions.
-InterceptionFunctions := $(Sources:%.cc=%)
diff --git a/lib/interception/interception_win.cc b/lib/interception/interception_win.cc
index 4c04c83..1db8ac4 100644
--- a/lib/interception/interception_win.cc
+++ b/lib/interception/interception_win.cc
@@ -10,16 +10,160 @@
 // This file is a part of AddressSanitizer, an address sanity checker.
 //
 // Windows-specific interception methods.
+//
+// This file is implementing several hooking techniques to intercept calls
+// to functions. The hooks are dynamically installed by modifying the assembly
+// code.
+//
+// The hooking techniques are making assumptions on the way the code is
+// generated and are safe under these assumptions.
+//
+// On 64-bit architecture, there is no direct 64-bit jump instruction. To allow
+// arbitrary branching on the whole memory space, the notion of trampoline
+// region is used. A trampoline region is a memory space within a 2G boundary
+// where it is safe to add custom assembly code to build 64-bit jumps.
+//
+// Hooking techniques
+// ==================
+//
+// 1) Detour
+//
+//    The Detour hooking technique assumes the presence of a header with
+//    padding and an overridable 2-bytes nop instruction (mov edi, edi). The
+//    nop instruction can safely be replaced by a 2-bytes jump without any need
+//    to save the instruction. A jump to the target is encoded in the function
+//    header and the nop instruction is replaced by a short jump to the header.
+//
+//        head:  5 x nop                 head:  jmp <hook>
+//        func:  mov edi, edi    -->     func:  jmp short <head>
+//               [...]                   real:  [...]
+//
+//    This technique is only implemented on 32-bit architecture.
+//    Most of the time, Windows APIs are hookable with the detour technique.
+//
+// 2) Redirect Jump
+//
+//    The redirect jump is applicable when the first instruction is a direct
+//    jump. The instruction is replaced by jump to the hook.
+//
+//        func:  jmp <label>     -->     func:  jmp <hook>
+//
+//    On a 64-bit architecture, a trampoline is inserted.
+//
+//        func:  jmp <label>     -->     func:  jmp <tramp>
+//                                              [...]
+//
+//                                   [trampoline]
+//                                      tramp:  jmp QWORD [addr]
+//                                       addr:  .bytes <hook>
+//
+//    Note: <real> is equivalent to <label>.
+//
+// 3) HotPatch
+//
+//    The HotPatch hooking assumes the presence of a header with padding
+//    and a first instruction with at least 2-bytes.
+//
+//    The reason to enforce the 2-bytes limitation is to provide the minimal
+//    space to encode a short jump. HotPatch technique is only rewriting one
+//    instruction to avoid breaking a sequence of instructions containing a
+//    branching target.
+//
+//    Assumptions are enforced by MSVC compiler by using the /HOTPATCH flag.
+//      see: https://msdn.microsoft.com/en-us/library/ms173507.aspx
+//    Default padding length is 5 bytes in 32-bits and 6 bytes in 64-bits.
+//
+//        head:   5 x nop                head:  jmp <hook>
+//        func:   <instr>        -->     func:  jmp short <head>
+//                [...]                  body:  [...]
+//
+//                                   [trampoline]
+//                                       real:  <instr>
+//                                              jmp <body>
+//
+//    On a 64-bit architecture:
+//
+//        head:   6 x nop                head:  jmp QWORD [addr1]
+//        func:   <instr>        -->     func:  jmp short <head>
+//                [...]                  body:  [...]
+//
+//                                   [trampoline]
+//                                      addr1:  .bytes <hook>
+//                                       real:  <instr>
+//                                              jmp QWORD [addr2]
+//                                      addr2:  .bytes <body>
+//
+// 4) Trampoline
+//
+//    The Trampoline hooking technique is the most aggressive one. It is
+//    assuming that there is a sequence of instructions that can be safely
+//    replaced by a jump (enough room and no incoming branches).
+//
+//    Unfortunately, these assumptions can't be safely presumed and code may
+//    be broken after hooking.
+//
+//        func:   <instr>        -->     func:  jmp <hook>
+//                <instr>
+//                [...]                  body:  [...]
+//
+//                                   [trampoline]
+//                                       real:  <instr>
+//                                              <instr>
+//                                              jmp <body>
+//
+//    On a 64-bit architecture:
+//
+//        func:   <instr>        -->     func:  jmp QWORD [addr1]
+//                <instr>
+//                [...]                  body:  [...]
+//
+//                                   [trampoline]
+//                                      addr1:  .bytes <hook>
+//                                       real:  <instr>
+//                                              <instr>
+//                                              jmp QWORD [addr2]
+//                                      addr2:  .bytes <body>
 //===----------------------------------------------------------------------===//
 
 #ifdef _WIN32
 
 #include "interception.h"
+#include "sanitizer_common/sanitizer_platform.h"
 #define WIN32_LEAN_AND_MEAN
 #include <windows.h>
 
 namespace __interception {
 
+static const int kAddressLength = FIRST_32_SECOND_64(4, 8);
+static const int kJumpInstructionLength = 5;
+static const int kShortJumpInstructionLength = 2;
+static const int kIndirectJumpInstructionLength = 6;
+static const int kBranchLength =
+    FIRST_32_SECOND_64(kJumpInstructionLength, kIndirectJumpInstructionLength);
+static const int kDirectBranchLength = kBranchLength + kAddressLength;
+
+static void InterceptionFailed() {
+  // Do we have a good way to abort with an error message here?
+  __debugbreak();
+}
+
+static bool DistanceIsWithin2Gig(uptr from, uptr target) {
+  if (from < target)
+    return target - from <= (uptr)0x7FFFFFFFU;
+  else
+    return from - target <= (uptr)0x80000000U;
+}
+
+static uptr GetMmapGranularity() {
+  SYSTEM_INFO si;
+  GetSystemInfo(&si);
+  return si.dwAllocationGranularity;
+}
+
+static uptr RoundUpTo(uptr size, uptr boundary) {
+  return (size + boundary - 1) & ~(boundary - 1);
+}
+
 // FIXME: internal_str* and internal_mem* functions should be moved from the
 // ASan sources into interception/.
 
@@ -35,163 +179,587 @@
     dst_c[i] = src_c[i];
 }
 
-static void WriteJumpInstruction(char *jmp_from, char *to) {
-  // jmp XXYYZZWW = E9 WW ZZ YY XX, where XXYYZZWW is an offset fromt jmp_from
-  // to the next instruction to the destination.
-  ptrdiff_t offset = to - jmp_from - 5;
-  *jmp_from = '\xE9';
-  *(ptrdiff_t*)(jmp_from + 1) = offset;
+static bool ChangeMemoryProtection(
+    uptr address, uptr size, DWORD *old_protection) {
+  return ::VirtualProtect((void*)address, size,
+                          PAGE_EXECUTE_READWRITE,
+                          old_protection) != FALSE;
 }
 
-static char *GetMemoryForTrampoline(size_t size) {
-  // Trampolines are allocated from a common pool.
-  const int POOL_SIZE = 1024;
-  static char *pool = NULL;
-  static size_t pool_used = 0;
-  if (!pool) {
-    pool = (char *)VirtualAlloc(NULL, POOL_SIZE, MEM_RESERVE | MEM_COMMIT,
-                                PAGE_EXECUTE_READWRITE);
-    // FIXME: Might want to apply PAGE_EXECUTE_READ access after all the
-    // interceptors are in place.
-    if (!pool)
-      return NULL;
-    _memset(pool, 0xCC /* int 3 */, POOL_SIZE);
+static bool RestoreMemoryProtection(
+    uptr address, uptr size, DWORD old_protection) {
+  DWORD unused;
+  return ::VirtualProtect((void*)address, size,
+                          old_protection,
+                          &unused) != FALSE;
+}
+
+static bool IsMemoryPadding(uptr address, uptr size) {
+  u8* function = (u8*)address;
+  for (size_t i = 0; i < size; ++i)
+    if (function[i] != 0x90 && function[i] != 0xCC)
+      return false;
+  return true;
+}
+
+static const u8 kHintNop10Bytes[] = {
+  0x66, 0x66, 0x0F, 0x1F, 0x84,
+  0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+template<class T>
+static bool FunctionHasPrefix(uptr address, const T &pattern) {
+  u8* function = (u8*)address - sizeof(pattern);
+  for (size_t i = 0; i < sizeof(pattern); ++i)
+    if (function[i] != pattern[i])
+      return false;
+  return true;
+}
+
+static bool FunctionHasPadding(uptr address, uptr size) {
+  if (IsMemoryPadding(address - size, size))
+    return true;
+  if (size <= sizeof(kHintNop10Bytes) &&
+      FunctionHasPrefix(address, kHintNop10Bytes))
+    return true;
+  return false;
+}
+
+static void WritePadding(uptr from, uptr size) {
+  _memset((void*)from, 0xCC, (size_t)size);
+}
+
+static void CopyInstructions(uptr from, uptr to, uptr size) {
+  _memcpy((void*)from, (void*)to, (size_t)size);
+}
+
+static void WriteJumpInstruction(uptr from, uptr target) {
+  if (!DistanceIsWithin2Gig(from + kJumpInstructionLength, target))
+    InterceptionFailed();
+  ptrdiff_t offset = target - from - kJumpInstructionLength;
+  *(u8*)from = 0xE9;
+  *(u32*)(from + 1) = offset;
+}
+
+static void WriteShortJumpInstruction(uptr from, uptr target) {
+  sptr offset = target - from - kShortJumpInstructionLength;
+  if (offset < -128 || offset > 127)
+    InterceptionFailed();
+  *(u8*)from = 0xEB;
+  *(u8*)(from + 1) = (u8)offset;
+}
+
+#if SANITIZER_WINDOWS64
+static void WriteIndirectJumpInstruction(uptr from, uptr indirect_target) {
+  // jmp [rip + <offset>] = FF 25 <offset> where <offset> is a relative
+  // offset.
+  // The offset is the distance from then end of the jump instruction to the
+  // memory location containing the targeted address. The displacement is still
+  // 32-bit in x64, so indirect_target must be located within +/- 2GB range.
+  int offset = indirect_target - from - kIndirectJumpInstructionLength;
+  if (!DistanceIsWithin2Gig(from + kIndirectJumpInstructionLength,
+                            indirect_target)) {
+    InterceptionFailed();
+  }
+  *(u16*)from = 0x25FF;
+  *(u32*)(from + 2) = offset;
+}
+#endif
+
+static void WriteBranch(
+    uptr from, uptr indirect_target, uptr target) {
+#if SANITIZER_WINDOWS64
+  WriteIndirectJumpInstruction(from, indirect_target);
+  *(u64*)indirect_target = target;
+#else
+  (void)indirect_target;
+  WriteJumpInstruction(from, target);
+#endif
+}
+
+static void WriteDirectBranch(uptr from, uptr target) {
+#if SANITIZER_WINDOWS64
+  // Emit an indirect jump through immediately following bytes:
+  //   jmp [rip + kBranchLength]
+  //   .quad <target>
+  WriteBranch(from, from + kBranchLength, target);
+#else
+  WriteJumpInstruction(from, target);
+#endif
+}
+
+struct TrampolineMemoryRegion {
+  uptr content;
+  uptr allocated_size;
+  uptr max_size;
+};
+
+static const uptr kTrampolineScanLimitRange = 1 << 30;  // 1 gig
+static const int kMaxTrampolineRegion = 1024;
+static TrampolineMemoryRegion TrampolineRegions[kMaxTrampolineRegion];
+
+static void *AllocateTrampolineRegion(uptr image_address, size_t granularity) {
+#if SANITIZER_WINDOWS64
+  uptr address = image_address;
+  uptr scanned = 0;
+  while (scanned < kTrampolineScanLimitRange) {
+    MEMORY_BASIC_INFORMATION info;
+    if (!::VirtualQuery((void*)address, &info, sizeof(info)))
+      return nullptr;
+
+    // Check whether a region can be allocated at |address|.
+    if (info.State == MEM_FREE && info.RegionSize >= granularity) {
+      void *page = ::VirtualAlloc((void*)RoundUpTo(address, granularity),
+                                  granularity,
+                                  MEM_RESERVE | MEM_COMMIT,
+                                  PAGE_EXECUTE_READWRITE);
+      return page;
+    }
+
+    // Move to the next region.
+    address = (uptr)info.BaseAddress + info.RegionSize;
+    scanned += info.RegionSize;
+  }
+  return nullptr;
+#else
+  return ::VirtualAlloc(nullptr,
+                        granularity,
+                        MEM_RESERVE | MEM_COMMIT,
+                        PAGE_EXECUTE_READWRITE);
+#endif
+}
+
+// Used by unittests to release mapped memory space.
+void TestOnlyReleaseTrampolineRegions() {
+  for (size_t bucket = 0; bucket < kMaxTrampolineRegion; ++bucket) {
+    TrampolineMemoryRegion *current = &TrampolineRegions[bucket];
+    if (current->content == 0)
+      return;
+    ::VirtualFree((void*)current->content, 0, MEM_RELEASE);
+    current->content = 0;
+  }
+}
+
+static uptr AllocateMemoryForTrampoline(uptr image_address, size_t size) {
+  // Find a region within 2G with enough space to allocate |size| bytes.
+  TrampolineMemoryRegion *region = nullptr;
+  for (size_t bucket = 0; bucket < kMaxTrampolineRegion; ++bucket) {
+    TrampolineMemoryRegion* current = &TrampolineRegions[bucket];
+    if (current->content == 0) {
+      // No valid region found, allocate a new region.
+      size_t bucket_size = GetMmapGranularity();
+      void *content = AllocateTrampolineRegion(image_address, bucket_size);
+      if (content == nullptr)
+        return 0U;
+
+      current->content = (uptr)content;
+      current->allocated_size = 0;
+      current->max_size = bucket_size;
+      region = current;
+      break;
+    } else if (current->max_size - current->allocated_size > size) {
+#if SANITIZER_WINDOWS64
+        // In 64-bits, the memory space must be allocated within 2G boundary.
+        uptr next_address = current->content + current->allocated_size;
+        if (next_address < image_address ||
+            next_address - image_address >= 0x7FFF0000)
+          continue;
+#endif
+      // The space can be allocated in the current region.
+      region = current;
+      break;
+    }
   }
 
-  if (pool_used + size > POOL_SIZE)
-    return NULL;
+  // Failed to find a region.
+  if (region == nullptr)
+    return 0U;
 
-  char *ret = pool + pool_used;
-  pool_used += size;
-  return ret;
+  // Allocate the space in the current region.
+  uptr allocated_space = region->content + region->allocated_size;
+  region->allocated_size += size;
+  WritePadding(allocated_space, size);
+
+  return allocated_space;
 }
 
 // Returns 0 on error.
-static size_t RoundUpToInstrBoundary(size_t size, char *code) {
-  size_t cursor = 0;
-  while (cursor < size) {
-    switch (code[cursor]) {
-      case '\x51':  // push ecx
-      case '\x52':  // push edx
-      case '\x53':  // push ebx
-      case '\x54':  // push esp
-      case '\x55':  // push ebp
-      case '\x56':  // push esi
-      case '\x57':  // push edi
-      case '\x5D':  // pop ebp
-        cursor++;
-        continue;
-      case '\x6A':  // 6A XX = push XX
-        cursor += 2;
-        continue;
-      case '\xE9':  // E9 XX YY ZZ WW = jmp WWZZYYXX
-      case '\xB8':  // B8 XX YY ZZ WW = mov eax, WWZZYYXX
-        cursor += 5;
-        continue;
-    }
-    switch (*(unsigned short*)(code + cursor)) {  // NOLINT
-      case 0xFF8B:  // 8B FF = mov edi, edi
-      case 0xEC8B:  // 8B EC = mov ebp, esp
-      case 0xC033:  // 33 C0 = xor eax, eax
-        cursor += 2;
-        continue;
-      case 0x458B:  // 8B 45 XX = mov eax, dword ptr [ebp+XXh]
-      case 0x5D8B:  // 8B 5D XX = mov ebx, dword ptr [ebp+XXh]
-      case 0xEC83:  // 83 EC XX = sub esp, XX
-      case 0x75FF:  // FF 75 XX = push dword ptr [ebp+XXh]
-        cursor += 3;
-        continue;
-      case 0xC1F7:  // F7 C1 XX YY ZZ WW = test ecx, WWZZYYXX
-      case 0x25FF:  // FF 25 XX YY ZZ WW = jmp dword ptr ds:[WWZZYYXX]
-        cursor += 6;
-        continue;
-      case 0x3D83:  // 83 3D XX YY ZZ WW TT = cmp TT, WWZZYYXX
-        cursor += 7;
-        continue;
-    }
-    switch (0x00FFFFFF & *(unsigned int*)(code + cursor)) {
-      case 0x24448A:  // 8A 44 24 XX = mov eal, dword ptr [esp+XXh]
-      case 0x24448B:  // 8B 44 24 XX = mov eax, dword ptr [esp+XXh]
-      case 0x244C8B:  // 8B 4C 24 XX = mov ecx, dword ptr [esp+XXh]
-      case 0x24548B:  // 8B 54 24 XX = mov edx, dword ptr [esp+XXh]
-      case 0x24748B:  // 8B 74 24 XX = mov esi, dword ptr [esp+XXh]
-      case 0x247C8B:  // 8B 7C 24 XX = mov edi, dword ptr [esp+XXh]
-        cursor += 4;
-        continue;
-    }
+static size_t GetInstructionSize(uptr address) {
+  switch (*(u8*)address) {
+    case 0x90:  // 90 : nop
+      return 1;
 
-    // Unknown instruction!
-    // FIXME: Unknown instruction failures might happen when we add a new
-    // interceptor or a new compiler version. In either case, they should result
-    // in visible and readable error messages. However, merely calling abort()
-    // leads to an infinite recursion in CheckFailed.
-    // Do we have a good way to abort with an error message here?
-    __debugbreak();
-    return 0;
+    case 0x50:  // push eax / rax
+    case 0x51:  // push ecx / rcx
+    case 0x52:  // push edx / rdx
+    case 0x53:  // push ebx / rbx
+    case 0x54:  // push esp / rsp
+    case 0x55:  // push ebp / rbp
+    case 0x56:  // push esi / rsi
+    case 0x57:  // push edi / rdi
+    case 0x5D:  // pop ebp / rbp
+      return 1;
+
+    case 0x6A:  // 6A XX = push XX
+      return 2;
+
+    case 0xb8:  // b8 XX XX XX XX : mov eax, XX XX XX XX
+    case 0xB9:  // b9 XX XX XX XX : mov ecx, XX XX XX XX
+    case 0xA1:  // A1 XX XX XX XX : mov eax, dword ptr ds:[XXXXXXXX]
+      return 5;
+
+    // Cannot overwrite control-instruction. Return 0 to indicate failure.
+    case 0xE9:  // E9 XX XX XX XX : jmp <label>
+    case 0xE8:  // E8 XX XX XX XX : call <func>
+    case 0xC3:  // C3 : ret
+    case 0xEB:  // EB XX : jmp XX (short jump)
+    case 0x70:  // 7Y YY : jy XX (short conditional jump)
+    case 0x71:
+    case 0x72:
+    case 0x73:
+    case 0x74:
+    case 0x75:
+    case 0x76:
+    case 0x77:
+    case 0x78:
+    case 0x79:
+    case 0x7A:
+    case 0x7B:
+    case 0x7C:
+    case 0x7D:
+    case 0x7E:
+    case 0x7F:
+      return 0;
   }
 
+  switch (*(u16*)(address)) {
+    case 0xFF8B:  // 8B FF : mov edi, edi
+    case 0xEC8B:  // 8B EC : mov ebp, esp
+    case 0xc889:  // 89 C8 : mov eax, ecx
+    case 0xC18B:  // 8B C1 : mov eax, ecx
+    case 0xC033:  // 33 C0 : xor eax, eax
+    case 0xC933:  // 33 C9 : xor ecx, ecx
+    case 0xD233:  // 33 D2 : xor edx, edx
+      return 2;
+
+    // Cannot overwrite control-instruction. Return 0 to indicate failure.
+    case 0x25FF:  // FF 25 XX XX XX XX : jmp [XXXXXXXX]
+      return 0;
+  }
+
+#if SANITIZER_WINDOWS64
+  switch (*(u16*)address) {
+    case 0x5040:  // push rax
+    case 0x5140:  // push rcx
+    case 0x5240:  // push rdx
+    case 0x5340:  // push rbx
+    case 0x5440:  // push rsp
+    case 0x5540:  // push rbp
+    case 0x5640:  // push rsi
+    case 0x5740:  // push rdi
+    case 0x5441:  // push r12
+    case 0x5541:  // push r13
+    case 0x5641:  // push r14
+    case 0x5741:  // push r15
+    case 0x9066:  // Two-byte NOP
+      return 2;
+  }
+
+  switch (0x00FFFFFF & *(u32*)address) {
+    case 0xe58948:    // 48 89 e5 : mov rbp, rsp
+    case 0xc18b48:    // 48 8b c1 : mov rax, rcx
+    case 0xc48b48:    // 48 8b c4 : mov rax, rsp
+    case 0xd9f748:    // 48 f7 d9 : neg rcx
+    case 0xd12b48:    // 48 2b d1 : sub rdx, rcx
+    case 0x07c1f6:    // f6 c1 07 : test cl, 0x7
+    case 0xc0854d:    // 4d 85 c0 : test r8, r8
+    case 0xc2b60f:    // 0f b6 c2 : movzx eax, dl
+    case 0xc03345:    // 45 33 c0 : xor r8d, r8d
+    case 0xd98b4c:    // 4c 8b d9 : mov r11, rcx
+    case 0xd28b4c:    // 4c 8b d2 : mov r10, rdx
+    case 0xd2b60f:    // 0f b6 d2 : movzx edx, dl
+    case 0xca2b48:    // 48 2b ca : sub rcx, rdx
+    case 0x10b70f:    // 0f b7 10 : movzx edx, WORD PTR [rax]
+    case 0xc00b4d:    // 4d 0b c0 : or r8, r8
+    case 0xd18b48:    // 48 8b d1 : mov rdx, rcx
+    case 0xdc8b4c:    // 4c 8b dc : mov r11,rsp
+    case 0xd18b4c:    // 4c 8b d1 : mov r10, rcx
+      return 3;
+
+    case 0xec8348:    // 48 83 ec XX : sub rsp, XX
+    case 0xf88349:    // 49 83 f8 XX : cmp r8, XX
+    case 0x588948:    // 48 89 58 XX : mov QWORD PTR[rax + XX], rbx
+      return 4;
+
+    case 0x058b48:    // 48 8b 05 XX XX XX XX :
+                      //   mov rax, QWORD PTR [rip + XXXXXXXX]
+    case 0x25ff48:    // 48 ff 25 XX XX XX XX :
+                      //   rex.W jmp QWORD PTR [rip + XXXXXXXX]
+      return 7;
+  }
+
+  switch (*(u32*)(address)) {
+    case 0x24448b48:  // 48 8b 44 24 XX : mov rax, qword ptr [rsp + XX]
+    case 0x245c8948:  // 48 89 5c 24 XX : mov QWORD PTR [rsp + XX], rbx
+    case 0x24748948:  // 48 89 74 24 XX : mov QWORD PTR [rsp + XX], rsi
+      return 5;
+  }
+
+#else
+
+  switch (*(u16*)address) {
+    case 0x458B:  // 8B 45 XX : mov eax, dword ptr [ebp + XX]
+    case 0x5D8B:  // 8B 5D XX : mov ebx, dword ptr [ebp + XX]
+    case 0x7D8B:  // 8B 7D XX : mov edi, dword ptr [ebp + XX]
+    case 0xEC83:  // 83 EC XX : sub esp, XX
+    case 0x75FF:  // FF 75 XX : push dword ptr [ebp + XX]
+      return 3;
+    case 0xC1F7:  // F7 C1 XX YY ZZ WW : test ecx, WWZZYYXX
+    case 0x25FF:  // FF 25 XX YY ZZ WW : jmp dword ptr ds:[WWZZYYXX]
+      return 6;
+    case 0x3D83:  // 83 3D XX YY ZZ WW TT : cmp TT, WWZZYYXX
+      return 7;
+    case 0x7D83:  // 83 7D XX YY : cmp dword ptr [ebp + XX], YY
+      return 4;
+  }
+
+  switch (0x00FFFFFF & *(u32*)address) {
+    case 0x24448A:  // 8A 44 24 XX : mov al, byte ptr [esp + XX]
+    case 0x24448B:  // 8B 44 24 XX : mov eax, dword ptr [esp + XX]
+    case 0x244C8B:  // 8B 4C 24 XX : mov ecx, dword ptr [esp + XX]
+    case 0x24548B:  // 8B 54 24 XX : mov edx, dword ptr [esp + XX]
+    case 0x24748B:  // 8B 74 24 XX : mov esi, dword ptr [esp + XX]
+    case 0x247C8B:  // 8B 7C 24 XX : mov edi, dword ptr [esp + XX]
+      return 4;
+  }
+
+  switch (*(u32*)address) {
+    case 0x2444B60F:  // 0F B6 44 24 XX : movzx eax, byte ptr [esp + XX]
+      return 5;
+  }
+#endif
+
+  // Unknown instruction!
+  // FIXME: Unknown instruction failures might happen when we add a new
+  // interceptor or a new compiler version. In either case, they should result
+  // in visible and readable error messages. However, merely calling abort()
+  // leads to an infinite recursion in CheckFailed.
+  InterceptionFailed();
+  return 0;
+}
+
+// Returns 0 on error.
+static size_t RoundUpToInstrBoundary(size_t size, uptr address) {
+  size_t cursor = 0;
+  while (cursor < size) {
+    size_t instruction_size = GetInstructionSize(address + cursor);
+    if (!instruction_size)
+      return 0;
+    cursor += instruction_size;
+  }
   return cursor;
 }
 
-bool OverrideFunction(uptr old_func, uptr new_func, uptr *orig_old_func) {
-#ifdef _WIN64
-#error OverrideFunction is not yet supported on x64
-#endif
-  // Function overriding works basically like this:
-  // We write "jmp <new_func>" (5 bytes) at the beginning of the 'old_func'
-  // to override it.
-  // We might want to be able to execute the original 'old_func' from the
-  // wrapper, in this case we need to keep the leading 5+ bytes ('head')
-  // of the original code somewhere with a "jmp <old_func+head>".
-  // We call these 'head'+5 bytes of instructions a "trampoline".
-  char *old_bytes = (char *)old_func;
+#if !SANITIZER_WINDOWS64
+bool OverrideFunctionWithDetour(
+    uptr old_func, uptr new_func, uptr *orig_old_func) {
+  const int kDetourHeaderLen = 5;
+  const u16 kDetourInstruction = 0xFF8B;
 
-  // We'll need at least 5 bytes for a 'jmp'.
-  size_t head = 5;
-  if (orig_old_func) {
-    // Find out the number of bytes of the instructions we need to copy
-    // to the trampoline and store it in 'head'.
-    head = RoundUpToInstrBoundary(head, old_bytes);
-    if (!head)
-      return false;
+  uptr header = (uptr)old_func - kDetourHeaderLen;
+  uptr patch_length = kDetourHeaderLen + kShortJumpInstructionLength;
 
-    // Put the needed instructions into the trampoline bytes.
-    char *trampoline = GetMemoryForTrampoline(head + 5);
-    if (!trampoline)
-      return false;
-    _memcpy(trampoline, old_bytes, head);
-    WriteJumpInstruction(trampoline + head, old_bytes + head);
-    *orig_old_func = (uptr)trampoline;
-  }
-
-  // Now put the "jmp <new_func>" instruction at the original code location.
-  // We should preserve the EXECUTE flag as some of our own code might be
-  // located in the same page (sic!).  FIXME: might consider putting the
-  // __interception code into a separate section or something?
-  DWORD old_prot, unused_prot;
-  if (!VirtualProtect((void *)old_bytes, head, PAGE_EXECUTE_READWRITE,
-                      &old_prot))
+  // Validate that the function is hookable.
+  if (*(u16*)old_func != kDetourInstruction ||
+      !IsMemoryPadding(header, kDetourHeaderLen))
     return false;
 
-  WriteJumpInstruction(old_bytes, (char *)new_func);
-  _memset(old_bytes + 5, 0xCC /* int 3 */, head - 5);
+  // Change memory protection to writable.
+  DWORD protection = 0;
+  if (!ChangeMemoryProtection(header, patch_length, &protection))
+    return false;
 
-  // Restore the original permissions.
-  if (!VirtualProtect((void *)old_bytes, head, old_prot, &unused_prot))
-    return false;  // not clear if this failure bothers us.
+  // Write a relative jump to the redirected function.
+  WriteJumpInstruction(header, new_func);
+
+  // Write the short jump to the function prefix.
+  WriteShortJumpInstruction(old_func, header);
+
+  // Restore previous memory protection.
+  if (!RestoreMemoryProtection(header, patch_length, protection))
+    return false;
+
+  if (orig_old_func)
+    *orig_old_func = old_func + kShortJumpInstructionLength;
+
+  return true;
+}
+#endif
+
+bool OverrideFunctionWithRedirectJump(
+    uptr old_func, uptr new_func, uptr *orig_old_func) {
+  // Check whether the first instruction is a relative jump.
+  if (*(u8*)old_func != 0xE9)
+    return false;
+
+  if (orig_old_func) {
+    uptr relative_offset = *(u32*)(old_func + 1);
+    uptr absolute_target = old_func + relative_offset + kJumpInstructionLength;
+    *orig_old_func = absolute_target;
+  }
+
+#if SANITIZER_WINDOWS64
+  // If needed, get memory space for a trampoline jump.
+  uptr trampoline = AllocateMemoryForTrampoline(old_func, kDirectBranchLength);
+  if (!trampoline)
+    return false;
+  WriteDirectBranch(trampoline, new_func);
+#endif
+
+  // Change memory protection to writable.
+  DWORD protection = 0;
+  if (!ChangeMemoryProtection(old_func, kJumpInstructionLength, &protection))
+    return false;
+
+  // Write a relative jump to the redirected function.
+  WriteJumpInstruction(old_func, FIRST_32_SECOND_64(new_func, trampoline));
+
+  // Restore previous memory protection.
+  if (!RestoreMemoryProtection(old_func, kJumpInstructionLength, protection))
+    return false;
 
   return true;
 }
 
+bool OverrideFunctionWithHotPatch(
+    uptr old_func, uptr new_func, uptr *orig_old_func) {
+  const int kHotPatchHeaderLen = kBranchLength;
+
+  uptr header = (uptr)old_func - kHotPatchHeaderLen;
+  uptr patch_length = kHotPatchHeaderLen + kShortJumpInstructionLength;
+
+  // Validate that the function is hot patchable.
+  size_t instruction_size = GetInstructionSize(old_func);
+  if (instruction_size < kShortJumpInstructionLength ||
+      !FunctionHasPadding(old_func, kHotPatchHeaderLen))
+    return false;
+
+  if (orig_old_func) {
+    // Put the needed instructions into the trampoline bytes.
+    uptr trampoline_length = instruction_size + kDirectBranchLength;
+    uptr trampoline = AllocateMemoryForTrampoline(old_func, trampoline_length);
+    if (!trampoline)
+      return false;
+    CopyInstructions(trampoline, old_func, instruction_size);
+    WriteDirectBranch(trampoline + instruction_size,
+                      old_func + instruction_size);
+    *orig_old_func = trampoline;
+  }
+
+  // If needed, get memory space for indirect address.
+  uptr indirect_address = 0;
+#if SANITIZER_WINDOWS64
+  indirect_address = AllocateMemoryForTrampoline(old_func, kAddressLength);
+  if (!indirect_address)
+    return false;
+#endif
+
+  // Change memory protection to writable.
+  DWORD protection = 0;
+  if (!ChangeMemoryProtection(header, patch_length, &protection))
+    return false;
+
+  // Write jumps to the redirected function.
+  WriteBranch(header, indirect_address, new_func);
+  WriteShortJumpInstruction(old_func, header);
+
+  // Restore previous memory protection.
+  if (!RestoreMemoryProtection(header, patch_length, protection))
+    return false;
+
+  return true;
+}
+
+bool OverrideFunctionWithTrampoline(
+    uptr old_func, uptr new_func, uptr *orig_old_func) {
+
+  size_t instructions_length = kBranchLength;
+  size_t padding_length = 0;
+  uptr indirect_address = 0;
+
+  if (orig_old_func) {
+    // Find out the number of bytes of the instructions we need to copy
+    // to the trampoline.
+    instructions_length = RoundUpToInstrBoundary(kBranchLength, old_func);
+    if (!instructions_length)
+      return false;
+
+    // Put the needed instructions into the trampoline bytes.
+    uptr trampoline_length = instructions_length + kDirectBranchLength;
+    uptr trampoline = AllocateMemoryForTrampoline(old_func, trampoline_length);
+    if (!trampoline)
+      return false;
+    CopyInstructions(trampoline, old_func, instructions_length);
+    WriteDirectBranch(trampoline + instructions_length,
+                      old_func + instructions_length);
+    *orig_old_func = trampoline;
+  }
+
+#if SANITIZER_WINDOWS64
+  // Check if the targeted address can be encoded in the function padding.
+  // Otherwise, allocate it in the trampoline region.
+  if (IsMemoryPadding(old_func - kAddressLength, kAddressLength)) {
+    indirect_address = old_func - kAddressLength;
+    padding_length = kAddressLength;
+  } else {
+    indirect_address = AllocateMemoryForTrampoline(old_func, kAddressLength);
+    if (!indirect_address)
+      return false;
+  }
+#endif
+
+  // Change memory protection to writable.
+  uptr patch_address = old_func - padding_length;
+  uptr patch_length = instructions_length + padding_length;
+  DWORD protection = 0;
+  if (!ChangeMemoryProtection(patch_address, patch_length, &protection))
+    return false;
+
+  // Patch the original function.
+  WriteBranch(old_func, indirect_address, new_func);
+
+  // Restore previous memory protection.
+  if (!RestoreMemoryProtection(patch_address, patch_length, protection))
+    return false;
+
+  return true;
+}
+
+bool OverrideFunction(
+    uptr old_func, uptr new_func, uptr *orig_old_func) {
+#if !SANITIZER_WINDOWS64
+  if (OverrideFunctionWithDetour(old_func, new_func, orig_old_func))
+    return true;
+#endif
+  if (OverrideFunctionWithRedirectJump(old_func, new_func, orig_old_func))
+    return true;
+  if (OverrideFunctionWithHotPatch(old_func, new_func, orig_old_func))
+    return true;
+  if (OverrideFunctionWithTrampoline(old_func, new_func, orig_old_func))
+    return true;
+  return false;
+}
+
 static void **InterestingDLLsAvailable() {
-  const char *InterestingDLLs[] = {
-    "kernel32.dll",
-    "msvcr110.dll", // VS2012
-    "msvcr120.dll", // VS2013
-    // NTDLL should go last as it exports some functions that we should override
-    // in the CRT [presumably only used internally].
-    "ntdll.dll", NULL
-  };
+  static const char *InterestingDLLs[] = {
+      "kernel32.dll",
+      "msvcr110.dll",      // VS2012
+      "msvcr120.dll",      // VS2013
+      "vcruntime140.dll",  // VS2015
+      "ucrtbase.dll",      // Universal CRT
+      // NTDLL should go last as it exports some functions that we should
+      // override in the CRT [presumably only used internally].
+      "ntdll.dll", NULL};
   static void *result[ARRAY_SIZE(InterestingDLLs)] = { 0 };
   if (!result[0]) {
     for (size_t i = 0, j = 0; InterestingDLLs[i]; ++i) {
@@ -268,6 +836,71 @@
   return OverrideFunction(orig_func, new_func, orig_old_func);
 }
 
+bool OverrideImportedFunction(const char *module_to_patch,
+                              const char *imported_module,
+                              const char *function_name, uptr new_function,
+                              uptr *orig_old_func) {
+  HMODULE module = GetModuleHandleA(module_to_patch);
+  if (!module)
+    return false;
+
+  // Check that the module header is full and present.
+  RVAPtr<IMAGE_DOS_HEADER> dos_stub(module, 0);
+  RVAPtr<IMAGE_NT_HEADERS> headers(module, dos_stub->e_lfanew);
+  if (!module || dos_stub->e_magic != IMAGE_DOS_SIGNATURE || // "MZ"
+      headers->Signature != IMAGE_NT_SIGNATURE ||            // "PE\0\0"
+      headers->FileHeader.SizeOfOptionalHeader <
+          sizeof(IMAGE_OPTIONAL_HEADER)) {
+    return false;
+  }
+
+  IMAGE_DATA_DIRECTORY *import_directory =
+      &headers->OptionalHeader.DataDirectory[IMAGE_DIRECTORY_ENTRY_IMPORT];
+
+  // Iterate the list of imported DLLs. FirstThunk will be null for the last
+  // entry.
+  RVAPtr<IMAGE_IMPORT_DESCRIPTOR> imports(module,
+                                          import_directory->VirtualAddress);
+  for (; imports->FirstThunk != 0; ++imports) {
+    RVAPtr<const char> modname(module, imports->Name);
+    if (_stricmp(&*modname, imported_module) == 0)
+      break;
+  }
+  if (imports->FirstThunk == 0)
+    return false;
+
+  // We have two parallel arrays: the import address table (IAT) and the table
+  // of names. They start out containing the same data, but the loader rewrites
+  // the IAT to hold imported addresses and leaves the name table in
+  // OriginalFirstThunk alone.
+  RVAPtr<IMAGE_THUNK_DATA> name_table(module, imports->OriginalFirstThunk);
+  RVAPtr<IMAGE_THUNK_DATA> iat(module, imports->FirstThunk);
+  for (; name_table->u1.Ordinal != 0; ++name_table, ++iat) {
+    if (!IMAGE_SNAP_BY_ORDINAL(name_table->u1.Ordinal)) {
+      RVAPtr<IMAGE_IMPORT_BY_NAME> import_by_name(
+          module, name_table->u1.ForwarderString);
+      const char *funcname = &import_by_name->Name[0];
+      if (strcmp(funcname, function_name) == 0)
+        break;
+    }
+  }
+  if (name_table->u1.Ordinal == 0)
+    return false;
+
+  // Now we have the correct IAT entry. Do the swap. We have to make the page
+  // read/write first.
+  if (orig_old_func)
+    *orig_old_func = iat->u1.AddressOfData;
+  DWORD old_prot, unused_prot;
+  if (!VirtualProtect(&iat->u1.AddressOfData, 4, PAGE_EXECUTE_READWRITE,
+                      &old_prot))
+    return false;
+  iat->u1.AddressOfData = new_function;
+  if (!VirtualProtect(&iat->u1.AddressOfData, 4, old_prot, &unused_prot))
+    return false;  // Not clear if this failure bothers us.
+  return true;
+}
+
 }  // namespace __interception
 
 #endif  // _WIN32
diff --git a/lib/interception/interception_win.h b/lib/interception/interception_win.h
index 96c4a0c..9061f9e 100644
--- a/lib/interception/interception_win.h
+++ b/lib/interception/interception_win.h
@@ -34,6 +34,31 @@
 // Windows-only replacement for GetProcAddress. Useful for some sanitizers.
 uptr InternalGetProcAddress(void *module, const char *func_name);
 
+// Overrides a function only when it is called from a specific DLL. For example,
+// this is used to override calls to HeapAlloc/HeapFree from ucrtbase without
+// affecting other third party libraries.
+bool OverrideImportedFunction(const char *module_to_patch,
+                              const char *imported_module,
+                              const char *function_name, uptr new_function,
+                              uptr *orig_old_func);
+
+#if !SANITIZER_WINDOWS64
+// Exposed for unittests
+bool OverrideFunctionWithDetour(
+    uptr old_func, uptr new_func, uptr *orig_old_func);
+#endif
+
+// Exposed for unittests
+bool OverrideFunctionWithRedirectJump(
+    uptr old_func, uptr new_func, uptr *orig_old_func);
+bool OverrideFunctionWithHotPatch(
+    uptr old_func, uptr new_func, uptr *orig_old_func);
+bool OverrideFunctionWithTrampoline(
+    uptr old_func, uptr new_func, uptr *orig_old_func);
+
+// Exposed for unittests
+void TestOnlyReleaseTrampolineRegions();
+
 }  // namespace __interception
 
 #if defined(INTERCEPTION_DYNAMIC_CRT)
@@ -50,5 +75,10 @@
 
 #define INTERCEPT_FUNCTION_VER_WIN(func, symver) INTERCEPT_FUNCTION_WIN(func)
 
+#define INTERCEPT_FUNCTION_DLLIMPORT(user_dll, provider_dll, func)       \
+  ::__interception::OverrideImportedFunction(                            \
+      user_dll, provider_dll, #func, (::__interception::uptr)WRAP(func), \
+      (::__interception::uptr *)&REAL(func))
+
 #endif  // INTERCEPTION_WIN_H
 #endif  // _WIN32
diff --git a/lib/interception/tests/CMakeLists.txt b/lib/interception/tests/CMakeLists.txt
new file mode 100644
index 0000000..bfe41fe
--- /dev/null
+++ b/lib/interception/tests/CMakeLists.txt
@@ -0,0 +1,142 @@
+include(CompilerRTCompile)
+
+filter_available_targets(INTERCEPTION_UNITTEST_SUPPORTED_ARCH x86_64 i386 mips64 mips64el)
+
+set(INTERCEPTION_UNITTESTS
+  interception_linux_test.cc
+  interception_test_main.cc
+  interception_win_test.cc
+)
+
+set(INTERCEPTION_TEST_HEADERS)
+
+set(INTERCEPTION_TEST_CFLAGS_COMMON
+  ${COMPILER_RT_UNITTEST_CFLAGS}
+  ${COMPILER_RT_GTEST_CFLAGS}
+  -I${COMPILER_RT_SOURCE_DIR}/include
+  -I${COMPILER_RT_SOURCE_DIR}/lib
+  -I${COMPILER_RT_SOURCE_DIR}/lib/interception
+  -fno-rtti
+  -O2
+  -Werror=sign-compare
+  -Wno-non-virtual-dtor)
+
+# -gline-tables-only must be enough for these tests, so use it if possible.
+if(COMPILER_RT_TEST_COMPILER_ID MATCHES "Clang")
+  list(APPEND INTERCEPTION_TEST_CFLAGS_COMMON -gline-tables-only)
+else()
+  list(APPEND INTERCEPTION_TEST_CFLAGS_COMMON -g)
+endif()
+if(MSVC)
+  list(APPEND INTERCEPTION_TEST_CFLAGS_COMMON -gcodeview)
+endif()
+list(APPEND INTERCEPTION_TEST_LINK_FLAGS_COMMON -g)
+
+if(NOT MSVC)
+  list(APPEND INTERCEPTION_TEST_LINK_FLAGS_COMMON --driver-mode=g++)
+endif()
+
+if(ANDROID)
+  list(APPEND INTERCEPTION_TEST_LINK_FLAGS_COMMON -pie)
+endif()
+
+set(INTERCEPTION_TEST_LINK_LIBS)
+append_list_if(COMPILER_RT_HAS_LIBLOG log INTERCEPTION_TEST_LINK_LIBS)
+# NDK r10 requires -latomic almost always.
+append_list_if(ANDROID atomic INTERCEPTION_TEST_LINK_LIBS)
+
+append_list_if(COMPILER_RT_HAS_LIBDL -ldl INTERCEPTION_TEST_LINK_FLAGS_COMMON)
+append_list_if(COMPILER_RT_HAS_LIBRT -lrt INTERCEPTION_TEST_LINK_FLAGS_COMMON)
+append_list_if(COMPILER_RT_HAS_LIBPTHREAD -pthread INTERCEPTION_TEST_LINK_FLAGS_COMMON)
+# x86_64 FreeBSD 9.2 additionally requires libc++ to build the tests. Also,
+# 'libm' shall be specified explicitly to build i386 tests.
+if(CMAKE_SYSTEM MATCHES "FreeBSD-9.2-RELEASE")
+  list(APPEND INTERCEPTION_TEST_LINK_FLAGS_COMMON "-lc++ -lm")
+endif()
+
+include_directories(..)
+include_directories(../..)
+
+# Adds static library which contains interception object file
+# (universal binary on Mac and arch-specific object files on Linux).
+macro(add_interceptor_lib library)
+  add_library(${library} STATIC ${ARGN})
+  set_target_properties(${library} PROPERTIES
+    ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+    FOLDER "Compiler-RT Runtime tests")
+endmacro()
+
+function(get_interception_lib_for_arch arch lib lib_name)
+  if(APPLE)
+    set(tgt_name "RTInterception.test.osx")
+  else()
+    set(tgt_name "RTInterception.test.${arch}")
+  endif()
+  set(${lib} "${tgt_name}" PARENT_SCOPE)
+  if(CMAKE_CONFIGURATION_TYPES)
+   set(configuration_path "${CMAKE_CFG_INTDIR}/")
+  else()
+   set(configuration_path "")
+  endif()
+  if(NOT MSVC)
+    set(${lib_name} "${configuration_path}lib${tgt_name}.a" PARENT_SCOPE)
+  else()
+    set(${lib_name} "${configuration_path}${tgt_name}.lib" PARENT_SCOPE)
+  endif()
+endfunction()
+
+# Interception unit tests testsuite.
+add_custom_target(InterceptionUnitTests)
+set_target_properties(InterceptionUnitTests PROPERTIES
+  FOLDER "Compiler-RT Tests")
+
+# Adds interception tests for architecture.
+macro(add_interception_tests_for_arch arch)
+  get_target_flags_for_arch(${arch} TARGET_FLAGS)
+  set(INTERCEPTION_TEST_SOURCES ${INTERCEPTION_UNITTESTS}
+                             ${COMPILER_RT_GTEST_SOURCE})
+  set(INTERCEPTION_TEST_COMPILE_DEPS ${INTERCEPTION_TEST_HEADERS})
+  if(NOT COMPILER_RT_STANDALONE_BUILD)
+    list(APPEND INTERCEPTION_TEST_COMPILE_DEPS gtest)
+  endif()
+  set(INTERCEPTION_TEST_OBJECTS)
+  foreach(source ${INTERCEPTION_TEST_SOURCES})
+    get_filename_component(basename ${source} NAME)
+    if(CMAKE_CONFIGURATION_TYPES)
+      set(output_obj "${CMAKE_CFG_INTDIR}/${basename}.${arch}.o")
+    else()
+      set(output_obj "${basename}.${arch}.o")
+    endif()
+    clang_compile(${output_obj} ${source}
+                  CFLAGS ${INTERCEPTION_TEST_CFLAGS_COMMON} ${TARGET_FLAGS}
+                  DEPS ${INTERCEPTION_TEST_COMPILE_DEPS})
+    list(APPEND INTERCEPTION_TEST_OBJECTS ${output_obj})
+  endforeach()
+  get_interception_lib_for_arch(${arch} INTERCEPTION_COMMON_LIB
+                                INTERCEPTION_COMMON_LIB_NAME)
+  # Add unittest target.
+  set(INTERCEPTION_TEST_NAME "Interception-${arch}-Test")
+  add_compiler_rt_test(InterceptionUnitTests ${INTERCEPTION_TEST_NAME}
+                       OBJECTS ${INTERCEPTION_TEST_OBJECTS}
+                               ${INTERCEPTION_COMMON_LIB_NAME}
+                       DEPS ${INTERCEPTION_TEST_OBJECTS} ${INTERCEPTION_COMMON_LIB}
+                       LINK_FLAGS ${INTERCEPTION_TEST_LINK_FLAGS_COMMON}
+                                  ${TARGET_FLAGS})
+endmacro()
+
+if(COMPILER_RT_CAN_EXECUTE_TESTS AND NOT ANDROID AND NOT APPLE)
+  # We use just-built clang to build interception unittests, so we must
+  # be sure that produced binaries would work.
+  if(APPLE)
+    add_interceptor_lib("RTInterception.test.osx"
+                        $<TARGET_OBJECTS:RTInterception.osx>)
+  else()
+    foreach(arch ${INTERCEPTION_UNITTEST_SUPPORTED_ARCH})
+      add_interceptor_lib("RTInterception.test.${arch}"
+                          $<TARGET_OBJECTS:RTInterception.${arch}>)
+    endforeach()
+  endif()
+  foreach(arch ${INTERCEPTION_UNITTEST_SUPPORTED_ARCH})
+    add_interception_tests_for_arch(${arch})
+  endforeach()
+endif()
diff --git a/lib/interception/tests/interception_linux_test.cc b/lib/interception/tests/interception_linux_test.cc
new file mode 100644
index 0000000..4a1ae78
--- /dev/null
+++ b/lib/interception/tests/interception_linux_test.cc
@@ -0,0 +1,65 @@
+//===-- interception_linux_test.cc ----------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of ThreadSanitizer/AddressSanitizer runtime.
+// Tests for interception_linux.h.
+//
+//===----------------------------------------------------------------------===//
+#include "interception/interception.h"
+
+#include "gtest/gtest.h"
+
+// Too slow for debug build
+#if !SANITIZER_DEBUG
+#if SANITIZER_LINUX
+
+static int InterceptorFunctionCalled;
+
+DECLARE_REAL(int, isdigit, int);
+
+INTERCEPTOR(int, isdigit, int d) {
+  ++InterceptorFunctionCalled;
+  return d >= '0' && d <= '9';
+}
+
+namespace __interception {
+
+TEST(Interception, GetRealFunctionAddress) {
+  uptr expected_malloc_address = (uptr)(void*)&malloc;
+  uptr malloc_address = 0;
+  EXPECT_TRUE(GetRealFunctionAddress("malloc", &malloc_address, 0, 0));
+  EXPECT_EQ(expected_malloc_address, malloc_address);
+
+  uptr dummy_address = 0;
+  EXPECT_TRUE(
+      GetRealFunctionAddress("dummy_doesnt_exist__", &dummy_address, 0, 0));
+  EXPECT_EQ(0U, dummy_address);
+}
+
+TEST(Interception, Basic) {
+  ASSERT_TRUE(INTERCEPT_FUNCTION(isdigit));
+
+  // After interception, the counter should be incremented.
+  InterceptorFunctionCalled = 0;
+  EXPECT_NE(0, isdigit('1'));
+  EXPECT_EQ(1, InterceptorFunctionCalled);
+  EXPECT_EQ(0, isdigit('a'));
+  EXPECT_EQ(2, InterceptorFunctionCalled);
+
+  // Calling the REAL function should not affect the counter.
+  InterceptorFunctionCalled = 0;
+  EXPECT_NE(0, REAL(isdigit)('1'));
+  EXPECT_EQ(0, REAL(isdigit)('a'));
+  EXPECT_EQ(0, InterceptorFunctionCalled);
+}
+
+}  // namespace __interception
+
+#endif  // SANITIZER_LINUX
+#endif  // #if !SANITIZER_DEBUG
diff --git a/lib/interception/tests/interception_test_main.cc b/lib/interception/tests/interception_test_main.cc
new file mode 100644
index 0000000..311da51
--- /dev/null
+++ b/lib/interception/tests/interception_test_main.cc
@@ -0,0 +1,22 @@
+//===-- interception_test_main.cc------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of AddressSanitizer, an address sanity checker.
+//
+// Testing the machinery for providing replacements/wrappers for system
+// functions.
+//===----------------------------------------------------------------------===//
+
+#include "gtest/gtest.h"
+
+int main(int argc, char **argv) {
+  testing::GTEST_FLAG(death_test_style) = "threadsafe";
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/lib/interception/tests/interception_win_test.cc b/lib/interception/tests/interception_win_test.cc
new file mode 100644
index 0000000..611354f
--- /dev/null
+++ b/lib/interception/tests/interception_win_test.cc
@@ -0,0 +1,592 @@
+//===-- interception_win_test.cc ------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of ThreadSanitizer/AddressSanitizer runtime.
+// Tests for interception_win.h.
+//
+//===----------------------------------------------------------------------===//
+#include "interception/interception.h"
+
+#include "gtest/gtest.h"
+
+// Too slow for debug build
+#if !SANITIZER_DEBUG
+#if SANITIZER_WINDOWS
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+
+namespace __interception {
+namespace {
+
+enum FunctionPrefixKind {
+  FunctionPrefixNone,
+  FunctionPrefixPadding,
+  FunctionPrefixHotPatch,
+  FunctionPrefixDetour,
+};
+
+typedef bool (*TestOverrideFunction)(uptr, uptr, uptr*);
+typedef int (*IdentityFunction)(int);
+
+#if SANITIZER_WINDOWS64
+
+const u8 kIdentityCodeWithPrologue[] = {
+    0x55,                   // push        rbp
+    0x48, 0x89, 0xE5,       // mov         rbp,rsp
+    0x8B, 0xC1,             // mov         eax,ecx
+    0x5D,                   // pop         rbp
+    0xC3,                   // ret
+};
+
+const u8 kIdentityCodeWithPushPop[] = {
+    0x55,                   // push        rbp
+    0x48, 0x89, 0xE5,       // mov         rbp,rsp
+    0x53,                   // push        rbx
+    0x50,                   // push        rax
+    0x58,                   // pop         rax
+    0x8B, 0xC1,             // mov         eax,ecx
+    0x5B,                   // pop         rbx
+    0x5D,                   // pop         rbp
+    0xC3,                   // ret
+};
+
+const u8 kIdentityTwiceOffset = 16;
+const u8 kIdentityTwice[] = {
+    0x55,                   // push        rbp
+    0x48, 0x89, 0xE5,       // mov         rbp,rsp
+    0x8B, 0xC1,             // mov         eax,ecx
+    0x5D,                   // pop         rbp
+    0xC3,                   // ret
+    0x90, 0x90, 0x90, 0x90,
+    0x90, 0x90, 0x90, 0x90,
+    0x55,                   // push        rbp
+    0x48, 0x89, 0xE5,       // mov         rbp,rsp
+    0x8B, 0xC1,             // mov         eax,ecx
+    0x5D,                   // pop         rbp
+    0xC3,                   // ret
+};
+
+const u8 kIdentityCodeWithMov[] = {
+    0x89, 0xC8,             // mov         eax, ecx
+    0xC3,                   // ret
+};
+
+const u8 kIdentityCodeWithJump[] = {
+    0xE9, 0x04, 0x00, 0x00,
+    0x00,                   // jmp + 4
+    0xCC, 0xCC, 0xCC, 0xCC,
+    0x89, 0xC8,             // mov         eax, ecx
+    0xC3,                   // ret
+};
+
+#else
+
+const u8 kIdentityCodeWithPrologue[] = {
+    0x55,                   // push        ebp
+    0x8B, 0xEC,             // mov         ebp,esp
+    0x8B, 0x45, 0x08,       // mov         eax,dword ptr [ebp + 8]
+    0x5D,                   // pop         ebp
+    0xC3,                   // ret
+};
+
+const u8 kIdentityCodeWithPushPop[] = {
+    0x55,                   // push        ebp
+    0x8B, 0xEC,             // mov         ebp,esp
+    0x53,                   // push        ebx
+    0x50,                   // push        eax
+    0x58,                   // pop         eax
+    0x8B, 0x45, 0x08,       // mov         eax,dword ptr [ebp + 8]
+    0x5B,                   // pop         ebx
+    0x5D,                   // pop         ebp
+    0xC3,                   // ret
+};
+
+const u8 kIdentityTwiceOffset = 8;
+const u8 kIdentityTwice[] = {
+    0x55,                   // push        ebp
+    0x8B, 0xEC,             // mov         ebp,esp
+    0x8B, 0x45, 0x08,       // mov         eax,dword ptr [ebp + 8]
+    0x5D,                   // pop         ebp
+    0xC3,                   // ret
+    0x55,                   // push        ebp
+    0x8B, 0xEC,             // mov         ebp,esp
+    0x8B, 0x45, 0x08,       // mov         eax,dword ptr [ebp + 8]
+    0x5D,                   // pop         ebp
+    0xC3,                   // ret
+};
+
+const u8 kIdentityCodeWithMov[] = {
+    0x8B, 0x44, 0x24, 0x04, // mov         eax,dword ptr [esp + 4]
+    0xC3,                   // ret
+};
+
+const u8 kIdentityCodeWithJump[] = {
+    0xE9, 0x04, 0x00, 0x00,
+    0x00,                   // jmp + 4
+    0xCC, 0xCC, 0xCC, 0xCC,
+    0x8B, 0x44, 0x24, 0x04, // mov         eax,dword ptr [esp + 4]
+    0xC3,                   // ret
+};
+
+#endif
+
+const u8 kPatchableCode1[] = {
+    0xB8, 0x4B, 0x00, 0x00, 0x00,   // mov eax,4B
+    0x33, 0xC9,                     // xor ecx,ecx
+    0xC3,                           // ret
+};
+
+const u8 kPatchableCode2[] = {
+    0x55,                           // push ebp
+    0x8B, 0xEC,                     // mov ebp,esp
+    0x33, 0xC0,                     // xor eax,eax
+    0x5D,                           // pop ebp
+    0xC3,                           // ret
+};
+
+const u8 kPatchableCode3[] = {
+    0x55,                           // push ebp
+    0x8B, 0xEC,                     // mov ebp,esp
+    0x6A, 0x00,                     // push 0
+    0xE8, 0x3D, 0xFF, 0xFF, 0xFF,   // call <func>
+};
+
+const u8 kPatchableCode4[] = {
+    0xE9, 0xCC, 0xCC, 0xCC, 0xCC,   // jmp <label>
+    0x90, 0x90, 0x90, 0x90,
+};
+
+const u8 kUnpatchableCode1[] = {
+    0xC3,                           // ret
+};
+
+const u8 kUnpatchableCode2[] = {
+    0x33, 0xC9,                     // xor ecx,ecx
+    0xC3,                           // ret
+};
+
+const u8 kUnpatchableCode3[] = {
+    0x75, 0xCC,                     // jne <label>
+    0x33, 0xC9,                     // xor ecx,ecx
+    0xC3,                           // ret
+};
+
+const u8 kUnpatchableCode4[] = {
+    0x74, 0xCC,                     // je <label>
+    0x33, 0xC9,                     // xor ecx,ecx
+    0xC3,                           // ret
+};
+
+const u8 kUnpatchableCode5[] = {
+    0xEB, 0x02,                     // jmp <label>
+    0x33, 0xC9,                     // xor ecx,ecx
+    0xC3,                           // ret
+};
+
+const u8 kUnpatchableCode6[] = {
+    0xE8, 0xCC, 0xCC, 0xCC, 0xCC,   // call <func>
+    0x90, 0x90, 0x90, 0x90,
+};
+
+// A buffer holding the dynamically generated code under test.
+u8* ActiveCode;
+size_t ActiveCodeLength = 4096;
+
+template<class T>
+static void LoadActiveCode(
+    const T &code,
+    uptr *entry_point,
+    FunctionPrefixKind prefix_kind = FunctionPrefixNone) {
+  if (ActiveCode == nullptr) {
+    ActiveCode =
+        (u8*)::VirtualAlloc(nullptr, ActiveCodeLength,
+                            MEM_COMMIT | MEM_RESERVE,
+                            PAGE_EXECUTE_READWRITE);
+    ASSERT_NE(ActiveCode, nullptr);
+  }
+
+  size_t position = 0;
+
+  // Add padding to avoid memory violation when scanning the prefix.
+  for (int i = 0; i < 16; ++i)
+    ActiveCode[position++] = 0xC3;  // Instruction 'ret'.
+
+  // Add function padding.
+  size_t padding = 0;
+  if (prefix_kind == FunctionPrefixPadding)
+    padding = 16;
+  else if (prefix_kind == FunctionPrefixDetour ||
+           prefix_kind == FunctionPrefixHotPatch)
+    padding = FIRST_32_SECOND_64(5, 6);
+  // Insert |padding| instructions 'nop'.
+  for (size_t i = 0; i < padding; ++i)
+    ActiveCode[position++] = 0x90;
+
+  // Keep track of the entry point.
+  *entry_point = (uptr)&ActiveCode[position];
+
+  // Add the detour instruction (i.e. mov edi, edi)
+  if (prefix_kind == FunctionPrefixDetour) {
+#if SANITIZER_WINDOWS64
+    // Note that "mov edi,edi" is NOP in 32-bit only, in 64-bit it clears
+    // higher bits of RDI.
+    // Use 66,90H as NOP for Windows64.
+    ActiveCode[position++] = 0x66;
+    ActiveCode[position++] = 0x90;
+#else
+    // mov edi,edi.
+    ActiveCode[position++] = 0x8B;
+    ActiveCode[position++] = 0xFF;
+#endif
+
+  }
+
+  // Copy the function body.
+  for (size_t i = 0; i < sizeof(T); ++i)
+    ActiveCode[position++] = code[i];
+}
+
+int InterceptorFunctionCalled;
+IdentityFunction InterceptedRealFunction;
+
+int InterceptorFunction(int x) {
+  ++InterceptorFunctionCalled;
+  return InterceptedRealFunction(x);
+}
+
+}  // namespace
+
+// Tests for interception_win.h
+TEST(Interception, InternalGetProcAddress) {
+  HMODULE ntdll_handle = ::GetModuleHandle("ntdll");
+  ASSERT_NE(nullptr, ntdll_handle);
+  uptr DbgPrint_expected = (uptr)::GetProcAddress(ntdll_handle, "DbgPrint");
+  uptr isdigit_expected = (uptr)::GetProcAddress(ntdll_handle, "isdigit");
+  uptr DbgPrint_address = InternalGetProcAddress(ntdll_handle, "DbgPrint");
+  uptr isdigit_address = InternalGetProcAddress(ntdll_handle, "isdigit");
+
+  EXPECT_EQ(DbgPrint_expected, DbgPrint_address);
+  EXPECT_EQ(isdigit_expected, isdigit_address);
+  EXPECT_NE(DbgPrint_address, isdigit_address);
+}
+
+template<class T>
+static void TestIdentityFunctionPatching(
+    const T &code,
+    TestOverrideFunction override,
+    FunctionPrefixKind prefix_kind = FunctionPrefixNone) {
+  uptr identity_address;
+  LoadActiveCode(code, &identity_address, prefix_kind);
+  IdentityFunction identity = (IdentityFunction)identity_address;
+
+  // Validate behavior before dynamic patching.
+  InterceptorFunctionCalled = 0;
+  EXPECT_EQ(0, identity(0));
+  EXPECT_EQ(42, identity(42));
+  EXPECT_EQ(0, InterceptorFunctionCalled);
+
+  // Patch the function.
+  uptr real_identity_address = 0;
+  bool success = override(identity_address,
+                         (uptr)&InterceptorFunction,
+                         &real_identity_address);
+  EXPECT_TRUE(success);
+  EXPECT_NE(0U, real_identity_address);
+  IdentityFunction real_identity = (IdentityFunction)real_identity_address;
+  InterceptedRealFunction = real_identity;
+
+  // Don't run tests if hooking failed or the real function is not valid.
+  if (!success || !real_identity_address)
+    return;
+
+  // Calling the redirected function.
+  InterceptorFunctionCalled = 0;
+  EXPECT_EQ(0, identity(0));
+  EXPECT_EQ(42, identity(42));
+  EXPECT_EQ(2, InterceptorFunctionCalled);
+
+  // Calling the real function.
+  InterceptorFunctionCalled = 0;
+  EXPECT_EQ(0, real_identity(0));
+  EXPECT_EQ(42, real_identity(42));
+  EXPECT_EQ(0, InterceptorFunctionCalled);
+
+  TestOnlyReleaseTrampolineRegions();
+}
+
+#if !SANITIZER_WINDOWS64
+TEST(Interception, OverrideFunctionWithDetour) {
+  TestOverrideFunction override = OverrideFunctionWithDetour;
+  FunctionPrefixKind prefix = FunctionPrefixDetour;
+  TestIdentityFunctionPatching(kIdentityCodeWithPrologue, override, prefix);
+  TestIdentityFunctionPatching(kIdentityCodeWithPushPop, override, prefix);
+  TestIdentityFunctionPatching(kIdentityCodeWithMov, override, prefix);
+  TestIdentityFunctionPatching(kIdentityCodeWithJump, override, prefix);
+}
+#endif  // !SANITIZER_WINDOWS64
+
+TEST(Interception, OverrideFunctionWithRedirectJump) {
+  TestOverrideFunction override = OverrideFunctionWithRedirectJump;
+  TestIdentityFunctionPatching(kIdentityCodeWithJump, override);
+}
+
+TEST(Interception, OverrideFunctionWithHotPatch) {
+  TestOverrideFunction override = OverrideFunctionWithHotPatch;
+  FunctionPrefixKind prefix = FunctionPrefixHotPatch;
+  TestIdentityFunctionPatching(kIdentityCodeWithMov, override, prefix);
+}
+
+TEST(Interception, OverrideFunctionWithTrampoline) {
+  TestOverrideFunction override = OverrideFunctionWithTrampoline;
+  FunctionPrefixKind prefix = FunctionPrefixNone;
+  TestIdentityFunctionPatching(kIdentityCodeWithPrologue, override, prefix);
+  TestIdentityFunctionPatching(kIdentityCodeWithPushPop, override, prefix);
+
+  prefix = FunctionPrefixPadding;
+  TestIdentityFunctionPatching(kIdentityCodeWithPrologue, override, prefix);
+  TestIdentityFunctionPatching(kIdentityCodeWithPushPop, override, prefix);
+}
+
+TEST(Interception, OverrideFunction) {
+  TestOverrideFunction override = OverrideFunction;
+  FunctionPrefixKind prefix = FunctionPrefixNone;
+  TestIdentityFunctionPatching(kIdentityCodeWithPrologue, override, prefix);
+  TestIdentityFunctionPatching(kIdentityCodeWithPushPop, override, prefix);
+  TestIdentityFunctionPatching(kIdentityCodeWithJump, override, prefix);
+
+  prefix = FunctionPrefixPadding;
+  TestIdentityFunctionPatching(kIdentityCodeWithPrologue, override, prefix);
+  TestIdentityFunctionPatching(kIdentityCodeWithPushPop, override, prefix);
+  TestIdentityFunctionPatching(kIdentityCodeWithMov, override, prefix);
+  TestIdentityFunctionPatching(kIdentityCodeWithJump, override, prefix);
+
+  prefix = FunctionPrefixHotPatch;
+  TestIdentityFunctionPatching(kIdentityCodeWithPrologue, override, prefix);
+  TestIdentityFunctionPatching(kIdentityCodeWithPushPop, override, prefix);
+  TestIdentityFunctionPatching(kIdentityCodeWithMov, override, prefix);
+  TestIdentityFunctionPatching(kIdentityCodeWithJump, override, prefix);
+
+  prefix = FunctionPrefixDetour;
+  TestIdentityFunctionPatching(kIdentityCodeWithPrologue, override, prefix);
+  TestIdentityFunctionPatching(kIdentityCodeWithPushPop, override, prefix);
+  TestIdentityFunctionPatching(kIdentityCodeWithMov, override, prefix);
+  TestIdentityFunctionPatching(kIdentityCodeWithJump, override, prefix);
+}
+
+template<class T>
+static void TestIdentityFunctionMultiplePatching(
+    const T &code,
+    TestOverrideFunction override,
+    FunctionPrefixKind prefix_kind = FunctionPrefixNone) {
+  uptr identity_address;
+  LoadActiveCode(code, &identity_address, prefix_kind);
+
+  // Patch the function.
+  uptr real_identity_address = 0;
+  bool success = override(identity_address,
+                          (uptr)&InterceptorFunction,
+                          &real_identity_address);
+  EXPECT_TRUE(success);
+  EXPECT_NE(0U, real_identity_address);
+
+  // Re-patching the function should not work.
+  success = override(identity_address,
+                     (uptr)&InterceptorFunction,
+                     &real_identity_address);
+  EXPECT_FALSE(success);
+
+  TestOnlyReleaseTrampolineRegions();
+}
+
+TEST(Interception, OverrideFunctionMultiplePatchingIsFailing) {
+#if !SANITIZER_WINDOWS64
+  TestIdentityFunctionMultiplePatching(kIdentityCodeWithPrologue,
+                                       OverrideFunctionWithDetour,
+                                       FunctionPrefixDetour);
+#endif
+
+  TestIdentityFunctionMultiplePatching(kIdentityCodeWithMov,
+                                       OverrideFunctionWithHotPatch,
+                                       FunctionPrefixHotPatch);
+
+  TestIdentityFunctionMultiplePatching(kIdentityCodeWithPushPop,
+                                       OverrideFunctionWithTrampoline,
+                                       FunctionPrefixPadding);
+}
+
+TEST(Interception, OverrideFunctionTwice) {
+  uptr identity_address1;
+  LoadActiveCode(kIdentityTwice, &identity_address1);
+  uptr identity_address2 = identity_address1 + kIdentityTwiceOffset;
+  IdentityFunction identity1 = (IdentityFunction)identity_address1;
+  IdentityFunction identity2 = (IdentityFunction)identity_address2;
+
+  // Patch the two functions.
+  uptr real_identity_address = 0;
+  EXPECT_TRUE(OverrideFunction(identity_address1,
+                               (uptr)&InterceptorFunction,
+                               &real_identity_address));
+  EXPECT_TRUE(OverrideFunction(identity_address2,
+                               (uptr)&InterceptorFunction,
+                               &real_identity_address));
+  IdentityFunction real_identity = (IdentityFunction)real_identity_address;
+  InterceptedRealFunction = real_identity;
+
+  // Calling the redirected function.
+  InterceptorFunctionCalled = 0;
+  EXPECT_EQ(42, identity1(42));
+  EXPECT_EQ(42, identity2(42));
+  EXPECT_EQ(2, InterceptorFunctionCalled);
+
+  TestOnlyReleaseTrampolineRegions();
+}
+
+template<class T>
+static bool TestFunctionPatching(
+    const T &code,
+    TestOverrideFunction override,
+    FunctionPrefixKind prefix_kind = FunctionPrefixNone) {
+  uptr address;
+  LoadActiveCode(code, &address, prefix_kind);
+  uptr unused_real_address = 0;
+  bool result = override(
+      address, (uptr)&InterceptorFunction, &unused_real_address);
+
+  TestOnlyReleaseTrampolineRegions();
+  return result;
+}
+
+TEST(Interception, PatchableFunction) {
+  TestOverrideFunction override = OverrideFunction;
+  // Test without function padding.
+  EXPECT_TRUE(TestFunctionPatching(kPatchableCode1, override));
+  EXPECT_TRUE(TestFunctionPatching(kPatchableCode2, override));
+#if SANITIZER_WINDOWS64
+  EXPECT_FALSE(TestFunctionPatching(kPatchableCode3, override));
+#else
+  EXPECT_TRUE(TestFunctionPatching(kPatchableCode3, override));
+#endif
+  EXPECT_TRUE(TestFunctionPatching(kPatchableCode4, override));
+
+  EXPECT_FALSE(TestFunctionPatching(kUnpatchableCode1, override));
+  EXPECT_FALSE(TestFunctionPatching(kUnpatchableCode2, override));
+  EXPECT_FALSE(TestFunctionPatching(kUnpatchableCode3, override));
+  EXPECT_FALSE(TestFunctionPatching(kUnpatchableCode4, override));
+  EXPECT_FALSE(TestFunctionPatching(kUnpatchableCode5, override));
+  EXPECT_FALSE(TestFunctionPatching(kUnpatchableCode6, override));
+}
+
+#if !SANITIZER_WINDOWS64
+TEST(Interception, PatchableFunctionWithDetour) {
+  TestOverrideFunction override = OverrideFunctionWithDetour;
+  // Without the prefix, no function can be detoured.
+  EXPECT_FALSE(TestFunctionPatching(kPatchableCode1, override));
+  EXPECT_FALSE(TestFunctionPatching(kPatchableCode2, override));
+  EXPECT_FALSE(TestFunctionPatching(kPatchableCode3, override));
+  EXPECT_FALSE(TestFunctionPatching(kPatchableCode4, override));
+  EXPECT_FALSE(TestFunctionPatching(kUnpatchableCode1, override));
+  EXPECT_FALSE(TestFunctionPatching(kUnpatchableCode2, override));
+  EXPECT_FALSE(TestFunctionPatching(kUnpatchableCode3, override));
+  EXPECT_FALSE(TestFunctionPatching(kUnpatchableCode4, override));
+  EXPECT_FALSE(TestFunctionPatching(kUnpatchableCode5, override));
+  EXPECT_FALSE(TestFunctionPatching(kUnpatchableCode6, override));
+
+  // With the prefix, all functions can be detoured.
+  FunctionPrefixKind prefix = FunctionPrefixDetour;
+  EXPECT_TRUE(TestFunctionPatching(kPatchableCode1, override, prefix));
+  EXPECT_TRUE(TestFunctionPatching(kPatchableCode2, override, prefix));
+  EXPECT_TRUE(TestFunctionPatching(kPatchableCode3, override, prefix));
+  EXPECT_TRUE(TestFunctionPatching(kPatchableCode4, override, prefix));
+  EXPECT_TRUE(TestFunctionPatching(kUnpatchableCode1, override, prefix));
+  EXPECT_TRUE(TestFunctionPatching(kUnpatchableCode2, override, prefix));
+  EXPECT_TRUE(TestFunctionPatching(kUnpatchableCode3, override, prefix));
+  EXPECT_TRUE(TestFunctionPatching(kUnpatchableCode4, override, prefix));
+  EXPECT_TRUE(TestFunctionPatching(kUnpatchableCode5, override, prefix));
+  EXPECT_TRUE(TestFunctionPatching(kUnpatchableCode6, override, prefix));
+}
+#endif  // !SANITIZER_WINDOWS64
+
+TEST(Interception, PatchableFunctionWithRedirectJump) {
+  TestOverrideFunction override = OverrideFunctionWithRedirectJump;
+  EXPECT_FALSE(TestFunctionPatching(kPatchableCode1, override));
+  EXPECT_FALSE(TestFunctionPatching(kPatchableCode2, override));
+  EXPECT_FALSE(TestFunctionPatching(kPatchableCode3, override));
+  EXPECT_TRUE(TestFunctionPatching(kPatchableCode4, override));
+  EXPECT_FALSE(TestFunctionPatching(kUnpatchableCode1, override));
+  EXPECT_FALSE(TestFunctionPatching(kUnpatchableCode2, override));
+  EXPECT_FALSE(TestFunctionPatching(kUnpatchableCode3, override));
+  EXPECT_FALSE(TestFunctionPatching(kUnpatchableCode4, override));
+  EXPECT_FALSE(TestFunctionPatching(kUnpatchableCode5, override));
+  EXPECT_FALSE(TestFunctionPatching(kUnpatchableCode6, override));
+}
+
+TEST(Interception, PatchableFunctionWithHotPatch) {
+  TestOverrideFunction override = OverrideFunctionWithHotPatch;
+  FunctionPrefixKind prefix = FunctionPrefixHotPatch;
+
+  EXPECT_TRUE(TestFunctionPatching(kPatchableCode1, override, prefix));
+  EXPECT_FALSE(TestFunctionPatching(kPatchableCode2, override, prefix));
+  EXPECT_FALSE(TestFunctionPatching(kPatchableCode3, override, prefix));
+  EXPECT_FALSE(TestFunctionPatching(kPatchableCode4, override, prefix));
+
+  EXPECT_FALSE(TestFunctionPatching(kUnpatchableCode1, override, prefix));
+  EXPECT_TRUE(TestFunctionPatching(kUnpatchableCode2, override, prefix));
+  EXPECT_FALSE(TestFunctionPatching(kUnpatchableCode3, override, prefix));
+  EXPECT_FALSE(TestFunctionPatching(kUnpatchableCode4, override, prefix));
+  EXPECT_FALSE(TestFunctionPatching(kUnpatchableCode5, override, prefix));
+  EXPECT_FALSE(TestFunctionPatching(kUnpatchableCode6, override, prefix));
+}
+
+TEST(Interception, PatchableFunctionWithTrampoline) {
+  TestOverrideFunction override = OverrideFunctionWithTrampoline;
+  FunctionPrefixKind prefix = FunctionPrefixPadding;
+
+  EXPECT_TRUE(TestFunctionPatching(kPatchableCode1, override, prefix));
+  EXPECT_TRUE(TestFunctionPatching(kPatchableCode2, override, prefix));
+#if SANITIZER_WINDOWS64
+  EXPECT_FALSE(TestFunctionPatching(kPatchableCode3, override, prefix));
+#else
+  EXPECT_TRUE(TestFunctionPatching(kPatchableCode3, override, prefix));
+#endif
+  EXPECT_FALSE(TestFunctionPatching(kPatchableCode4, override, prefix));
+
+  EXPECT_FALSE(TestFunctionPatching(kUnpatchableCode1, override, prefix));
+  EXPECT_FALSE(TestFunctionPatching(kUnpatchableCode2, override, prefix));
+  EXPECT_FALSE(TestFunctionPatching(kUnpatchableCode3, override, prefix));
+  EXPECT_FALSE(TestFunctionPatching(kUnpatchableCode4, override, prefix));
+  EXPECT_FALSE(TestFunctionPatching(kUnpatchableCode5, override, prefix));
+  EXPECT_FALSE(TestFunctionPatching(kUnpatchableCode6, override, prefix));
+}
+
+TEST(Interception, PatchableFunctionPadding) {
+  TestOverrideFunction override = OverrideFunction;
+  FunctionPrefixKind prefix = FunctionPrefixPadding;
+
+  EXPECT_TRUE(TestFunctionPatching(kPatchableCode1, override, prefix));
+  EXPECT_TRUE(TestFunctionPatching(kPatchableCode2, override, prefix));
+#if SANITIZER_WINDOWS64
+  EXPECT_FALSE(TestFunctionPatching(kPatchableCode3, override, prefix));
+#else
+  EXPECT_TRUE(TestFunctionPatching(kPatchableCode3, override, prefix));
+#endif
+  EXPECT_TRUE(TestFunctionPatching(kPatchableCode4, override, prefix));
+
+  EXPECT_FALSE(TestFunctionPatching(kUnpatchableCode1, override, prefix));
+  EXPECT_TRUE(TestFunctionPatching(kUnpatchableCode2, override, prefix));
+  EXPECT_FALSE(TestFunctionPatching(kUnpatchableCode3, override, prefix));
+  EXPECT_FALSE(TestFunctionPatching(kUnpatchableCode4, override, prefix));
+  EXPECT_FALSE(TestFunctionPatching(kUnpatchableCode5, override, prefix));
+  EXPECT_FALSE(TestFunctionPatching(kUnpatchableCode6, override, prefix));
+}
+
+}  // namespace __interception
+
+#endif  // SANITIZER_WINDOWS
+#endif  // #if !SANITIZER_DEBUG
diff --git a/lib/lsan/CMakeLists.txt b/lib/lsan/CMakeLists.txt
index 20e4093..9412c7a 100644
--- a/lib/lsan/CMakeLists.txt
+++ b/lib/lsan/CMakeLists.txt
@@ -1,7 +1,7 @@
 include_directories(..)
 
 set(LSAN_CFLAGS ${SANITIZER_COMMON_CFLAGS})
-append_no_rtti_flag(LSAN_CFLAGS)
+append_rtti_flag(OFF LSAN_CFLAGS)
 
 set(LSAN_COMMON_SOURCES
   lsan_common.cc
@@ -17,6 +17,7 @@
 set(LSAN_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR})
 
 add_custom_target(lsan)
+set_target_properties(lsan PROPERTIES FOLDER "Compiler-RT Misc")
 
 add_compiler_rt_object_libraries(RTLSanCommon
     OS ${SANITIZER_COMMON_SUPPORTED_OS}
diff --git a/lib/lsan/Makefile.mk b/lib/lsan/Makefile.mk
deleted file mode 100644
index 5e70634..0000000
--- a/lib/lsan/Makefile.mk
+++ /dev/null
@@ -1,25 +0,0 @@
-#===- lib/lsan/Makefile.mk ---------------------------------*- Makefile -*--===#
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-#===------------------------------------------------------------------------===#
-
-ModuleName := lsan
-SubDirs := 
-
-Sources := $(foreach file,$(wildcard $(Dir)/*.cc),$(notdir $(file)))
-ObjNames := $(Sources:%.cc=%.o)
-
-Implementation := Generic
-
-# FIXME: use automatic dependencies?
-Dependencies := $(wildcard $(Dir)/*.h)
-Dependencies += $(wildcard $(Dir)/../interception/*.h)
-Dependencies += $(wildcard $(Dir)/../sanitizer_common/*.h)
-
-# lsan functions used in another sanitizers.
-LsanCommonSources := $(foreach file,$(wildcard $(Dir)/lsan_common*.cc),$(notdir $(file)))
-LsanCommonFunctions := $(LsanCommonSources:%.cc=%)
diff --git a/lib/lsan/lsan.cc b/lib/lsan/lsan.cc
index f3e6ad7..c7c3429 100644
--- a/lib/lsan/lsan.cc
+++ b/lib/lsan/lsan.cc
@@ -43,6 +43,7 @@
     cf.CopyFrom(*common_flags());
     cf.external_symbolizer_path = GetEnv("LSAN_SYMBOLIZER_PATH");
     cf.malloc_context_size = 30;
+    cf.intercept_tls_get_addr = true;
     cf.detect_leaks = true;
     cf.exitcode = 23;
     OverrideCommonFlags(cf);
@@ -71,6 +72,7 @@
   lsan_init_is_running = true;
   SanitizerToolName = "LeakSanitizer";
   CacheBinaryName();
+  AvoidCVE_2016_2143();
   InitializeFlags();
   InitCommonLsan();
   InitializeAllocator();
diff --git a/lib/lsan/lsan.h b/lib/lsan/lsan.h
index 53783cd..ec5eb93 100644
--- a/lib/lsan/lsan.h
+++ b/lib/lsan/lsan.h
@@ -24,8 +24,11 @@
       stack_top = t->stack_end();                                              \
       stack_bottom = t->stack_begin();                                         \
     }                                                                          \
-    stack.Unwind(max_size, StackTrace::GetCurrentPc(), GET_CURRENT_FRAME(),    \
-                 /* context */ 0, stack_top, stack_bottom, fast);              \
+    if (!SANITIZER_MIPS ||                                                     \
+        IsValidFrame(GET_CURRENT_FRAME(), stack_top, stack_bottom)) {          \
+      stack.Unwind(max_size, StackTrace::GetCurrentPc(), GET_CURRENT_FRAME(),  \
+                   /* context */ 0, stack_top, stack_bottom, fast);            \
+    }                                                                          \
   }
 
 #define GET_STACK_TRACE_FATAL \
diff --git a/lib/lsan/lsan_allocator.cc b/lib/lsan/lsan_allocator.cc
index 0a36781..a5220f1 100644
--- a/lib/lsan/lsan_allocator.cc
+++ b/lib/lsan/lsan_allocator.cc
@@ -99,11 +99,13 @@
     memset(p, 0, size);
   RegisterAllocation(stack, p, size);
   if (&__sanitizer_malloc_hook) __sanitizer_malloc_hook(p, size);
+  RunMallocHooks(p, size);
   return p;
 }
 
 void Deallocate(void *p) {
   if (&__sanitizer_free_hook) __sanitizer_free_hook(p);
+  RunFreeHooks(p);
   RegisterDeallocation(p);
   allocator.Deallocate(&cache, p);
 }
diff --git a/lib/lsan/lsan_common.cc b/lib/lsan/lsan_common.cc
index 1cffac4..888a25b 100644
--- a/lib/lsan/lsan_common.cc
+++ b/lib/lsan/lsan_common.cc
@@ -23,6 +23,7 @@
 #include "sanitizer_common/sanitizer_stacktrace.h"
 #include "sanitizer_common/sanitizer_suppressions.h"
 #include "sanitizer_common/sanitizer_report_decorator.h"
+#include "sanitizer_common/sanitizer_tls_get_addr.h"
 
 #if CAN_SANITIZE_LEAKS
 namespace __lsan {
@@ -33,6 +34,14 @@
 
 THREADLOCAL int disable_counter;
 bool DisabledInThisThread() { return disable_counter > 0; }
+void DisableInThisThread() { disable_counter++; }
+void EnableInThisThread() {
+  if (!disable_counter && common_flags()->detect_leaks) {
+    Report("Unmatched call to __lsan_enable().\n");
+    Die();
+  }
+  disable_counter--;
+}
 
 Flags lsan_flags;
 
@@ -185,9 +194,10 @@
     uptr os_id = static_cast<uptr>(suspended_threads.GetThreadID(i));
     LOG_THREADS("Processing thread %d.\n", os_id);
     uptr stack_begin, stack_end, tls_begin, tls_end, cache_begin, cache_end;
+    DTLS *dtls;
     bool thread_found = GetThreadRangesLocked(os_id, &stack_begin, &stack_end,
                                               &tls_begin, &tls_end,
-                                              &cache_begin, &cache_end);
+                                              &cache_begin, &cache_end, &dtls);
     if (!thread_found) {
       // If a thread can't be found in the thread registry, it's probably in the
       // process of destruction. Log this event and move on.
@@ -211,9 +221,18 @@
       LOG_THREADS("Stack at %p-%p (SP = %p).\n", stack_begin, stack_end, sp);
       if (sp < stack_begin || sp >= stack_end) {
         // SP is outside the recorded stack range (e.g. the thread is running a
-        // signal handler on alternate stack). Again, consider the entire stack
-        // range to be reachable.
+        // signal handler on alternate stack, or swapcontext was used).
+        // Again, consider the entire stack range to be reachable.
         LOG_THREADS("WARNING: stack pointer not in stack range.\n");
+        uptr page_size = GetPageSizeCached();
+        int skipped = 0;
+        while (stack_begin < stack_end &&
+               !IsAccessibleMemoryRange(stack_begin, 1)) {
+          skipped++;
+          stack_begin += page_size;
+        }
+        LOG_THREADS("Skipped %d guard page(s) to obtain stack %p-%p.\n",
+                    skipped, stack_begin, stack_end);
       } else {
         // Shrink the stack range to ignore out-of-scope values.
         stack_begin = sp;
@@ -238,6 +257,17 @@
         if (tls_end > cache_end)
           ScanRangeForPointers(cache_end, tls_end, frontier, "TLS", kReachable);
       }
+      if (dtls) {
+        for (uptr j = 0; j < dtls->dtv_size; ++j) {
+          uptr dtls_beg = dtls->dtv[j].beg;
+          uptr dtls_end = dtls_beg + dtls->dtv[j].size;
+          if (dtls_beg < dtls_end) {
+            LOG_THREADS("DTLS %zu at %p-%p.\n", j, dtls_beg, dtls_end);
+            ScanRangeForPointers(dtls_beg, dtls_end, frontier, "DTLS",
+                                 kReachable);
+          }
+        }
+      }
     }
   }
 }
@@ -416,6 +446,9 @@
 
   if (!param.success) {
     Report("LeakSanitizer has encountered a fatal error.\n");
+    Report(
+        "HINT: For debugging, try setting environment variable "
+        "LSAN_OPTIONS=verbosity=1:log_threads=1\n");
     Die();
   }
   param.leak_report.ApplySuppressions();
@@ -617,6 +650,13 @@
 }
 
 } // namespace __lsan
+#else // CAN_SANITIZE_LEAKS
+namespace __lsan {
+void InitCommonLsan() { }
+void DoLeakCheck() { }
+void DisableInThisThread() { }
+void EnableInThisThread() { }
+}
 #endif // CAN_SANITIZE_LEAKS
 
 using namespace __lsan;  // NOLINT
@@ -682,18 +722,14 @@
 SANITIZER_INTERFACE_ATTRIBUTE
 void __lsan_disable() {
 #if CAN_SANITIZE_LEAKS
-  __lsan::disable_counter++;
+  __lsan::DisableInThisThread();
 #endif
 }
 
 SANITIZER_INTERFACE_ATTRIBUTE
 void __lsan_enable() {
 #if CAN_SANITIZE_LEAKS
-  if (!__lsan::disable_counter && common_flags()->detect_leaks) {
-    Report("Unmatched call to __lsan_enable().\n");
-    Die();
-  }
-  __lsan::disable_counter--;
+  __lsan::EnableInThisThread();
 #endif
 }
 
diff --git a/lib/lsan/lsan_common.h b/lib/lsan/lsan_common.h
index 0dfd0d4..890ce65 100644
--- a/lib/lsan/lsan_common.h
+++ b/lib/lsan/lsan_common.h
@@ -31,6 +31,7 @@
 
 namespace __sanitizer {
 class FlagParser;
+struct DTLS;
 }
 
 namespace __lsan {
@@ -118,6 +119,16 @@
 void DoLeakCheck();
 bool DisabledInThisThread();
 
+// Used to implement __lsan::ScopedDisabler.
+void DisableInThisThread();
+void EnableInThisThread();
+// Can be used to ignore memory allocated by an intercepted
+// function.
+struct ScopedInterceptorDisabler {
+  ScopedInterceptorDisabler() { DisableInThisThread(); }
+  ~ScopedInterceptorDisabler() { EnableInThisThread(); }
+};
+
 // Special case for "new T[0]" where T is a type with DTOR.
 // new T[0] will allocate one word for the array size (0) and store a pointer
 // to the end of allocated chunk.
@@ -141,8 +152,8 @@
 void LockThreadRegistry();
 void UnlockThreadRegistry();
 bool GetThreadRangesLocked(uptr os_id, uptr *stack_begin, uptr *stack_end,
-                           uptr *tls_begin, uptr *tls_end,
-                           uptr *cache_begin, uptr *cache_end);
+                           uptr *tls_begin, uptr *tls_end, uptr *cache_begin,
+                           uptr *cache_end, DTLS **dtls);
 void ForEachExtraStackRange(uptr os_id, RangeIteratorCallback callback,
                             void *arg);
 // If called from the main thread, updates the main thread's TID in the thread
diff --git a/lib/lsan/lsan_common_linux.cc b/lib/lsan/lsan_common_linux.cc
index 1dc0561..1f54303 100644
--- a/lib/lsan/lsan_common_linux.cc
+++ b/lib/lsan/lsan_common_linux.cc
@@ -26,9 +26,8 @@
 namespace __lsan {
 
 static const char kLinkerName[] = "ld";
-// We request 2 modules matching "ld", so we can print a warning if there's more
-// than one match. But only the first one is actually used.
-static char linker_placeholder[2 * sizeof(LoadedModule)] ALIGNED(64);
+
+static char linker_placeholder[sizeof(LoadedModule)] ALIGNED(64);
 static LoadedModule *linker = nullptr;
 
 static bool IsLinker(const char* full_name) {
@@ -36,20 +35,24 @@
 }
 
 void InitializePlatformSpecificModules() {
-  internal_memset(linker_placeholder, 0, sizeof(linker_placeholder));
-  uptr num_matches = GetListOfModules(
-      reinterpret_cast<LoadedModule *>(linker_placeholder), 2, IsLinker);
-  if (num_matches == 1) {
-    linker = reinterpret_cast<LoadedModule *>(linker_placeholder);
-    return;
+  ListOfModules modules;
+  modules.init();
+  for (LoadedModule &module : modules) {
+    if (!IsLinker(module.full_name())) continue;
+    if (linker == nullptr) {
+      linker = reinterpret_cast<LoadedModule *>(linker_placeholder);
+      *linker = module;
+      module = LoadedModule();
+    } else {
+      VReport(1, "LeakSanitizer: Multiple modules match \"%s\". "
+              "TLS will not be handled correctly.\n", kLinkerName);
+      linker->clear();
+      linker = nullptr;
+      return;
+    }
   }
-  if (num_matches == 0)
-    VReport(1, "LeakSanitizer: Dynamic linker not found. "
-            "TLS will not be handled correctly.\n");
-  else if (num_matches > 1)
-    VReport(1, "LeakSanitizer: Multiple modules match \"%s\". "
-            "TLS will not be handled correctly.\n", kLinkerName);
-  linker = nullptr;
+  VReport(1, "LeakSanitizer: Dynamic linker not found. "
+             "TLS will not be handled correctly.\n");
 }
 
 static int ProcessGlobalRegionsCallback(struct dl_phdr_info *info, size_t size,
@@ -100,6 +103,7 @@
 struct ProcessPlatformAllocParam {
   Frontier *frontier;
   StackDepotReverseMap *stack_depot_reverse_map;
+  bool skip_linker_allocations;
 };
 
 // ForEachChunk callback. Identifies unreachable chunks which must be treated as
@@ -117,7 +121,8 @@
       caller_pc = GetCallerPC(stack_id, param->stack_depot_reverse_map);
     // If caller_pc is unknown, this chunk may be allocated in a coroutine. Mark
     // it as reachable, as we can't properly report its allocation stack anyway.
-    if (caller_pc == 0 || linker->containsAddress(caller_pc)) {
+    if (caller_pc == 0 || (param->skip_linker_allocations &&
+                           linker->containsAddress(caller_pc))) {
       m.set_tag(kReachable);
       param->frontier->push_back(chunk);
     }
@@ -142,10 +147,12 @@
 // guaranteed to include all dynamic TLS blocks (and possibly other allocations
 // which we don't care about).
 void ProcessPlatformSpecificAllocations(Frontier *frontier) {
-  if (!flags()->use_tls) return;
-  if (!linker) return;
   StackDepotReverseMap stack_depot_reverse_map;
-  ProcessPlatformAllocParam arg = {frontier, &stack_depot_reverse_map};
+  ProcessPlatformAllocParam arg;
+  arg.frontier = frontier;
+  arg.stack_depot_reverse_map = &stack_depot_reverse_map;
+  arg.skip_linker_allocations =
+      flags()->use_tls && flags()->use_ld_allocations && linker != nullptr;
   ForEachChunk(ProcessPlatformSpecificAllocationsCb, &arg);
 }
 
diff --git a/lib/lsan/lsan_flags.inc b/lib/lsan/lsan_flags.inc
index c405005..e390e2a 100644
--- a/lib/lsan/lsan_flags.inc
+++ b/lib/lsan/lsan_flags.inc
@@ -34,6 +34,10 @@
           "Root set: include TLS and thread-specific storage")
 LSAN_FLAG(bool, use_root_regions, true,
           "Root set: include regions added via __lsan_register_root_region().")
+LSAN_FLAG(bool, use_ld_allocations, true,
+          "Root set: mark as reachable all allocations made from dynamic "
+          "linker. This was the old way to handle dynamic TLS, and will "
+          "be removed soon. Do not use this flag.")
 
 LSAN_FLAG(bool, use_unaligned, false, "Consider unaligned pointers valid.")
 LSAN_FLAG(bool, use_poisoned, false,
diff --git a/lib/lsan/lsan_interceptors.cc b/lib/lsan/lsan_interceptors.cc
index be0d0dd..28f1786 100644
--- a/lib/lsan/lsan_interceptors.cc
+++ b/lib/lsan/lsan_interceptors.cc
@@ -20,8 +20,10 @@
 #include "sanitizer_common/sanitizer_internal_defs.h"
 #include "sanitizer_common/sanitizer_linux.h"
 #include "sanitizer_common/sanitizer_platform_limits_posix.h"
+#include "sanitizer_common/sanitizer_tls_get_addr.h"
 #include "lsan.h"
 #include "lsan_allocator.h"
+#include "lsan_common.h"
 #include "lsan_thread.h"
 
 using namespace __lsan;
@@ -104,6 +106,14 @@
   return 0;
 }
 
+INTERCEPTOR(void *, __libc_memalign, uptr alignment, uptr size) {
+  ENSURE_LSAN_INITED;
+  GET_STACK_TRACE_MALLOC;
+  void *res = Allocate(stack, size, alignment, kAlwaysClearMemory);
+  DTLS_on_libc_memalign(res, size);
+  return res;
+}
+
 INTERCEPTOR(void*, valloc, uptr size) {
   ENSURE_LSAN_INITED;
   GET_STACK_TRACE_MALLOC;
@@ -174,11 +184,6 @@
   OPERATOR_DELETE_BODY;
 }
 
-// We need this to intercept the __libc_memalign calls that are used to
-// allocate dynamic TLS space in ld-linux.so.
-INTERCEPTOR(void *, __libc_memalign, uptr align, uptr s)
-    ALIAS(WRAPPER_NAME(memalign));
-
 ///// Thread initialization and finalization. /////
 
 static unsigned g_thread_finalize_key;
@@ -237,7 +242,15 @@
   p.callback = callback;
   p.param = param;
   atomic_store(&p.tid, 0, memory_order_relaxed);
-  int res = REAL(pthread_create)(th, attr, __lsan_thread_start_func, &p);
+  int res;
+  {
+    // Ignore all allocations made by pthread_create: thread stack/TLS may be
+    // stored by pthread for future reuse even after thread destruction, and
+    // the linked list it's stored in doesn't even hold valid pointers to the
+    // objects, the latter are calculated by obscure pointer arithmetic.
+    ScopedInterceptorDisabler disabler;
+    res = REAL(pthread_create)(th, attr, __lsan_thread_start_func, &p);
+  }
   if (res == 0) {
     int tid = ThreadCreate(GetCurrentThread(), *(uptr *)th, detached);
     CHECK_NE(tid, 0);
diff --git a/lib/lsan/lsan_thread.cc b/lib/lsan/lsan_thread.cc
index 10ac2c9..8bd6d90 100644
--- a/lib/lsan/lsan_thread.cc
+++ b/lib/lsan/lsan_thread.cc
@@ -17,6 +17,7 @@
 #include "sanitizer_common/sanitizer_common.h"
 #include "sanitizer_common/sanitizer_placement_new.h"
 #include "sanitizer_common/sanitizer_thread_registry.h"
+#include "sanitizer_common/sanitizer_tls_get_addr.h"
 #include "lsan_allocator.h"
 
 namespace __lsan {
@@ -49,18 +50,20 @@
 }
 
 ThreadContext::ThreadContext(int tid)
-  : ThreadContextBase(tid),
-    stack_begin_(0),
-    stack_end_(0),
-    cache_begin_(0),
-    cache_end_(0),
-    tls_begin_(0),
-    tls_end_(0) {}
+    : ThreadContextBase(tid),
+      stack_begin_(0),
+      stack_end_(0),
+      cache_begin_(0),
+      cache_end_(0),
+      tls_begin_(0),
+      tls_end_(0),
+      dtls_(nullptr) {}
 
 struct OnStartedArgs {
   uptr stack_begin, stack_end,
        cache_begin, cache_end,
        tls_begin, tls_end;
+  DTLS *dtls;
 };
 
 void ThreadContext::OnStarted(void *arg) {
@@ -71,10 +74,12 @@
   tls_end_ = args->tls_end;
   cache_begin_ = args->cache_begin;
   cache_end_ = args->cache_end;
+  dtls_ = args->dtls;
 }
 
 void ThreadContext::OnFinished() {
   AllocatorThreadFinish();
+  DTLS_Destroy();
 }
 
 u32 ThreadCreate(u32 parent_tid, uptr user_id, bool detached) {
@@ -91,6 +96,7 @@
   args.stack_end = args.stack_begin + stack_size;
   args.tls_end = args.tls_begin + tls_size;
   GetAllocatorCacheRange(&args.cache_begin, &args.cache_end);
+  args.dtls = DTLS_Get();
   thread_registry->StartThread(tid, os_id, &args);
 }
 
@@ -131,8 +137,8 @@
 ///// Interface to the common LSan module. /////
 
 bool GetThreadRangesLocked(uptr os_id, uptr *stack_begin, uptr *stack_end,
-                           uptr *tls_begin, uptr *tls_end,
-                           uptr *cache_begin, uptr *cache_end) {
+                           uptr *tls_begin, uptr *tls_end, uptr *cache_begin,
+                           uptr *cache_end, DTLS **dtls) {
   ThreadContext *context = static_cast<ThreadContext *>(
       thread_registry->FindThreadContextByOsIDLocked(os_id));
   if (!context) return false;
@@ -142,6 +148,7 @@
   *tls_end = context->tls_end();
   *cache_begin = context->cache_begin();
   *cache_end = context->cache_end();
+  *dtls = context->dtls();
   return true;
 }
 
diff --git a/lib/lsan/lsan_thread.h b/lib/lsan/lsan_thread.h
index 99e2c1d..10b7b57 100644
--- a/lib/lsan/lsan_thread.h
+++ b/lib/lsan/lsan_thread.h
@@ -17,6 +17,10 @@
 
 #include "sanitizer_common/sanitizer_thread_registry.h"
 
+namespace __sanitizer {
+struct DTLS;
+}
+
 namespace __lsan {
 
 class ThreadContext : public ThreadContextBase {
@@ -30,10 +34,13 @@
   uptr tls_end() { return tls_end_; }
   uptr cache_begin() { return cache_begin_; }
   uptr cache_end() { return cache_end_; }
+  DTLS *dtls() { return dtls_; }
+
  private:
   uptr stack_begin_, stack_end_,
        cache_begin_, cache_end_,
        tls_begin_, tls_end_;
+  DTLS *dtls_;
 };
 
 void InitializeThreadRegistry();
diff --git a/lib/msan/CMakeLists.txt b/lib/msan/CMakeLists.txt
index 1b48def..e7f2877 100644
--- a/lib/msan/CMakeLists.txt
+++ b/lib/msan/CMakeLists.txt
@@ -17,7 +17,7 @@
 
 
 set(MSAN_RTL_CFLAGS ${SANITIZER_COMMON_CFLAGS})
-append_no_rtti_flag(MSAN_RTL_CFLAGS)
+append_rtti_flag(OFF MSAN_RTL_CFLAGS)
 append_list_if(COMPILER_RT_HAS_FPIE_FLAG -fPIE MSAN_RTL_CFLAGS)
 # Prevent clang from generating libc calls.
 append_list_if(COMPILER_RT_HAS_FFREESTANDING_FLAG -ffreestanding MSAN_RTL_CFLAGS)
@@ -26,6 +26,8 @@
 
 # Static runtime library.
 add_custom_target(msan)
+set_target_properties(msan PROPERTIES FOLDER "Compiler-RT Misc")
+
 foreach(arch ${MSAN_SUPPORTED_ARCH})
   add_compiler_rt_runtime(clang_rt.msan
     STATIC
@@ -58,8 +60,7 @@
   endif()
 endforeach()
 
-add_compiler_rt_resource_file(msan_blacklist msan_blacklist.txt)
-add_dependencies(msan msan_blacklist)
+add_compiler_rt_resource_file(msan_blacklist msan_blacklist.txt msan)
 add_dependencies(compiler-rt msan)
 
 if(COMPILER_RT_INCLUDE_TESTS)
diff --git a/lib/msan/msan.cc b/lib/msan/msan.cc
index 9949db4..d2981f0 100644
--- a/lib/msan/msan.cc
+++ b/lib/msan/msan.cc
@@ -178,7 +178,7 @@
 #endif
   VPrintf(1, "MSAN_OPTIONS: %s\n", msan_options ? msan_options : "<empty>");
 
-  SetVerbosity(common_flags()->verbosity);
+  InitializeCommonFlags();
 
   if (Verbosity()) ReportUnrecognizedFlags();
 
@@ -375,6 +375,7 @@
   msan_init_is_running = 1;
   SanitizerToolName = "MemorySanitizer";
 
+  AvoidCVE_2016_2143();
   InitTlsSize();
 
   CacheBinaryName();
@@ -462,13 +463,8 @@
   }
 
   unsigned char *s = (unsigned char*)MEM_TO_SHADOW(x);
-  for (uptr i = 0; i < size; i++) {
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-    Printf("%x%x ", s[i] & 0xf, s[i] >> 4);
-#else
+  for (uptr i = 0; i < size; i++)
     Printf("%x%x ", s[i] >> 4, s[i] & 0xf);
-#endif
-  }
   Printf("\n");
 }
 
@@ -542,6 +538,13 @@
     u32 idx = atomic_fetch_add(&NumStackOriginDescrs, 1, memory_order_relaxed);
     CHECK_LT(idx, kNumStackOriginDescrs);
     StackOriginDescr[idx] = descr + 4;
+#if SANITIZER_PPC64V1
+    // On PowerPC64 ELFv1, the address of a function actually points to a
+    // three-doubleword data structure with the first field containing
+    // the address of the function's code.
+    if (pc)
+      pc = *reinterpret_cast<uptr*>(pc);
+#endif
     StackOriginPC[idx] = pc;
     id = Origin::CreateStackOrigin(idx).raw_id();
     *id_ptr = id;
@@ -580,13 +583,13 @@
 }
 
 u16 __sanitizer_unaligned_load16(const uu16 *p) {
-  __msan_retval_tls[0] = *(uu16 *)MEM_TO_SHADOW((uptr)p);
+  *(uu16 *)&__msan_retval_tls[0] = *(uu16 *)MEM_TO_SHADOW((uptr)p);
   if (__msan_get_track_origins())
     __msan_retval_origin_tls = GetOriginIfPoisoned((uptr)p, sizeof(*p));
   return *p;
 }
 u32 __sanitizer_unaligned_load32(const uu32 *p) {
-  __msan_retval_tls[0] = *(uu32 *)MEM_TO_SHADOW((uptr)p);
+  *(uu32 *)&__msan_retval_tls[0] = *(uu32 *)MEM_TO_SHADOW((uptr)p);
   if (__msan_get_track_origins())
     __msan_retval_origin_tls = GetOriginIfPoisoned((uptr)p, sizeof(*p));
   return *p;
@@ -598,7 +601,7 @@
   return *p;
 }
 void __sanitizer_unaligned_store16(uu16 *p, u16 x) {
-  u16 s = __msan_param_tls[1];
+  u16 s = *(uu16 *)&__msan_param_tls[1];
   *(uu16 *)MEM_TO_SHADOW((uptr)p) = s;
   if (s && __msan_get_track_origins())
     if (uu32 o = __msan_param_origin_tls[2])
@@ -606,7 +609,7 @@
   *p = x;
 }
 void __sanitizer_unaligned_store32(uu32 *p, u32 x) {
-  u32 s = __msan_param_tls[1];
+  u32 s = *(uu32 *)&__msan_param_tls[1];
   *(uu32 *)MEM_TO_SHADOW((uptr)p) = s;
   if (s && __msan_get_track_origins())
     if (uu32 o = __msan_param_origin_tls[2])
diff --git a/lib/msan/msan.h b/lib/msan/msan.h
index 2079a59..1f2ff59 100644
--- a/lib/msan/msan.h
+++ b/lib/msan/msan.h
@@ -107,7 +107,7 @@
 # define MEM_TO_SHADOW(mem) ((uptr)mem ^ 0x6000000000ULL)
 # define SHADOW_TO_ORIGIN(shadow) (((uptr)(shadow)) + 0x1000000000ULL)
 
-#elif SANITIZER_LINUX && defined(__powerpc64__)
+#elif SANITIZER_LINUX && SANITIZER_PPC64
 
 const MappingDesc kMemoryLayout[] = {
     {0x000000000000ULL, 0x000100000000ULL, MappingDesc::APP, "low memory"},
@@ -309,9 +309,21 @@
 
 }  // namespace __msan
 
-#define MSAN_MALLOC_HOOK(ptr, size) \
-  if (&__sanitizer_malloc_hook) __sanitizer_malloc_hook(ptr, size)
-#define MSAN_FREE_HOOK(ptr) \
-  if (&__sanitizer_free_hook) __sanitizer_free_hook(ptr)
+#define MSAN_MALLOC_HOOK(ptr, size)       \
+  do {                                    \
+    if (&__sanitizer_malloc_hook) {       \
+      UnpoisonParam(2);                   \
+      __sanitizer_malloc_hook(ptr, size); \
+    }                                     \
+    RunMallocHooks(ptr, size);            \
+  } while (false)
+#define MSAN_FREE_HOOK(ptr)       \
+  do {                            \
+    if (&__sanitizer_free_hook) { \
+      UnpoisonParam(1);           \
+      __sanitizer_free_hook(ptr); \
+    }                             \
+    RunFreeHooks(ptr);            \
+  } while (false)
 
 #endif  // MSAN_H
diff --git a/lib/msan/msan_interceptors.cc b/lib/msan/msan_interceptors.cc
index fc28e08..f23d3ee 100644
--- a/lib/msan/msan_interceptors.cc
+++ b/lib/msan/msan_interceptors.cc
@@ -43,6 +43,9 @@
 using __sanitizer::atomic_store;
 using __sanitizer::atomic_uintptr_t;
 
+DECLARE_REAL(SIZE_T, strlen, const char *s)
+DECLARE_REAL(SIZE_T, strnlen, const char *s, SIZE_T maxlen)
+
 #if SANITIZER_FREEBSD
 #define __errno_location __error
 #endif
@@ -195,7 +198,7 @@
   GET_MALLOC_STACK_TRACE;
   CHECK_EQ(boundary & (boundary - 1), 0);
   void *ptr = MsanReallocate(&stack, nullptr, size, boundary, false);
-  DTLS_on_libc_memalign(ptr, size * boundary);
+  DTLS_on_libc_memalign(ptr, size);
   return ptr;
 }
 
@@ -280,23 +283,6 @@
 #define MSAN_MAYBE_INTERCEPT_MALLOC_STATS
 #endif
 
-INTERCEPTOR(SIZE_T, strlen, const char *s) {
-  if (msan_init_is_running)
-    return REAL(strlen)(s);
-  ENSURE_MSAN_INITED();
-  SIZE_T res = REAL(strlen)(s);
-  CHECK_UNPOISONED(s, res + 1);
-  return res;
-}
-
-INTERCEPTOR(SIZE_T, strnlen, const char *s, SIZE_T n) {
-  ENSURE_MSAN_INITED();
-  SIZE_T res = REAL(strnlen)(s, n);
-  SIZE_T scan_size = (res == n) ? res : res + 1;
-  CHECK_UNPOISONED(s, scan_size);
-  return res;
-}
-
 INTERCEPTOR(char *, strcpy, char *dest, const char *src) {  // NOLINT
   ENSURE_MSAN_INITED();
   GET_STORE_STACK_TRACE;
@@ -756,65 +742,6 @@
 #define MSAN_MAYBE_INTERCEPT___FXSTATAT64
 #endif
 
-#if SANITIZER_FREEBSD
-INTERCEPTOR(int, stat, char *path, void *buf) {
-  ENSURE_MSAN_INITED();
-  int res = REAL(stat)(path, buf);
-  if (!res)
-    __msan_unpoison(buf, __sanitizer::struct_stat_sz);
-  return res;
-}
-# define MSAN_INTERCEPT_STAT INTERCEPT_FUNCTION(stat)
-#else
-INTERCEPTOR(int, __xstat, int magic, char *path, void *buf) {
-  ENSURE_MSAN_INITED();
-  int res = REAL(__xstat)(magic, path, buf);
-  if (!res)
-    __msan_unpoison(buf, __sanitizer::struct_stat_sz);
-  return res;
-}
-# define MSAN_INTERCEPT_STAT INTERCEPT_FUNCTION(__xstat)
-#endif
-
-#if !SANITIZER_FREEBSD
-INTERCEPTOR(int, __xstat64, int magic, char *path, void *buf) {
-  ENSURE_MSAN_INITED();
-  int res = REAL(__xstat64)(magic, path, buf);
-  if (!res)
-    __msan_unpoison(buf, __sanitizer::struct_stat64_sz);
-  return res;
-}
-#define MSAN_MAYBE_INTERCEPT___XSTAT64 INTERCEPT_FUNCTION(__xstat64)
-#else
-#define MSAN_MAYBE_INTERCEPT___XSTAT64
-#endif
-
-#if !SANITIZER_FREEBSD
-INTERCEPTOR(int, __lxstat, int magic, char *path, void *buf) {
-  ENSURE_MSAN_INITED();
-  int res = REAL(__lxstat)(magic, path, buf);
-  if (!res)
-    __msan_unpoison(buf, __sanitizer::struct_stat_sz);
-  return res;
-}
-#define MSAN_MAYBE_INTERCEPT___LXSTAT INTERCEPT_FUNCTION(__lxstat)
-#else
-#define MSAN_MAYBE_INTERCEPT___LXSTAT
-#endif
-
-#if !SANITIZER_FREEBSD
-INTERCEPTOR(int, __lxstat64, int magic, char *path, void *buf) {
-  ENSURE_MSAN_INITED();
-  int res = REAL(__lxstat64)(magic, path, buf);
-  if (!res)
-    __msan_unpoison(buf, __sanitizer::struct_stat64_sz);
-  return res;
-}
-#define MSAN_MAYBE_INTERCEPT___LXSTAT64 INTERCEPT_FUNCTION(__lxstat64)
-#else
-#define MSAN_MAYBE_INTERCEPT___LXSTAT64
-#endif
-
 INTERCEPTOR(int, pipe, int pipefd[2]) {
   if (msan_init_is_running)
     return REAL(pipe)(pipefd);
@@ -874,17 +801,42 @@
 
 #if !SANITIZER_FREEBSD
 INTERCEPTOR(int, getrlimit64, int resource, void *rlim) {
-  if (msan_init_is_running)
-    return REAL(getrlimit64)(resource, rlim);
+  if (msan_init_is_running) return REAL(getrlimit64)(resource, rlim);
   ENSURE_MSAN_INITED();
   int res = REAL(getrlimit64)(resource, rlim);
-  if (!res)
-    __msan_unpoison(rlim, __sanitizer::struct_rlimit64_sz);
+  if (!res) __msan_unpoison(rlim, __sanitizer::struct_rlimit64_sz);
   return res;
 }
+
+INTERCEPTOR(int, prlimit, int pid, int resource, void *new_rlimit,
+            void *old_rlimit) {
+  if (msan_init_is_running)
+    return REAL(prlimit)(pid, resource, new_rlimit, old_rlimit);
+  ENSURE_MSAN_INITED();
+  CHECK_UNPOISONED(new_rlimit, __sanitizer::struct_rlimit_sz);
+  int res = REAL(prlimit)(pid, resource, new_rlimit, old_rlimit);
+  if (!res) __msan_unpoison(old_rlimit, __sanitizer::struct_rlimit_sz);
+  return res;
+}
+
+INTERCEPTOR(int, prlimit64, int pid, int resource, void *new_rlimit,
+            void *old_rlimit) {
+  if (msan_init_is_running)
+    return REAL(prlimit64)(pid, resource, new_rlimit, old_rlimit);
+  ENSURE_MSAN_INITED();
+  CHECK_UNPOISONED(new_rlimit, __sanitizer::struct_rlimit64_sz);
+  int res = REAL(prlimit64)(pid, resource, new_rlimit, old_rlimit);
+  if (!res) __msan_unpoison(old_rlimit, __sanitizer::struct_rlimit64_sz);
+  return res;
+}
+
 #define MSAN_MAYBE_INTERCEPT_GETRLIMIT64 INTERCEPT_FUNCTION(getrlimit64)
+#define MSAN_MAYBE_INTERCEPT_PRLIMIT INTERCEPT_FUNCTION(prlimit)
+#define MSAN_MAYBE_INTERCEPT_PRLIMIT64 INTERCEPT_FUNCTION(prlimit64)
 #else
 #define MSAN_MAYBE_INTERCEPT_GETRLIMIT64
+#define MSAN_MAYBE_INTERCEPT_PRLIMIT
+#define MSAN_MAYBE_INTERCEPT_PRLIMIT64
 #endif
 
 #if SANITIZER_FREEBSD
@@ -953,30 +905,6 @@
 #define MSAN_MAYBE_INTERCEPT_EPOLL_PWAIT
 #endif
 
-INTERCEPTOR(SSIZE_T, recv, int fd, void *buf, SIZE_T len, int flags) {
-  ENSURE_MSAN_INITED();
-  SSIZE_T res = REAL(recv)(fd, buf, len, flags);
-  if (res > 0)
-    __msan_unpoison(buf, res);
-  return res;
-}
-
-INTERCEPTOR(SSIZE_T, recvfrom, int fd, void *buf, SIZE_T len, int flags,
-            void *srcaddr, int *addrlen) {
-  ENSURE_MSAN_INITED();
-  SIZE_T srcaddr_sz;
-  if (srcaddr) srcaddr_sz = *addrlen;
-  SSIZE_T res = REAL(recvfrom)(fd, buf, len, flags, srcaddr, addrlen);
-  if (res > 0) {
-    __msan_unpoison(buf, res);
-    if (srcaddr) {
-      SIZE_T sz = *addrlen;
-      __msan_unpoison(srcaddr, Min(sz, srcaddr_sz));
-    }
-  }
-  return res;
-}
-
 INTERCEPTOR(void *, calloc, SIZE_T nmemb, SIZE_T size) {
   GET_MALLOC_STACK_TRACE;
   if (UNLIKELY(!msan_inited)) {
@@ -1065,63 +993,6 @@
 #define MSAN_MAYBE_INTERCEPT_MMAP64
 #endif
 
-struct dlinfo {
-  char *dli_fname;
-  void *dli_fbase;
-  char *dli_sname;
-  void *dli_saddr;
-};
-
-INTERCEPTOR(int, dladdr, void *addr, dlinfo *info) {
-  ENSURE_MSAN_INITED();
-  int res = REAL(dladdr)(addr, info);
-  if (res != 0) {
-    __msan_unpoison(info, sizeof(*info));
-    if (info->dli_fname)
-      __msan_unpoison(info->dli_fname, REAL(strlen)(info->dli_fname) + 1);
-    if (info->dli_sname)
-      __msan_unpoison(info->dli_sname, REAL(strlen)(info->dli_sname) + 1);
-  }
-  return res;
-}
-
-INTERCEPTOR(char *, dlerror, int fake) {
-  ENSURE_MSAN_INITED();
-  char *res = REAL(dlerror)(fake);
-  if (res) __msan_unpoison(res, REAL(strlen)(res) + 1);
-  return res;
-}
-
-typedef int (*dl_iterate_phdr_cb)(__sanitizer_dl_phdr_info *info, SIZE_T size,
-                                  void *data);
-struct dl_iterate_phdr_data {
-  dl_iterate_phdr_cb callback;
-  void *data;
-};
-
-static int msan_dl_iterate_phdr_cb(__sanitizer_dl_phdr_info *info, SIZE_T size,
-                                   void *data) {
-  if (info) {
-    __msan_unpoison(info, size);
-    if (info->dlpi_phdr && info->dlpi_phnum)
-      __msan_unpoison(info->dlpi_phdr, struct_ElfW_Phdr_sz * info->dlpi_phnum);
-    if (info->dlpi_name)
-      __msan_unpoison(info->dlpi_name, REAL(strlen)(info->dlpi_name) + 1);
-  }
-  dl_iterate_phdr_data *cbdata = (dl_iterate_phdr_data *)data;
-  UnpoisonParam(3);
-  return cbdata->callback(info, size, cbdata->data);
-}
-
-INTERCEPTOR(int, dl_iterate_phdr, dl_iterate_phdr_cb callback, void *data) {
-  ENSURE_MSAN_INITED();
-  dl_iterate_phdr_data cbdata;
-  cbdata.callback = callback;
-  cbdata.data = data;
-  int res = REAL(dl_iterate_phdr)(msan_dl_iterate_phdr_cb, (void *)&cbdata);
-  return res;
-}
-
 INTERCEPTOR(int, getrusage, int who, void *usage) {
   ENSURE_MSAN_INITED();
   int res = REAL(getrusage)(who, usage);
@@ -1397,7 +1268,16 @@
       VReport(1, "MemorySanitizer: failed to intercept '" #name "'\n"); \
   } while (0)
 
+#define MSAN_INTERCEPT_FUNC_VER(name, ver)                                    \
+  do {                                                                        \
+    if ((!INTERCEPT_FUNCTION_VER(name, ver) || !REAL(name)))                  \
+      VReport(                                                                \
+          1, "MemorySanitizer: failed to intercept '" #name "@@" #ver "'\n"); \
+  } while (0)
+
 #define COMMON_INTERCEPT_FUNCTION(name) MSAN_INTERCEPT_FUNC(name)
+#define COMMON_INTERCEPT_FUNCTION_VER(name, ver)                          \
+  MSAN_INTERCEPT_FUNC_VER(name, ver)
 #define COMMON_INTERCEPTOR_UNPOISON_PARAM(count)  \
   UnpoisonParam(count)
 #define COMMON_INTERCEPTOR_WRITE_RANGE(ctx, ptr, size) \
@@ -1408,12 +1288,12 @@
   __msan_unpoison(ptr, size)
 #define COMMON_INTERCEPTOR_ENTER(ctx, func, ...)                  \
   if (msan_init_is_running) return REAL(func)(__VA_ARGS__);       \
+  ENSURE_MSAN_INITED();                                           \
   MSanInterceptorContext msan_ctx = {IsInInterceptorScope()};     \
   ctx = (void *)&msan_ctx;                                        \
   (void)ctx;                                                      \
   InterceptorScope interceptor_scope;                             \
-  __msan_unpoison(__errno_location(), sizeof(int)); /* NOLINT */  \
-  ENSURE_MSAN_INITED();
+  __msan_unpoison(__errno_location(), sizeof(int)); /* NOLINT */
 #define COMMON_INTERCEPTOR_DIR_ACQUIRE(ctx, path) \
   do {                                            \
   } while (false)
@@ -1449,6 +1329,11 @@
     *begin = *end = 0;                                                         \
   }
 
+#include "sanitizer_common/sanitizer_platform_interceptors.h"
+// Msan needs custom handling of these:
+#undef SANITIZER_INTERCEPT_MEMSET
+#undef SANITIZER_INTERCEPT_MEMMOVE
+#undef SANITIZER_INTERCEPT_MEMCPY
 #include "sanitizer_common/sanitizer_common_interceptors.inc"
 
 #define COMMON_SYSCALL_PRE_READ_RANGE(p, s) CHECK_UNPOISONED(p, s)
@@ -1461,6 +1346,66 @@
 #define COMMON_SYSCALL_POST_WRITE_RANGE(p, s) __msan_unpoison(p, s)
 #include "sanitizer_common/sanitizer_common_syscalls.inc"
 
+struct dlinfo {
+  char *dli_fname;
+  void *dli_fbase;
+  char *dli_sname;
+  void *dli_saddr;
+};
+
+INTERCEPTOR(int, dladdr, void *addr, dlinfo *info) {
+  void *ctx;
+  COMMON_INTERCEPTOR_ENTER(ctx, dladdr, addr, info);
+  int res = REAL(dladdr)(addr, info);
+  if (res != 0) {
+    __msan_unpoison(info, sizeof(*info));
+    if (info->dli_fname)
+      __msan_unpoison(info->dli_fname, REAL(strlen)(info->dli_fname) + 1);
+    if (info->dli_sname)
+      __msan_unpoison(info->dli_sname, REAL(strlen)(info->dli_sname) + 1);
+  }
+  return res;
+}
+
+INTERCEPTOR(char *, dlerror, int fake) {
+  void *ctx;
+  COMMON_INTERCEPTOR_ENTER(ctx, dlerror, fake);
+  char *res = REAL(dlerror)(fake);
+  if (res) __msan_unpoison(res, REAL(strlen)(res) + 1);
+  return res;
+}
+
+typedef int (*dl_iterate_phdr_cb)(__sanitizer_dl_phdr_info *info, SIZE_T size,
+                                  void *data);
+struct dl_iterate_phdr_data {
+  dl_iterate_phdr_cb callback;
+  void *data;
+};
+
+static int msan_dl_iterate_phdr_cb(__sanitizer_dl_phdr_info *info, SIZE_T size,
+                                   void *data) {
+  if (info) {
+    __msan_unpoison(info, size);
+    if (info->dlpi_phdr && info->dlpi_phnum)
+      __msan_unpoison(info->dlpi_phdr, struct_ElfW_Phdr_sz * info->dlpi_phnum);
+    if (info->dlpi_name)
+      __msan_unpoison(info->dlpi_name, REAL(strlen)(info->dlpi_name) + 1);
+  }
+  dl_iterate_phdr_data *cbdata = (dl_iterate_phdr_data *)data;
+  UnpoisonParam(3);
+  return cbdata->callback(info, size, cbdata->data);
+}
+
+INTERCEPTOR(int, dl_iterate_phdr, dl_iterate_phdr_cb callback, void *data) {
+  void *ctx;
+  COMMON_INTERCEPTOR_ENTER(ctx, dl_iterate_phdr, callback, data);
+  dl_iterate_phdr_data cbdata;
+  cbdata.callback = callback;
+  cbdata.data = data;
+  int res = REAL(dl_iterate_phdr)(msan_dl_iterate_phdr_cb, (void *)&cbdata);
+  return res;
+}
+
 // These interface functions reside here so that they can use
 // REAL(memset), etc.
 void __msan_unpoison(const void *a, uptr size) {
@@ -1561,8 +1506,6 @@
   INTERCEPT_FUNCTION(strndup);
   MSAN_MAYBE_INTERCEPT___STRNDUP;
   INTERCEPT_FUNCTION(strncpy);  // NOLINT
-  INTERCEPT_FUNCTION(strlen);
-  INTERCEPT_FUNCTION(strnlen);
   INTERCEPT_FUNCTION(gcvt);
   INTERCEPT_FUNCTION(strcat);  // NOLINT
   INTERCEPT_FUNCTION(strncat);  // NOLINT
@@ -1580,8 +1523,13 @@
   INTERCEPT_STRTO(wcstoul);
   INTERCEPT_STRTO(wcstoll);
   INTERCEPT_STRTO(wcstoull);
+#ifdef SANITIZER_NLDBL_VERSION
+  INTERCEPT_FUNCTION_VER(vswprintf, SANITIZER_NLDBL_VERSION);
+  INTERCEPT_FUNCTION_VER(swprintf, SANITIZER_NLDBL_VERSION);
+#else
   INTERCEPT_FUNCTION(vswprintf);
   INTERCEPT_FUNCTION(swprintf);
+#endif
   INTERCEPT_FUNCTION(strxfrm);
   INTERCEPT_FUNCTION(strxfrm_l);
   INTERCEPT_FUNCTION(strftime);
@@ -1603,12 +1551,8 @@
   INTERCEPT_FUNCTION(fcvt);
   MSAN_MAYBE_INTERCEPT___FXSTAT;
   MSAN_INTERCEPT_FSTATAT;
-  MSAN_INTERCEPT_STAT;
-  MSAN_MAYBE_INTERCEPT___LXSTAT;
   MSAN_MAYBE_INTERCEPT___FXSTAT64;
   MSAN_MAYBE_INTERCEPT___FXSTATAT64;
-  MSAN_MAYBE_INTERCEPT___XSTAT64;
-  MSAN_MAYBE_INTERCEPT___LXSTAT64;
   INTERCEPT_FUNCTION(pipe);
   INTERCEPT_FUNCTION(pipe2);
   INTERCEPT_FUNCTION(socketpair);
@@ -1616,19 +1560,23 @@
   MSAN_MAYBE_INTERCEPT_FGETS_UNLOCKED;
   INTERCEPT_FUNCTION(getrlimit);
   MSAN_MAYBE_INTERCEPT_GETRLIMIT64;
+  MSAN_MAYBE_INTERCEPT_PRLIMIT;
+  MSAN_MAYBE_INTERCEPT_PRLIMIT64;
   MSAN_INTERCEPT_UNAME;
   INTERCEPT_FUNCTION(gethostname);
   MSAN_MAYBE_INTERCEPT_EPOLL_WAIT;
   MSAN_MAYBE_INTERCEPT_EPOLL_PWAIT;
-  INTERCEPT_FUNCTION(recv);
-  INTERCEPT_FUNCTION(recvfrom);
   INTERCEPT_FUNCTION(dladdr);
   INTERCEPT_FUNCTION(dlerror);
   INTERCEPT_FUNCTION(dl_iterate_phdr);
   INTERCEPT_FUNCTION(getrusage);
   INTERCEPT_FUNCTION(sigaction);
   INTERCEPT_FUNCTION(signal);
+#if defined(__mips__)
+  INTERCEPT_FUNCTION_VER(pthread_create, "GLIBC_2.2");
+#else
   INTERCEPT_FUNCTION(pthread_create);
+#endif
   INTERCEPT_FUNCTION(pthread_key_create);
   INTERCEPT_FUNCTION(pthread_join);
   INTERCEPT_FUNCTION(tzset);
diff --git a/lib/msan/msan_linux.cc b/lib/msan/msan_linux.cc
index ab3be91..d6a9588 100644
--- a/lib/msan/msan_linux.cc
+++ b/lib/msan/msan_linux.cc
@@ -55,14 +55,14 @@
 
 static bool ProtectMemoryRange(uptr beg, uptr size, const char *name) {
   if (size > 0) {
-    void *addr = MmapNoAccess(beg, size, name);
+    void *addr = MmapFixedNoAccess(beg, size, name);
     if (beg == 0 && addr) {
       // Depending on the kernel configuration, we may not be able to protect
       // the page at address zero.
       uptr gap = 16 * GetPageSizeCached();
       beg += gap;
       size -= gap;
-      addr = MmapNoAccess(beg, size, name);
+      addr = MmapFixedNoAccess(beg, size, name);
     }
     if ((uptr)addr != beg) {
       uptr end = beg + size - 1;
diff --git a/lib/msan/msan_report.cc b/lib/msan/msan_report.cc
index ddb8070..9a35c9c 100644
--- a/lib/msan/msan_report.cc
+++ b/lib/msan/msan_report.cc
@@ -221,11 +221,7 @@
     } else {
       unsigned char v = *(unsigned char *)s;
       if (v) last_quad_poisoned = true;
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-      Printf("%x%x", v & 0xf, v >> 4);
-#else
       Printf("%x%x", v >> 4, v & 0xf);
-#endif
     }
     // Group end.
     if (pos % 4 == 3 && with_origins) {
diff --git a/lib/msan/tests/CMakeLists.txt b/lib/msan/tests/CMakeLists.txt
index 087b1af..130a872 100644
--- a/lib/msan/tests/CMakeLists.txt
+++ b/lib/msan/tests/CMakeLists.txt
@@ -20,7 +20,7 @@
 set(MSAN_UNITTEST_COMMON_CFLAGS
   -nostdinc++
   -isystem ${COMPILER_RT_LIBCXX_PATH}/include
-  ${COMPILER_RT_TEST_CFLAGS}
+  ${COMPILER_RT_UNITTEST_CFLAGS}
   ${COMPILER_RT_GTEST_CFLAGS}
   -I${COMPILER_RT_SOURCE_DIR}/include
   -I${COMPILER_RT_SOURCE_DIR}/lib
diff --git a/lib/msan/tests/msan_test.cc b/lib/msan/tests/msan_test.cc
index b7162b3..e4076b5 100644
--- a/lib/msan/tests/msan_test.cc
+++ b/lib/msan/tests/msan_test.cc
@@ -115,7 +115,10 @@
 # define SUPERUSER_GROUP "root"
 #endif
 
-const size_t kPageSize = 4096;
+static uintptr_t GetPageSize() {
+  return sysconf(_SC_PAGESIZE);
+}
+
 const size_t kMaxPathLength = 4096;
 
 typedef unsigned char      U1;
@@ -1117,8 +1120,8 @@
   struct hostent he;
   struct hostent *result;
   int err;
-  int res = gethostbyname_r("localhost", &he, buf, sizeof(buf), &result, &err);
-  ASSERT_EQ(ERANGE, res);
+  gethostbyname_r("localhost", &he, buf, sizeof(buf), &result, &err);
+  ASSERT_EQ(ERANGE, errno);
   EXPECT_NOT_POISONED(err);
 }
 
@@ -1214,17 +1217,21 @@
 }
 
 TEST(MemorySanitizer, shmat) {
-  void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
-                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-  ASSERT_NE(MAP_FAILED, p);
+  const int kShmSize = 4096;
+  void *mapping_start = mmap(NULL, kShmSize + SHMLBA, PROT_READ | PROT_WRITE,
+                             MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+  ASSERT_NE(MAP_FAILED, mapping_start);
+
+  void *p = (void *)(((unsigned long)mapping_start + SHMLBA - 1) / SHMLBA * SHMLBA);
+  // p is now SHMLBA-aligned;
 
   ((char *)p)[10] = *GetPoisoned<U1>();
-  ((char *)p)[4095] = *GetPoisoned<U1>();
+  ((char *)p)[kShmSize - 1] = *GetPoisoned<U1>();
 
-  int res = munmap(p, 4096);
+  int res = munmap(mapping_start, kShmSize + SHMLBA);
   ASSERT_EQ(0, res);
 
-  int id = shmget(IPC_PRIVATE, 4096, 0644 | IPC_CREAT);
+  int id = shmget(IPC_PRIVATE, kShmSize, 0644 | IPC_CREAT);
   ASSERT_GT(id, -1);
 
   void *q = shmat(id, p, 0);
@@ -1232,7 +1239,7 @@
 
   EXPECT_NOT_POISONED(((char *)q)[0]);
   EXPECT_NOT_POISONED(((char *)q)[10]);
-  EXPECT_NOT_POISONED(((char *)q)[4095]);
+  EXPECT_NOT_POISONED(((char *)q)[kShmSize - 1]);
 
   res = shmdt(q);
   ASSERT_EQ(0, res);
@@ -2389,13 +2396,19 @@
 
 TEST(MemorySanitizer, ptrtoint) {
   // Test that shadow is propagated through pointer-to-integer conversion.
-  void* p = (void*)0xABCD;
-  __msan_poison(((char*)&p) + 1, sizeof(p));
-  EXPECT_NOT_POISONED((((uintptr_t)p) & 0xFF) == 0);
+  unsigned char c = 0;
+  __msan_poison(&c, 1);
+  uintptr_t u = (uintptr_t)c << 8;
+  EXPECT_NOT_POISONED(u & 0xFF00FF);
+  EXPECT_POISONED(u & 0xFF00);
 
-  void* q = (void*)0xABCD;
-  __msan_poison(&q, sizeof(q) - 1);
-  EXPECT_POISONED((((uintptr_t)q) & 0xFF) == 0);
+  break_optimization(&u);
+  void* p = (void*)u;
+
+  break_optimization(&p);
+  EXPECT_POISONED(p);
+  EXPECT_NOT_POISONED(((uintptr_t)p) & 0xFF00FF);
+  EXPECT_POISONED(((uintptr_t)p) & 0xFF00);
 }
 
 static void vaargsfn2(int guard, ...) {
@@ -2449,6 +2462,20 @@
   vaargsfn_many(1, 2, *x, 3, 4, 5, 6, 7, 8, 9, *y);
 }
 
+static void vaargsfn_manyfix(int g1, int g2, int g3, int g4, int g5, int g6, int g7, int g8, int g9, ...) {
+  va_list vl;
+  va_start(vl, g9);
+  EXPECT_NOT_POISONED(va_arg(vl, int));
+  EXPECT_POISONED(va_arg(vl, int));
+  va_end(vl);
+}
+
+TEST(MemorySanitizer, VAArgManyFixTest) {
+  int* x = GetPoisoned<int>();
+  int* y = GetPoisoned<int>();
+  vaargsfn_manyfix(1, *x, 3, 4, 5, 6, 7, 8, 9, 10, *y);
+}
+
 static void vaargsfn_pass2(va_list vl) {
   EXPECT_NOT_POISONED(va_arg(vl, int));
   EXPECT_NOT_POISONED(va_arg(vl, int));
@@ -2805,6 +2832,22 @@
   ASSERT_EQ(result, 0);
   EXPECT_NOT_POISONED(limit.rlim_cur);
   EXPECT_NOT_POISONED(limit.rlim_max);
+
+  struct rlimit limit2;
+  __msan_poison(&limit2, sizeof(limit2));
+  result = prlimit(getpid(), RLIMIT_DATA, &limit, &limit2);
+  ASSERT_EQ(result, 0);
+  EXPECT_NOT_POISONED(limit2.rlim_cur);
+  EXPECT_NOT_POISONED(limit2.rlim_max);
+
+  __msan_poison(&limit, sizeof(limit));
+  result = prlimit(getpid(), RLIMIT_DATA, nullptr, &limit);
+  ASSERT_EQ(result, 0);
+  EXPECT_NOT_POISONED(limit.rlim_cur);
+  EXPECT_NOT_POISONED(limit.rlim_max);
+
+  result = prlimit(getpid(), RLIMIT_DATA, &limit, nullptr);
+  ASSERT_EQ(result, 0);
 }
 
 TEST(MemorySanitizer, getrusage) {
@@ -2888,6 +2931,10 @@
   static const char basename[] = "libmsan_loadable.mips64el.so";
 #elif defined(__aarch64__)
   static const char basename[] = "libmsan_loadable.aarch64.so";
+#elif defined(__powerpc64__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+  static const char basename[] = "libmsan_loadable.powerpc64.so";
+#elif defined(__powerpc64__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+  static const char basename[] = "libmsan_loadable.powerpc64le.so";
 #endif
   int res = snprintf(buf, sz, "%.*s/%s",
                      (int)dir_len, program_path, basename);
@@ -3195,28 +3242,30 @@
 #if !defined(__FreeBSD__)
 TEST(MemorySanitizer, memalign) {
   void *p = memalign(4096, 13);
-  EXPECT_EQ(0U, (uintptr_t)p % kPageSize);
+  EXPECT_EQ(0U, (uintptr_t)p % 4096);
   free(p);
 }
 #endif
 
 TEST(MemorySanitizer, valloc) {
   void *a = valloc(100);
-  EXPECT_EQ(0U, (uintptr_t)a % kPageSize);
+  uintptr_t PageSize = GetPageSize();
+  EXPECT_EQ(0U, (uintptr_t)a % PageSize);
   free(a);
 }
 
 // There's no pvalloc() on FreeBSD.
 #if !defined(__FreeBSD__)
 TEST(MemorySanitizer, pvalloc) {
-  void *p = pvalloc(kPageSize + 100);
-  EXPECT_EQ(0U, (uintptr_t)p % kPageSize);
-  EXPECT_EQ(2 * kPageSize, __sanitizer_get_allocated_size(p));
+  uintptr_t PageSize = GetPageSize();
+  void *p = pvalloc(PageSize + 100);
+  EXPECT_EQ(0U, (uintptr_t)p % PageSize);
+  EXPECT_EQ(2 * PageSize, __sanitizer_get_allocated_size(p));
   free(p);
 
   p = pvalloc(0);  // pvalloc(0) should allocate at least one page.
-  EXPECT_EQ(0U, (uintptr_t)p % kPageSize);
-  EXPECT_EQ(kPageSize, __sanitizer_get_allocated_size(p));
+  EXPECT_EQ(0U, (uintptr_t)p % PageSize);
+  EXPECT_EQ(PageSize, __sanitizer_get_allocated_size(p));
   free(p);
 }
 #endif
@@ -3920,7 +3969,48 @@
 
   EXPECT_EQ((unsigned)(2 * 102 + 3 * 103), c[1]);
 }
-#endif  // defined(__clang__)
+
+TEST(VectorCmpTest, mm_cmpneq_ps) {
+  V4x32 c;
+  c = _mm_cmpneq_ps(V4x32{Poisoned<U4>(), 1, 2, 3}, V4x32{4, 5, Poisoned<U4>(), 6});
+  EXPECT_POISONED(c[0]);
+  EXPECT_NOT_POISONED(c[1]);
+  EXPECT_POISONED(c[2]);
+  EXPECT_NOT_POISONED(c[3]);
+
+  c = _mm_cmpneq_ps(V4x32{0, 1, 2, 3}, V4x32{4, 5, 6, 7});
+  EXPECT_NOT_POISONED(c);
+}
+
+TEST(VectorCmpTest, mm_cmpneq_sd) {
+  V2x64 c;
+  c = _mm_cmpneq_sd(V2x64{Poisoned<U8>(), 1}, V2x64{2, 3});
+  EXPECT_POISONED(c[0]);
+  c = _mm_cmpneq_sd(V2x64{1, 2}, V2x64{Poisoned<U8>(), 3});
+  EXPECT_POISONED(c[0]);
+  c = _mm_cmpneq_sd(V2x64{1, 2}, V2x64{3, 4});
+  EXPECT_NOT_POISONED(c[0]);
+  c = _mm_cmpneq_sd(V2x64{1, Poisoned<U8>()}, V2x64{2, Poisoned<U8>()});
+  EXPECT_NOT_POISONED(c[0]);
+  c = _mm_cmpneq_sd(V2x64{1, Poisoned<U8>()}, V2x64{1, Poisoned<U8>()});
+  EXPECT_NOT_POISONED(c[0]);
+}
+
+TEST(VectorCmpTest, builtin_ia32_ucomisdlt) {
+  U4 c;
+  c = __builtin_ia32_ucomisdlt(V2x64{Poisoned<U8>(), 1}, V2x64{2, 3});
+  EXPECT_POISONED(c);
+  c = __builtin_ia32_ucomisdlt(V2x64{1, 2}, V2x64{Poisoned<U8>(), 3});
+  EXPECT_POISONED(c);
+  c = __builtin_ia32_ucomisdlt(V2x64{1, 2}, V2x64{3, 4});
+  EXPECT_NOT_POISONED(c);
+  c = __builtin_ia32_ucomisdlt(V2x64{1, Poisoned<U8>()}, V2x64{2, Poisoned<U8>()});
+  EXPECT_NOT_POISONED(c);
+  c = __builtin_ia32_ucomisdlt(V2x64{1, Poisoned<U8>()}, V2x64{1, Poisoned<U8>()});
+  EXPECT_NOT_POISONED(c);
+}
+
+#endif // defined(__x86_64__) && defined(__clang__)
 
 TEST(MemorySanitizerOrigins, SetGet) {
   EXPECT_EQ(TrackingOrigins(), !!__msan_get_track_origins());
@@ -4173,7 +4263,7 @@
   U4 origin = __LINE__;
   __msan_set_origin(&x, sizeof(x), origin);
   __msan_poison(&x, sizeof(x));
-  __builtin_ia32_storeups((float*)&y, x);
+  _mm_storeu_ps((float*)&y, x);
   EXPECT_POISONED_O(y, origin);
 }
 #endif
diff --git a/lib/profile/CMakeLists.txt b/lib/profile/CMakeLists.txt
index 1b10ade..ccf79d7 100644
--- a/lib/profile/CMakeLists.txt
+++ b/lib/profile/CMakeLists.txt
@@ -22,7 +22,24 @@
       }
 " COMPILER_RT_TARGET_HAS_ATOMICS)
 
+CHECK_CXX_SOURCE_COMPILES("
+#if defined(__linux__)
+#include <unistd.h>
+#endif
+#include <fcntl.h>
+int fd;
+int main() {
+ struct flock s_flock;
+
+ s_flock.l_type = F_WRLCK;
+ fcntl(fd, F_SETLKW, &s_flock);
+ return 0;
+}
+
+" COMPILER_RT_TARGET_HAS_FCNTL_LCK)
+
 add_custom_target(profile)
+set_target_properties(profile PROPERTIES FOLDER "Compiler-RT Misc")
 
 set(PROFILE_SOURCES
   GCDAProfiling.c
@@ -30,6 +47,8 @@
   InstrProfilingValue.c
   InstrProfilingBuffer.c
   InstrProfilingFile.c
+  InstrProfilingMerge.c
+  InstrProfilingMergeFile.c
   InstrProfilingWriter.c
   InstrProfilingPlatformDarwin.c
   InstrProfilingPlatformLinux.c
@@ -37,13 +56,14 @@
   InstrProfilingRuntime.cc
   InstrProfilingUtil.c)
 
+if(WIN32)
+    list(APPEND PROFILE_SOURCES WindowsMMap.c)
+endif()
+
 if(UNIX)
  set(EXTRA_FLAGS
      -fPIC
      -Wno-pedantic)
-else()
- set(EXTRA_FLAGS
-     -fPIC)
 endif()
 
 if(COMPILER_RT_TARGET_HAS_ATOMICS)
@@ -52,11 +72,23 @@
      -DCOMPILER_RT_HAS_ATOMICS=1)
 endif() 
 
+if(COMPILER_RT_TARGET_HAS_FCNTL_LCK)
+ set(EXTRA_FLAGS
+     ${EXTRA_FLAGS}
+     -DCOMPILER_RT_HAS_FCNTL_LCK=1)
+endif()
+
+# This appears to be a C-only warning banning the use of locals in aggregate
+# initializers. All other compilers accept this, though.
+# nonstandard extension used : 'identifier' : cannot be initialized using address of automatic variable
+append_list_if(COMPILER_RT_HAS_WD4221_FLAG /wd4221 EXTRA_FLAGS)
+
 if(APPLE)
   add_compiler_rt_runtime(clang_rt.profile
     STATIC
     OS ${PROFILE_SUPPORTED_OS}
     ARCHS ${PROFILE_SUPPORTED_ARCH}
+    CFLAGS ${EXTRA_FLAGS}
     SOURCES ${PROFILE_SOURCES}
     PARENT_TARGET profile)
 else()
diff --git a/lib/profile/GCDAProfiling.c b/lib/profile/GCDAProfiling.c
index aec2328..1079f24 100644
--- a/lib/profile/GCDAProfiling.c
+++ b/lib/profile/GCDAProfiling.c
@@ -27,16 +27,26 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+
+#if defined(_WIN32)
+#include "WindowsMMap.h"
+#else
 #include <sys/mman.h>
 #include <sys/file.h>
+#endif
 
-#define I386_FREEBSD (defined(__FreeBSD__) && defined(__i386__))
+#if defined(__FreeBSD__) && defined(__i386__)
+#define I386_FREEBSD 1
+#else
+#define I386_FREEBSD 0
+#endif
 
 #if !defined(_MSC_VER) && !I386_FREEBSD
 #include <stdint.h>
 #endif
 
 #if defined(_MSC_VER)
+typedef unsigned char uint8_t;
 typedef unsigned int uint32_t;
 typedef unsigned long long uint64_t;
 #elif I386_FREEBSD
@@ -476,8 +486,8 @@
       unmap_file();
     }
 
-    fclose(output_file);
     flock(fd, LOCK_UN);
+    fclose(output_file);
     output_file = NULL;
     write_buffer = NULL;
   }
@@ -501,7 +511,7 @@
   }
 }
 
-void llvm_writeout_files() {
+void llvm_writeout_files(void) {
   struct writeout_fn_node *curr = writeout_fn_head;
 
   while (curr) {
@@ -510,7 +520,7 @@
   }
 }
 
-void llvm_delete_writeout_function_list() {
+void llvm_delete_writeout_function_list(void) {
   while (writeout_fn_head) {
     struct writeout_fn_node *node = writeout_fn_head;
     writeout_fn_head = writeout_fn_head->next;
@@ -542,7 +552,7 @@
   }
 }
 
-void llvm_delete_flush_function_list() {
+void llvm_delete_flush_function_list(void) {
   while (flush_fn_head) {
     struct flush_fn_node *node = flush_fn_head;
     flush_fn_head = flush_fn_head->next;
diff --git a/lib/profile/InstrProfData.inc b/lib/profile/InstrProfData.inc
index 48dae50..93d14ac 100644
--- a/lib/profile/InstrProfData.inc
+++ b/lib/profile/InstrProfData.inc
@@ -1,4 +1,4 @@
-/*===-- InstrProfData.inc - instr profiling runtime structures -----------=== *\
+/*===-- InstrProfData.inc - instr profiling runtime structures -*- C++ -*-=== *\
 |*
 |*                     The LLVM Compiler Infrastructure
 |*
@@ -28,7 +28,7 @@
  *
  * Examples of how the template is used to instantiate structure definition:
  * 1. To declare a structure:
- * 
+ *
  * struct ProfData {
  * #define INSTR_PROF_DATA(Type, LLVMType, Name, Initializer) \
  *    Type Name;
@@ -57,6 +57,12 @@
  *
 \*===----------------------------------------------------------------------===*/
 
+/* Functions marked with INSTR_PROF_VISIBILITY must have hidden visibility in
+ * the compiler runtime. */
+#ifndef INSTR_PROF_VISIBILITY
+#define INSTR_PROF_VISIBILITY
+#endif
+
 /* INSTR_PROF_DATA start. */
 /* Definition of member fields of the per-function control structure. */
 #ifndef INSTR_PROF_DATA
@@ -64,29 +70,57 @@
 #else
 #define INSTR_PROF_DATA_DEFINED
 #endif
-
-INSTR_PROF_DATA(const uint32_t, llvm::Type::getInt32Ty(Ctx), NameSize, \
-                ConstantInt::get(llvm::Type::getInt32Ty(Ctx), \
-                NamePtr->getType()->getPointerElementType()->getArrayNumElements()))
-INSTR_PROF_DATA(const uint32_t, llvm::Type::getInt32Ty(Ctx), NumCounters, \
-                ConstantInt::get(llvm::Type::getInt32Ty(Ctx), NumCounters))
+INSTR_PROF_DATA(const uint64_t, llvm::Type::getInt64Ty(Ctx), NameRef, \
+                ConstantInt::get(llvm::Type::getInt64Ty(Ctx), \
+		IndexedInstrProf::ComputeHash(getPGOFuncNameVarInitializer(Inc->getName()))))
 INSTR_PROF_DATA(const uint64_t, llvm::Type::getInt64Ty(Ctx), FuncHash, \
                 ConstantInt::get(llvm::Type::getInt64Ty(Ctx), \
                 Inc->getHash()->getZExtValue()))
-INSTR_PROF_DATA(const IntPtrT, llvm::Type::getInt8PtrTy(Ctx), NamePtr, \
-                ConstantExpr::getBitCast(NamePtr, llvm::Type::getInt8PtrTy(Ctx)))
 INSTR_PROF_DATA(const IntPtrT, llvm::Type::getInt64PtrTy(Ctx), CounterPtr, \
                 ConstantExpr::getBitCast(CounterPtr, \
                 llvm::Type::getInt64PtrTy(Ctx)))
+/* This is used to map function pointers for the indirect call targets to
+ * function name hashes during the conversion from raw to merged profile
+ * data.
+ */
 INSTR_PROF_DATA(const IntPtrT, llvm::Type::getInt8PtrTy(Ctx), FunctionPointer, \
                 FunctionAddr)
 INSTR_PROF_DATA(IntPtrT, llvm::Type::getInt8PtrTy(Ctx), Values, \
-                ConstantPointerNull::get(Int8PtrTy))
+                ValuesPtrExpr)
+INSTR_PROF_DATA(const uint32_t, llvm::Type::getInt32Ty(Ctx), NumCounters, \
+                ConstantInt::get(llvm::Type::getInt32Ty(Ctx), NumCounters))
 INSTR_PROF_DATA(const uint16_t, Int16ArrayTy, NumValueSites[IPVK_Last+1], \
                 ConstantArray::get(Int16ArrayTy, Int16ArrayVals))
 #undef INSTR_PROF_DATA
 /* INSTR_PROF_DATA end. */
 
+
+/* This is an internal data structure used by value profiler. It
+ * is defined here to allow serialization code sharing by LLVM
+ * to be used in unit test.
+ *
+ * typedef struct ValueProfNode {
+ *   // InstrProfValueData VData;
+ *   uint64_t Value;
+ *   uint64_t Count;
+ *   struct ValueProfNode *Next;
+ * } ValueProfNode;
+ */
+/* INSTR_PROF_VALUE_NODE start. */
+#ifndef INSTR_PROF_VALUE_NODE
+#define INSTR_PROF_VALUE_NODE(Type, LLVMType, Name, Initializer)
+#else
+#define INSTR_PROF_DATA_DEFINED
+#endif
+INSTR_PROF_VALUE_NODE(uint64_t, llvm::Type::getInt64Ty(Ctx), Value, \
+                      ConstantInt::get(llvm::Type::GetInt64Ty(Ctx), 0))
+INSTR_PROF_VALUE_NODE(uint64_t, llvm::Type::getInt64Ty(Ctx), Count, \
+                      ConstantInt::get(llvm::Type::GetInt64Ty(Ctx), 0))
+INSTR_PROF_VALUE_NODE(PtrToNodeT, llvm::Type::getInt8PtrTy(Ctx), Next, \
+                      ConstantInt::get(llvm::Type::GetInt8PtrTy(Ctx), 0))
+#undef INSTR_PROF_VALUE_NODE
+/* INSTR_PROF_VALUE_NODE end. */
+
 /* INSTR_PROF_RAW_HEADER  start */
 /* Definition of member fields of the raw profile header data structure. */
 #ifndef INSTR_PROF_RAW_HEADER
@@ -102,8 +136,6 @@
 INSTR_PROF_RAW_HEADER(uint64_t, CountersDelta, (uintptr_t)CountersBegin)
 INSTR_PROF_RAW_HEADER(uint64_t, NamesDelta, (uintptr_t)NamesBegin)
 INSTR_PROF_RAW_HEADER(uint64_t, ValueKindLast, IPVK_Last)
-INSTR_PROF_RAW_HEADER(uint64_t, ValueDataSize, ValueDataSize)
-INSTR_PROF_RAW_HEADER(uint64_t, ValueDataDelta, (uintptr_t)ValueDataBegin)
 #undef INSTR_PROF_RAW_HEADER
 /* INSTR_PROF_RAW_HEADER  end */
 
@@ -132,6 +164,15 @@
 #else
 #define INSTR_PROF_DATA_DEFINED
 #endif
+/* For indirect function call value profiling, the addresses of the target
+ * functions are profiled by the instrumented code. The target addresses are
+ * written in the raw profile data and converted to target function name's MD5
+ * hash by the profile reader during deserialization.  Typically, this happens
+ * when the raw profile data is read during profile merging.
+ *
+ * For this remapping the ProfData is used.  ProfData contains both the function
+ * name hash and the function address.
+ */
 VALUE_PROF_KIND(IPVK_IndirectCallTarget, 0)
 /* These two kinds must be the last to be
  * declared. This is to make sure the string
@@ -153,12 +194,18 @@
 #else
 #define INSTR_PROF_DATA_DEFINED
 #endif
+#ifdef COVMAP_V1
 COVMAP_FUNC_RECORD(const IntPtrT, llvm::Type::getInt8PtrTy(Ctx), \
                    NamePtr, llvm::ConstantExpr::getBitCast(NamePtr, \
-                   llvm::Type::getInt8PtrTy(Ctx))) 
+                   llvm::Type::getInt8PtrTy(Ctx)))
 COVMAP_FUNC_RECORD(const uint32_t, llvm::Type::getInt32Ty(Ctx), NameSize, \
-                   llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx),\
+                   llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), \
                    NameValue.size()))
+#else
+COVMAP_FUNC_RECORD(const int64_t, llvm::Type::getInt64Ty(Ctx), NameRef, \
+                   llvm::ConstantInt::get(llvm::Type::getInt64Ty(Ctx), \
+	           llvm::IndexedInstrProf::ComputeHash(NameValue)))
+#endif
 COVMAP_FUNC_RECORD(const uint32_t, llvm::Type::getInt32Ty(Ctx), DataSize, \
                    llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx),\
                    CoverageMapping.size()))
@@ -167,11 +214,31 @@
 #undef COVMAP_FUNC_RECORD
 /* COVMAP_FUNC_RECORD end.  */
 
+/* COVMAP_HEADER start */
+/* Definition of member fields of coverage map header.
+ */
+#ifndef COVMAP_HEADER
+#define COVMAP_HEADER(Type, LLVMType, Name, Initializer)
+#else
+#define INSTR_PROF_DATA_DEFINED
+#endif
+COVMAP_HEADER(uint32_t, Int32Ty, NRecords, \
+              llvm::ConstantInt::get(Int32Ty,  FunctionRecords.size()))
+COVMAP_HEADER(uint32_t, Int32Ty, FilenamesSize, \
+              llvm::ConstantInt::get(Int32Ty, FilenamesSize))
+COVMAP_HEADER(uint32_t, Int32Ty, CoverageSize, \
+              llvm::ConstantInt::get(Int32Ty, CoverageMappingSize))
+COVMAP_HEADER(uint32_t, Int32Ty, Version, \
+              llvm::ConstantInt::get(Int32Ty, CovMapVersion::CurrentVersion))
+#undef COVMAP_HEADER
+/* COVMAP_HEADER end.  */
+
 
 #ifdef INSTR_PROF_VALUE_PROF_DATA
 #define INSTR_PROF_DATA_DEFINED
 
-/*! 
+#define INSTR_PROF_MAX_NUM_VAL_PER_SITE 255
+/*!
  * This is the header of the data structure that defines the on-disk
  * layout of the value profile data of a particular kind for one function.
  */
@@ -183,7 +250,7 @@
    * otherwise the record for this kind won't be emitted.
    */
   uint32_t NumValueSites;
-  /* 
+  /*
    * The first element of the array that stores the number of profiled
    * values for each value site. The size of the array is NumValueSites.
    * Since NumValueSites is greater than zero, there is at least one
@@ -207,7 +274,7 @@
    * \brief Return the number of value sites.
    */
   uint32_t getNumValueSites() const { return NumValueSites; }
-  /*! 
+  /*!
    * \brief Read data from this record and save it to Record.
    */
   void deserializeTo(InstrProfRecord &Record,
@@ -228,10 +295,10 @@
 typedef struct ValueProfData {
   /*
    * Total size in bytes including this field. It must be a multiple
-   * of sizeof(uint64_t). 
+   * of sizeof(uint64_t).
    */
   uint32_t TotalSize;
-  /* 
+  /*
    *The number of value profile kinds that has value profile data.
    * In this implementation, a value profile kind is considered to
    * have profile data if the number of value profile sites for the
@@ -241,7 +308,7 @@
    */
   uint32_t NumValueKinds;
 
-  /* 
+  /*
    * Following are a sequence of variable length records. The prefix/header
    * of each record is defined by ValueProfRecord type. The number of
    * records is NumValueKinds.
@@ -261,16 +328,15 @@
   static std::unique_ptr<ValueProfData>
   serializeFrom(const InstrProfRecord &Record);
   /*!
-   * Check the integrity of the record. Return the error code when
-   * an error is detected, otherwise return instrprof_error::success.
+   * Check the integrity of the record.
    */
-  instrprof_error checkIntegrity();
+  Error checkIntegrity();
   /*!
    * Return a pointer to \c ValueProfileData instance ready to be read.
    * All data in the instance are properly byte swapped. The input
    * data is assumed to be in little endian order.
    */
-  static ErrorOr<std::unique_ptr<ValueProfData>>
+  static Expected<std::unique_ptr<ValueProfData>>
   getValueProfData(const unsigned char *SrcBuffer,
                    const unsigned char *const SrcBufferEnd,
                    support::endianness SrcDataEndianness);
@@ -295,7 +361,7 @@
 #endif
 } ValueProfData;
 
-/* 
+/*
  * The closure is designed to abstact away two types of value profile data:
  * - InstrProfRecord which is the primary data structure used to
  *   represent profile data in host tools (reader, writer, and profile-use)
@@ -316,64 +382,38 @@
   uint32_t (*GetNumValueData)(const void *Record, uint32_t VKind);
   uint32_t (*GetNumValueDataForSite)(const void *R, uint32_t VK, uint32_t S);
 
-  /* 
+  /*
    * After extracting the value profile data from the value profile record,
    * this method is used to map the in-memory value to on-disk value. If
    * the method is null, value will be written out untranslated.
    */
   uint64_t (*RemapValueData)(uint32_t, uint64_t Value);
   void (*GetValueForSite)(const void *R, InstrProfValueData *Dst, uint32_t K,
-                          uint32_t S, uint64_t (*Mapper)(uint32_t, uint64_t));
+                          uint32_t S);
   ValueProfData *(*AllocValueProfData)(size_t TotalSizeInBytes);
 } ValueProfRecordClosure;
 
-/* 
- * A wrapper struct that represents value profile runtime data.
- * Like InstrProfRecord class which is used by profiling host tools,
- * ValueProfRuntimeRecord also implements the abstract intefaces defined in
- * ValueProfRecordClosure so that the runtime data can be serialized using
- * shared C implementation. In this structure, NumValueSites and Nodes
- * members are the primary fields while other fields hold the derived
- * information for fast implementation of closure interfaces.
- */
-typedef struct ValueProfRuntimeRecord {
-  /* Number of sites for each value profile kind.  */
-  const uint16_t *NumValueSites;
-  /* An array of linked-list headers. The size of of the array is the
-   * total number of value profile sites : sum(NumValueSites[*])). Each
-   * linked-list stores the values profiled for a value profile site. */
-  ValueProfNode **Nodes;
-
-  /* Total number of value profile kinds which have at least one
-   *  value profile sites. */
-  uint32_t NumValueKinds;
-  /* An array recording the number of values tracked at each site.
-   * The size of the array is TotalNumValueSites. */
-  uint8_t *SiteCountArray[IPVK_Last + 1];
-  ValueProfNode **NodesKind[IPVK_Last + 1];
-} ValueProfRuntimeRecord;
-
-/* Forward declarations of C interfaces.  */
-int initializeValueProfRuntimeRecord(ValueProfRuntimeRecord *RuntimeRecord,
-                                     const uint16_t *NumValueSites,
-                                     ValueProfNode **Nodes);
-void finalizeValueProfRuntimeRecord(ValueProfRuntimeRecord *RuntimeRecord);
-uint32_t getValueProfDataSizeRT(const ValueProfRuntimeRecord *Record);
-ValueProfData *
-serializeValueProfDataFromRT(const ValueProfRuntimeRecord *Record,
-                             ValueProfData *Dst);
-uint32_t getNumValueKindsRT(const void *R);
+INSTR_PROF_VISIBILITY ValueProfRecord *
+getFirstValueProfRecord(ValueProfData *VPD);
+INSTR_PROF_VISIBILITY ValueProfRecord *
+getValueProfRecordNext(ValueProfRecord *VPR);
+INSTR_PROF_VISIBILITY InstrProfValueData *
+getValueProfRecordValueData(ValueProfRecord *VPR);
+INSTR_PROF_VISIBILITY uint32_t
+getValueProfRecordHeaderSize(uint32_t NumValueSites);
 
 #undef INSTR_PROF_VALUE_PROF_DATA
-#endif  /* INSTR_PROF_VALUE_PROF_DATA */ 
+#endif  /* INSTR_PROF_VALUE_PROF_DATA */
 
 
 #ifdef INSTR_PROF_COMMON_API_IMPL
 #define INSTR_PROF_DATA_DEFINED
 #ifdef __cplusplus
 #define INSTR_PROF_INLINE inline
+#define INSTR_PROF_NULLPTR nullptr
 #else
 #define INSTR_PROF_INLINE
+#define INSTR_PROF_NULLPTR NULL
 #endif
 
 #ifndef offsetof
@@ -384,7 +424,7 @@
  * \brief Return the \c ValueProfRecord header size including the
  * padding bytes.
  */
-INSTR_PROF_INLINE
+INSTR_PROF_VISIBILITY INSTR_PROF_INLINE
 uint32_t getValueProfRecordHeaderSize(uint32_t NumValueSites) {
   uint32_t Size = offsetof(ValueProfRecord, SiteCountArray) +
                   sizeof(uint8_t) * NumValueSites;
@@ -393,11 +433,11 @@
   return Size;
 }
 
-/*! 
+/*!
  * \brief Return the total size of the value profile record including the
  * header and the value data.
  */
-INSTR_PROF_INLINE
+INSTR_PROF_VISIBILITY INSTR_PROF_INLINE
 uint32_t getValueProfRecordSize(uint32_t NumValueSites,
                                 uint32_t NumValueData) {
   return getValueProfRecordHeaderSize(NumValueSites) +
@@ -407,16 +447,16 @@
 /*!
  * \brief Return the pointer to the start of value data array.
  */
-INSTR_PROF_INLINE
+INSTR_PROF_VISIBILITY INSTR_PROF_INLINE
 InstrProfValueData *getValueProfRecordValueData(ValueProfRecord *This) {
   return (InstrProfValueData *)((char *)This + getValueProfRecordHeaderSize(
                                                    This->NumValueSites));
 }
 
-/*! 
+/*!
  * \brief Return the total number of value data for \c This record.
  */
-INSTR_PROF_INLINE
+INSTR_PROF_VISIBILITY INSTR_PROF_INLINE
 uint32_t getValueProfRecordNumValueData(ValueProfRecord *This) {
   uint32_t NumValueData = 0;
   uint32_t I;
@@ -425,10 +465,10 @@
   return NumValueData;
 }
 
-/*! 
+/*!
  * \brief Use this method to advance to the next \c This \c ValueProfRecord.
  */
-INSTR_PROF_INLINE
+INSTR_PROF_VISIBILITY INSTR_PROF_INLINE
 ValueProfRecord *getValueProfRecordNext(ValueProfRecord *This) {
   uint32_t NumValueData = getValueProfRecordNumValueData(This);
   return (ValueProfRecord *)((char *)This +
@@ -439,24 +479,22 @@
 /*!
  * \brief Return the first \c ValueProfRecord instance.
  */
-INSTR_PROF_INLINE
+INSTR_PROF_VISIBILITY INSTR_PROF_INLINE
 ValueProfRecord *getFirstValueProfRecord(ValueProfData *This) {
   return (ValueProfRecord *)((char *)This + sizeof(ValueProfData));
 }
 
 /* Closure based interfaces.  */
 
-/*! 
+/*!
  * Return the total size in bytes of the on-disk value profile data
  * given the data stored in Record.
  */
-uint32_t getValueProfDataSize(ValueProfRecordClosure *Closure) {
+INSTR_PROF_VISIBILITY uint32_t
+getValueProfDataSize(ValueProfRecordClosure *Closure) {
   uint32_t Kind;
   uint32_t TotalSize = sizeof(ValueProfData);
   const void *Record = Closure->Record;
-  uint32_t NumValueKinds = Closure->GetNumValueKinds(Record);
-  if (NumValueKinds == 0)
-    return TotalSize;
 
   for (Kind = IPVK_First; Kind <= IPVK_Last; Kind++) {
     uint32_t NumValueSites = Closure->GetNumValueSites(Record, Kind);
@@ -472,9 +510,10 @@
  * Extract value profile data of a function for the profile kind \c ValueKind
  * from the \c Closure and serialize the data into \c This record instance.
  */
-void serializeValueProfRecordFrom(ValueProfRecord *This,
-                                  ValueProfRecordClosure *Closure,
-                                  uint32_t ValueKind, uint32_t NumValueSites) {
+INSTR_PROF_VISIBILITY void
+serializeValueProfRecordFrom(ValueProfRecord *This,
+                             ValueProfRecordClosure *Closure,
+                             uint32_t ValueKind, uint32_t NumValueSites) {
   uint32_t S;
   const void *Record = Closure->Record;
   This->Kind = ValueKind;
@@ -484,8 +523,7 @@
   for (S = 0; S < NumValueSites; S++) {
     uint32_t ND = Closure->GetNumValueDataForSite(Record, ValueKind, S);
     This->SiteCountArray[S] = ND;
-    Closure->GetValueForSite(Record, DstVD, ValueKind, S,
-                             Closure->RemapValueData);
+    Closure->GetValueForSite(Record, DstVD, ValueKind, S);
     DstVD += ND;
   }
 }
@@ -493,12 +531,16 @@
 /*!
  * Extract value profile data of a function  from the \c Closure
  * and serialize the data into \c DstData if it is not NULL or heap
- * memory allocated by the \c Closure's allocator method.
+ * memory allocated by the \c Closure's allocator method. If \c
+ * DstData is not null, the caller is expected to set the TotalSize
+ * in DstData.
  */
-ValueProfData *serializeValueProfDataFrom(ValueProfRecordClosure *Closure,
-                                          ValueProfData *DstData) {
+INSTR_PROF_VISIBILITY ValueProfData *
+serializeValueProfDataFrom(ValueProfRecordClosure *Closure,
+                           ValueProfData *DstData) {
   uint32_t Kind;
-  uint32_t TotalSize = getValueProfDataSize(Closure);
+  uint32_t TotalSize =
+      DstData ? DstData->TotalSize : getValueProfDataSize(Closure);
 
   ValueProfData *VPD =
       DstData ? DstData : Closure->AllocValueProfData(TotalSize);
@@ -516,144 +558,15 @@
   return VPD;
 }
 
-/* 
- * The value profiler runtime library stores the value profile data
- * for a given function in \c NumValueSites and \c Nodes structures.
- * \c ValueProfRuntimeRecord class is used to encapsulate the runtime
- * profile data and provides fast interfaces to retrieve the profile
- * information. This interface is used to initialize the runtime record
- * and pre-compute the information needed for efficient implementation
- * of callbacks required by ValueProfRecordClosure class.
- */
-int initializeValueProfRuntimeRecord(ValueProfRuntimeRecord *RuntimeRecord,
-                                     const uint16_t *NumValueSites,
-                                     ValueProfNode **Nodes) {
-  unsigned I, J, S = 0, NumValueKinds = 0;
-  RuntimeRecord->NumValueSites = NumValueSites;
-  RuntimeRecord->Nodes = Nodes;
-  for (I = 0; I <= IPVK_Last; I++) {
-    uint16_t N = NumValueSites[I];
-    if (!N) {
-      RuntimeRecord->SiteCountArray[I] = 0;
-      continue;
-    }
-    NumValueKinds++;
-    RuntimeRecord->SiteCountArray[I] = (uint8_t *)calloc(N, 1);
-    if (!RuntimeRecord->SiteCountArray[I])
-      return 1;
-    RuntimeRecord->NodesKind[I] = Nodes ? &Nodes[S] : NULL;
-    for (J = 0; J < N; J++) {
-      /* Compute value count for each site. */
-      uint32_t C = 0;
-      ValueProfNode *Site = Nodes ? RuntimeRecord->NodesKind[I][J] : NULL;
-      while (Site) {
-        C++;
-        Site = Site->Next;
-      }
-      if (C > UCHAR_MAX)
-        C = UCHAR_MAX;
-      RuntimeRecord->SiteCountArray[I][J] = C;
-    }
-    S += N;
-  }
-  RuntimeRecord->NumValueKinds = NumValueKinds;
-  return 0;
-}
-
-void finalizeValueProfRuntimeRecord(ValueProfRuntimeRecord *RuntimeRecord) {
-  unsigned I;
-  for (I = 0; I <= IPVK_Last; I++) {
-    if (RuntimeRecord->SiteCountArray[I])
-      free(RuntimeRecord->SiteCountArray[I]);
-  }
-}
-
-/* ValueProfRecordClosure Interface implementation for
- * ValueProfDataRuntimeRecord.  */
-uint32_t getNumValueKindsRT(const void *R) {
-  return ((const ValueProfRuntimeRecord *)R)->NumValueKinds;
-}
-
-uint32_t getNumValueSitesRT(const void *R, uint32_t VK) {
-  return ((const ValueProfRuntimeRecord *)R)->NumValueSites[VK];
-}
-
-uint32_t getNumValueDataForSiteRT(const void *R, uint32_t VK, uint32_t S) {
-  const ValueProfRuntimeRecord *Record = (const ValueProfRuntimeRecord *)R;
-  return Record->SiteCountArray[VK][S];
-}
-
-uint32_t getNumValueDataRT(const void *R, uint32_t VK) {
-  unsigned I, S = 0;
-  const ValueProfRuntimeRecord *Record = (const ValueProfRuntimeRecord *)R;
-  if (Record->SiteCountArray[VK] == 0)
-    return 0;
-  for (I = 0; I < Record->NumValueSites[VK]; I++)
-    S += Record->SiteCountArray[VK][I];
-  return S;
-}
-
-void getValueForSiteRT(const void *R, InstrProfValueData *Dst, uint32_t VK,
-                       uint32_t S, uint64_t (*Mapper)(uint32_t, uint64_t)) {
-  unsigned I, N = 0;
-  const ValueProfRuntimeRecord *Record = (const ValueProfRuntimeRecord *)R;
-  N = getNumValueDataForSiteRT(R, VK, S);
-  if (N == 0)
-    return;
-  ValueProfNode *VNode = Record->NodesKind[VK][S];
-  for (I = 0; I < N; I++) {
-    Dst[I] = VNode->VData;
-    VNode = VNode->Next;
-  }
-}
-
-ValueProfData *allocValueProfDataRT(size_t TotalSizeInBytes) {
-  return (ValueProfData *)calloc(TotalSizeInBytes, 1);
-}
-
-static ValueProfRecordClosure RTRecordClosure = {0,
-                                                 getNumValueKindsRT,
-                                                 getNumValueSitesRT,
-                                                 getNumValueDataRT,
-                                                 getNumValueDataForSiteRT,
-                                                 0,
-                                                 getValueForSiteRT,
-                                                 allocValueProfDataRT};
-
-/* 
- * Return the size of ValueProfData structure to store data
- * recorded in the runtime record.
- */
-uint32_t getValueProfDataSizeRT(const ValueProfRuntimeRecord *Record) {
-  RTRecordClosure.Record = Record;
-  return getValueProfDataSize(&RTRecordClosure);
-}
-
-/* 
- * Return a ValueProfData instance that stores the data collected
- * from runtime. If \c DstData is provided by the caller, the value
- * profile data will be store in *DstData and DstData is returned,
- * otherwise the method will allocate space for the value data and
- * return pointer to the newly allocated space.
- */
-ValueProfData *
-serializeValueProfDataFromRT(const ValueProfRuntimeRecord *Record,
-                             ValueProfData *DstData) {
-  RTRecordClosure.Record = Record;
-  return serializeValueProfDataFrom(&RTRecordClosure, DstData);
-}
-
-
 #undef INSTR_PROF_COMMON_API_IMPL
 #endif /* INSTR_PROF_COMMON_API_IMPL */
 
 /*============================================================================*/
 
-
 #ifndef INSTR_PROF_DATA_DEFINED
 
-#ifndef INSTR_PROF_DATA_INC_
-#define INSTR_PROF_DATA_INC_
+#ifndef INSTR_PROF_DATA_INC
+#define INSTR_PROF_DATA_INC
 
 /* Helper macros.  */
 #define INSTR_PROF_SIMPLE_QUOTE(x) #x
@@ -675,20 +588,47 @@
        (uint64_t)'p' << 40 | (uint64_t)'r' << 32 | (uint64_t)'o' << 24 |  \
         (uint64_t)'f' << 16 | (uint64_t)'R' << 8 | (uint64_t)129
 
-/* Raw profile format version. */
-#define INSTR_PROF_RAW_VERSION 2
+/* Raw profile format version (start from 1). */
+#define INSTR_PROF_RAW_VERSION 4
+/* Indexed profile format version (start from 1). */
+#define INSTR_PROF_INDEX_VERSION 4
+/* Coverage mapping format version (start from 0). */
+#define INSTR_PROF_COVMAP_VERSION 1
+
+/* Profile version is always of type uint64_t. Reserve the upper 8 bits in the
+ * version for other variants of profile. We set the lowest bit of the upper 8
+ * bits (i.e. bit 56) to 1 to indicate if this is an IR-level instrumentation
+ * generated profile, and 0 if this is a Clang FE generated profile.
+ */
+#define VARIANT_MASKS_ALL 0xff00000000000000ULL
+#define GET_VERSION(V) ((V) & ~VARIANT_MASKS_ALL)
+#define VARIANT_MASK_IR_PROF (0x1ULL << 56)
+#define IR_LEVEL_PROF_VERSION_VAR __llvm_profile_raw_version
 
 /* Runtime section names and name strings.  */
 #define INSTR_PROF_DATA_SECT_NAME __llvm_prf_data
 #define INSTR_PROF_NAME_SECT_NAME __llvm_prf_names
 #define INSTR_PROF_CNTS_SECT_NAME __llvm_prf_cnts
+/* Array of pointers. Each pointer points to a list
+ * of value nodes associated with one value site.
+ */
+#define INSTR_PROF_VALS_SECT_NAME __llvm_prf_vals
+/* Value profile nodes section. */
+#define INSTR_PROF_VNODES_SECT_NAME __llvm_prf_vnds
+#define INSTR_PROF_COVMAP_SECT_NAME __llvm_covmap
 
-#define INSTR_PROF_DATA_SECT_NAME_STR \
-        INSTR_PROF_QUOTE(INSTR_PROF_DATA_SECT_NAME)
-#define INSTR_PROF_NAME_SECT_NAME_STR \
-        INSTR_PROF_QUOTE(INSTR_PROF_NAME_SECT_NAME)
-#define INSTR_PROF_CNTS_SECT_NAME_STR \
-        INSTR_PROF_QUOTE(INSTR_PROF_CNTS_SECT_NAME)
+#define INSTR_PROF_DATA_SECT_NAME_STR                                          \
+  INSTR_PROF_QUOTE(INSTR_PROF_DATA_SECT_NAME)
+#define INSTR_PROF_NAME_SECT_NAME_STR                                          \
+  INSTR_PROF_QUOTE(INSTR_PROF_NAME_SECT_NAME)
+#define INSTR_PROF_CNTS_SECT_NAME_STR                                          \
+  INSTR_PROF_QUOTE(INSTR_PROF_CNTS_SECT_NAME)
+#define INSTR_PROF_COVMAP_SECT_NAME_STR                                        \
+  INSTR_PROF_QUOTE(INSTR_PROF_COVMAP_SECT_NAME)
+#define INSTR_PROF_VALS_SECT_NAME_STR                                          \
+  INSTR_PROF_QUOTE(INSTR_PROF_VALS_SECT_NAME)
+#define INSTR_PROF_VNODES_SECT_NAME_STR                                        \
+  INSTR_PROF_QUOTE(INSTR_PROF_VNODES_SECT_NAME)
 
 /* Macros to define start/stop section symbol for a given
  * section on Linux. For instance
@@ -718,18 +658,8 @@
   uint64_t Count;
 } InstrProfValueData;
 
-/* This is an internal data structure used by value profiler. It
- * is defined here to allow serialization code sharing by LLVM
- * to be used in unit test.
- */
-typedef struct ValueProfNode {
-  InstrProfValueData VData;
-  struct ValueProfNode *Next;
-} ValueProfNode;
-
-#endif /* INSTR_PROF_DATA_INC_ */
+#endif /* INSTR_PROF_DATA_INC */
 
 #else
 #undef INSTR_PROF_DATA_DEFINED
 #endif
-
diff --git a/lib/profile/InstrProfiling.c b/lib/profile/InstrProfiling.c
index 58778ae..c763a44 100644
--- a/lib/profile/InstrProfiling.c
+++ b/lib/profile/InstrProfiling.c
@@ -16,7 +16,9 @@
 #define INSTR_PROF_VALUE_PROF_DATA
 #include "InstrProfData.inc"
 
-char *(*GetEnvHook)(const char *) = 0;
+COMPILER_RT_VISIBILITY char *(*GetEnvHook)(const char *) = 0;
+
+COMPILER_RT_WEAK uint64_t __llvm_profile_raw_version = INSTR_PROF_RAW_VERSION;
 
 COMPILER_RT_VISIBILITY uint64_t __llvm_profile_get_magic(void) {
   return sizeof(void *) == sizeof(uint64_t) ? (INSTR_PROF_RAW_MAGIC_64)
@@ -32,7 +34,7 @@
 }
 
 COMPILER_RT_VISIBILITY uint64_t __llvm_profile_get_version(void) {
-  return INSTR_PROF_RAW_VERSION;
+  return __llvm_profile_raw_version;
 }
 
 COMPILER_RT_VISIBILITY void __llvm_profile_reset_counters(void) {
@@ -44,7 +46,7 @@
   const __llvm_profile_data *DataBegin = __llvm_profile_begin_data();
   const __llvm_profile_data *DataEnd = __llvm_profile_end_data();
   const __llvm_profile_data *DI;
-  for (DI = DataBegin; DI != DataEnd; ++DI) {
+  for (DI = DataBegin; DI < DataEnd; ++DI) {
     uint64_t CurrentVSiteCount = 0;
     uint32_t VKI, i;
     if (!DI->Values)
@@ -59,10 +61,9 @@
       ValueProfNode *CurrentVNode = ValueCounters[i];
 
       while (CurrentVNode) {
-        CurrentVNode->VData.Count = 0;
+        CurrentVNode->Count = 0;
         CurrentVNode = CurrentVNode->Next;
       }
     }
   }
 }
-
diff --git a/lib/profile/InstrProfiling.h b/lib/profile/InstrProfiling.h
index c924a42..b23bed8 100644
--- a/lib/profile/InstrProfiling.h
+++ b/lib/profile/InstrProfiling.h
@@ -11,6 +11,8 @@
 #define PROFILE_INSTRPROFILING_H_
 
 #include "InstrProfilingPort.h"
+
+#define INSTR_PROF_VISIBILITY COMPILER_RT_VISIBILITY
 #include "InstrProfData.inc"
 
 enum ValueKind {
@@ -30,6 +32,12 @@
 #include "InstrProfData.inc"
 } __llvm_profile_header;
 
+typedef struct ValueProfNode * PtrToNodeT;
+typedef struct ValueProfNode {
+#define INSTR_PROF_VALUE_NODE(Type, LLVMType, Name, Initializer) Type Name;
+#include "InstrProfData.inc"
+} ValueProfNode;
+
 /*!
  * \brief Get number of bytes necessary to pad the argument to eight
  * byte boundary.
@@ -55,6 +63,8 @@
 const char *__llvm_profile_end_names(void);
 uint64_t *__llvm_profile_begin_counters(void);
 uint64_t *__llvm_profile_end_counters(void);
+ValueProfNode *__llvm_profile_begin_vnodes();
+ValueProfNode *__llvm_profile_end_vnodes();
 
 /*!
  * \brief Clear profile counters to zero.
@@ -63,6 +73,27 @@
 void __llvm_profile_reset_counters(void);
 
 /*!
+ * \brief Merge profile data from buffer.
+ *
+ * Read profile data from buffer \p Profile and merge with
+ * in-process profile counters. The client is expected to
+ * have checked or already knows the profile data in the
+ * buffer matches the in-process counter structure before
+ * calling it.
+ */
+void __llvm_profile_merge_from_buffer(const char *Profile, uint64_t Size);
+
+/*! \brief Check if profile in buffer matches the current binary.
+ *
+ *  Returns 0 (success) if the profile data in buffer \p Profile with size
+ *  \p Size was generated by the same binary and therefore matches
+ *  structurally the in-process counters. If the profile data in buffer is
+ *  not compatible, the interface returns 1 (failure).
+ */
+int __llvm_profile_check_compatibility(const char *Profile,
+                                       uint64_t Size);
+
+/*!
  * \brief Counts the number of times a target value is seen.
  *
  * Records the target value for the CounterIndex if not seen before. Otherwise,
@@ -73,20 +104,13 @@
 void INSTR_PROF_VALUE_PROF_FUNC(
 #define VALUE_PROF_FUNC_PARAM(ArgType, ArgName, ArgLLVMType) ArgType ArgName
 #include "InstrProfData.inc"
-);
-
-/*!
- * \brief Prepares the value profiling data for output.
- *
- * Prepares a single __llvm_profile_value_data array out of the many
- * ValueProfNode trees (one per instrumented function).
- */
-uint64_t __llvm_profile_gather_value_data(uint8_t **DataArray);
+    );
 
 /*!
  * \brief Write instrumentation data to the current file.
  *
- * Writes to the file with the last name given to \a __llvm_profile_set_filename(),
+ * Writes to the file with the last name given to \a
+ * __llvm_profile_set_filename(),
  * or if it hasn't been called, the \c LLVM_PROFILE_FILE environment variable,
  * or if that's not set, the last name given to
  * \a __llvm_profile_override_default_filename(), or if that's not set,
@@ -130,4 +154,31 @@
 /*! \brief Get the version of the file format. */
 uint64_t __llvm_profile_get_version(void);
 
+/*! \brief Get the number of entries in the profile data section. */
+uint64_t __llvm_profile_get_data_size(const __llvm_profile_data *Begin,
+                                      const __llvm_profile_data *End);
+
+/*!
+ * This variable is defined in InstrProfilingRuntime.cc as a hidden
+ * symbol. Its main purpose is to enable profile runtime user to
+ * bypass runtime initialization code -- if the client code explicitly
+ * define this variable, then InstrProfilingRuntime.o won't be linked in.
+ * Note that this variable's visibility needs to be hidden so that the
+ * definition of this variable in an instrumented shared library won't
+ * affect runtime initialization decision of the main program.
+ */
+COMPILER_RT_VISIBILITY extern int __llvm_profile_runtime;
+
+/*!
+ * This variable is defined in InstrProfiling.c. Its main purpose is to
+ * encode the raw profile version value and other format related information
+ * such as whether the profile is from IR based instrumentation. The variable
+ * is defined as weak so that compiler can emit an overriding definition
+ * depending on user option.  Since we don't support mixing FE and IR based
+ * data in the same raw profile data file (in other words, shared libs and
+ * main program are expected to be instrumented in the same way), there is
+ * no need for this variable to be hidden.
+ */
+extern uint64_t __llvm_profile_raw_version;
+
 #endif /* PROFILE_INSTRPROFILING_H_ */
diff --git a/lib/profile/InstrProfilingBuffer.c b/lib/profile/InstrProfilingBuffer.c
index 8bade76..ac259e8 100644
--- a/lib/profile/InstrProfilingBuffer.c
+++ b/lib/profile/InstrProfilingBuffer.c
@@ -10,8 +10,6 @@
 #include "InstrProfiling.h"
 #include "InstrProfilingInternal.h"
 
-#include <string.h>
-
 COMPILER_RT_VISIBILITY
 uint64_t __llvm_profile_get_size_for_buffer(void) {
   const __llvm_profile_data *DataBegin = __llvm_profile_begin_data();
@@ -25,7 +23,13 @@
       DataBegin, DataEnd, CountersBegin, CountersEnd, NamesBegin, NamesEnd);
 }
 
-#define PROFILE_RANGE_SIZE(Range) (Range##End - Range##Begin)
+COMPILER_RT_VISIBILITY
+uint64_t __llvm_profile_get_data_size(const __llvm_profile_data *Begin,
+                                      const __llvm_profile_data *End) {
+  intptr_t BeginI = (intptr_t)Begin, EndI = (intptr_t)End;
+  return ((EndI + sizeof(__llvm_profile_data) - 1) - BeginI) /
+         sizeof(__llvm_profile_data);
+}
 
 COMPILER_RT_VISIBILITY
 uint64_t __llvm_profile_get_size_for_buffer_internal(
@@ -33,37 +37,23 @@
     const uint64_t *CountersBegin, const uint64_t *CountersEnd,
     const char *NamesBegin, const char *NamesEnd) {
   /* Match logic in __llvm_profile_write_buffer(). */
-  const uint64_t NamesSize = PROFILE_RANGE_SIZE(Names) * sizeof(char);
+  const uint64_t NamesSize = (NamesEnd - NamesBegin) * sizeof(char);
   const uint8_t Padding = __llvm_profile_get_num_padding_bytes(NamesSize);
   return sizeof(__llvm_profile_header) +
-         PROFILE_RANGE_SIZE(Data) * sizeof(__llvm_profile_data) +
-         PROFILE_RANGE_SIZE(Counters) * sizeof(uint64_t) + NamesSize + Padding;
-}
-
-/* The buffer writer is reponsponsible in keeping writer state
- * across the call.
- */
-static uint32_t bufferWriter(ProfDataIOVec *IOVecs, uint32_t NumIOVecs,
-                             void **WriterCtx) {
-  uint32_t I;
-  char **Buffer = (char **)WriterCtx;
-  for (I = 0; I < NumIOVecs; I++) {
-    size_t Length = IOVecs[I].ElmSize * IOVecs[I].NumElm;
-    memcpy(*Buffer, IOVecs[I].Data, Length);
-    *Buffer += Length;
-  }
-  return 0;
+         (__llvm_profile_get_data_size(DataBegin, DataEnd) *
+          sizeof(__llvm_profile_data)) +
+         (CountersEnd - CountersBegin) * sizeof(uint64_t) + NamesSize + Padding;
 }
 
 COMPILER_RT_VISIBILITY int __llvm_profile_write_buffer(char *Buffer) {
-  return llvmWriteProfData(bufferWriter, Buffer, 0, 0);
+  return lprofWriteData(lprofBufferWriter, Buffer, 0);
 }
 
 COMPILER_RT_VISIBILITY int __llvm_profile_write_buffer_internal(
     char *Buffer, const __llvm_profile_data *DataBegin,
     const __llvm_profile_data *DataEnd, const uint64_t *CountersBegin,
     const uint64_t *CountersEnd, const char *NamesBegin, const char *NamesEnd) {
-  return llvmWriteProfDataImpl(bufferWriter, Buffer, DataBegin, DataEnd,
-                               CountersBegin, CountersEnd, 0, 0, NamesBegin,
-                               NamesEnd);
+  return lprofWriteDataImpl(lprofBufferWriter, Buffer, DataBegin, DataEnd,
+                            CountersBegin, CountersEnd, 0, NamesBegin,
+                            NamesEnd);
 }
diff --git a/lib/profile/InstrProfilingFile.c b/lib/profile/InstrProfilingFile.c
index 7f2923c..1bd6c63 100644
--- a/lib/profile/InstrProfilingFile.c
+++ b/lib/profile/InstrProfilingFile.c
@@ -14,8 +14,76 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#ifdef _MSC_VER
+/* For _alloca. */
+#include <malloc.h>
+#endif
+#if defined(_WIN32)
+#include "WindowsMMap.h"
+/* For _chsize_s */
+#include <io.h>
+#else
+#include <sys/file.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#if defined(__linux__)
+#include <sys/types.h>
+#endif
+#endif
 
-#define UNCONST(ptr) ((void *)(uintptr_t)(ptr))
+/* From where the profile name is specified.
+ * The order of the enumerators defines their
+ * precedence. Re-ordering them may lead to a
+ * runtime behavior change. */
+typedef enum ProfileNameSpecifier {
+  PNS_unknown = 0,
+  PNS_default,
+  PNS_command_line,
+  PNS_environment,
+  PNS_runtime_api
+} ProfileNameSpecifier;
+
+static const char *getPNSStr(ProfileNameSpecifier PNS) {
+  switch (PNS) {
+  case PNS_default:
+    return "default setting";
+  case PNS_command_line:
+    return "command line";
+  case PNS_environment:
+    return "environment variable";
+  case PNS_runtime_api:
+    return "runtime API";
+  default:
+    return "Unknown";
+  }
+}
+
+#define MAX_PID_SIZE 16
+/* Data structure holding the result of parsed filename pattern.  */
+typedef struct lprofFilename {
+  /* File name string possibly with %p or %h specifiers. */
+  const char *FilenamePat;
+  char PidChars[MAX_PID_SIZE];
+  char Hostname[COMPILER_RT_MAX_HOSTLEN];
+  unsigned NumPids;
+  unsigned NumHosts;
+  /* When in-process merging is enabled, this parameter specifies
+   * the total number of profile data files shared by all the processes
+   * spawned from the same binary. By default the value is 1. If merging
+   * is not enabled, its value should be 0. This parameter is specified
+   * by the %[0-9]m specifier. For instance %2m enables merging using
+   * 2 profile data files. %1m is equivalent to %m. Also %m specifier
+   * can only appear once at the end of the name pattern. */
+  unsigned MergePoolSize;
+  ProfileNameSpecifier PNS;
+} lprofFilename;
+
+lprofFilename lprofCurFilename = {0, {0}, {0}, 0, 0, 0, PNS_unknown};
+
+int getpid(void);
+static int getCurFilenameLength();
+static const char *getCurFilename(char *FilenameBuf);
+static unsigned doMerging() { return lprofCurFilename.MergePoolSize; }
 
 /* Return 1 if there is an error, otherwise return  0.  */
 static uint32_t fileWriter(ProfDataIOVec *IOVecs, uint32_t NumIOVecs,
@@ -30,49 +98,141 @@
   return 0;
 }
 
-static int writeFile(FILE *File) {
-  uint8_t *ValueDataBegin = NULL;
-  const uint64_t ValueDataSize =
-      __llvm_profile_gather_value_data(&ValueDataBegin);
-  int r = llvmWriteProfData(fileWriter, File, ValueDataBegin, ValueDataSize);
-  free(ValueDataBegin);
-  return r;
+COMPILER_RT_VISIBILITY ProfBufferIO *
+lprofCreateBufferIOInternal(void *File, uint32_t BufferSz) {
+  FreeHook = &free;
+  DynamicBufferIOBuffer = (uint8_t *)calloc(BufferSz, 1);
+  VPBufferSize = BufferSz;
+  return lprofCreateBufferIO(fileWriter, File);
 }
 
-static int writeFileWithName(const char *OutputName) {
+static void setupIOBuffer() {
+  const char *BufferSzStr = 0;
+  BufferSzStr = getenv("LLVM_VP_BUFFER_SIZE");
+  if (BufferSzStr && BufferSzStr[0]) {
+    VPBufferSize = atoi(BufferSzStr);
+    DynamicBufferIOBuffer = (uint8_t *)calloc(VPBufferSize, 1);
+  }
+}
+
+/* Read profile data in \c ProfileFile and merge with in-memory
+   profile counters. Returns -1 if there is a fatal error, otherwise
+   0 is returned.
+*/
+static int doProfileMerging(FILE *ProfileFile) {
+  uint64_t ProfileFileSize;
+  char *ProfileBuffer;
+
+  if (fseek(ProfileFile, 0L, SEEK_END) == -1) {
+    PROF_ERR("Unable to merge profile data, unable to get size: %s\n",
+             strerror(errno));
+    return -1;
+  }
+  ProfileFileSize = ftell(ProfileFile);
+
+  /* Restore file offset.  */
+  if (fseek(ProfileFile, 0L, SEEK_SET) == -1) {
+    PROF_ERR("Unable to merge profile data, unable to rewind: %s\n",
+             strerror(errno));
+    return -1;
+  }
+
+  /* Nothing to merge.  */
+  if (ProfileFileSize < sizeof(__llvm_profile_header)) {
+    if (ProfileFileSize)
+      PROF_WARN("Unable to merge profile data: %s\n",
+                "source profile file is too small.");
+    return 0;
+  }
+
+  ProfileBuffer = mmap(NULL, ProfileFileSize, PROT_READ, MAP_SHARED | MAP_FILE,
+                       fileno(ProfileFile), 0);
+  if (ProfileBuffer == MAP_FAILED) {
+    PROF_ERR("Unable to merge profile data, mmap failed: %s\n",
+             strerror(errno));
+    return -1;
+  }
+
+  if (__llvm_profile_check_compatibility(ProfileBuffer, ProfileFileSize)) {
+    (void)munmap(ProfileBuffer, ProfileFileSize);
+    PROF_WARN("Unable to merge profile data: %s\n",
+              "source profile file is not compatible.");
+    return 0;
+  }
+
+  /* Now start merging */
+  __llvm_profile_merge_from_buffer(ProfileBuffer, ProfileFileSize);
+  (void)munmap(ProfileBuffer, ProfileFileSize);
+
+  return 0;
+}
+
+/* Open the profile data for merging. It opens the file in r+b mode with
+ * file locking.  If the file has content which is compatible with the
+ * current process, it also reads in the profile data in the file and merge
+ * it with in-memory counters. After the profile data is merged in memory,
+ * the original profile data is truncated and gets ready for the profile
+ * dumper. With profile merging enabled, each executable as well as any of
+ * its instrumented shared libraries dump profile data into their own data file.
+*/
+static FILE *openFileForMerging(const char *ProfileFileName) {
+  FILE *ProfileFile;
+  int rc;
+
+  ProfileFile = lprofOpenFileEx(ProfileFileName);
+  if (!ProfileFile)
+    return NULL;
+
+  rc = doProfileMerging(ProfileFile);
+  if (rc || COMPILER_RT_FTRUNCATE(ProfileFile, 0L) ||
+      fseek(ProfileFile, 0L, SEEK_SET) == -1) {
+    PROF_ERR("Profile Merging of file %s failed: %s\n", ProfileFileName,
+             strerror(errno));
+    fclose(ProfileFile);
+    return NULL;
+  }
+  fseek(ProfileFile, 0L, SEEK_SET);
+  return ProfileFile;
+}
+
+/* Write profile data to file \c OutputName.  */
+static int writeFile(const char *OutputName) {
   int RetVal;
   FILE *OutputFile;
-  if (!OutputName || !OutputName[0])
-    return -1;
 
-  /* Append to the file to support profiling multiple shared objects. */
-  OutputFile = fopen(OutputName, "ab");
+  if (!doMerging())
+    OutputFile = fopen(OutputName, "ab");
+  else
+    OutputFile = openFileForMerging(OutputName);
+
   if (!OutputFile)
     return -1;
 
-  RetVal = writeFile(OutputFile);
+  FreeHook = &free;
+  setupIOBuffer();
+  RetVal = lprofWriteData(fileWriter, OutputFile, lprofGetVPDataReader());
 
   fclose(OutputFile);
   return RetVal;
 }
 
-COMPILER_RT_WEAK int __llvm_profile_OwnsFilename = 0;
-COMPILER_RT_WEAK const char *__llvm_profile_CurrentFilename = NULL;
-
 static void truncateCurrentFile(void) {
   const char *Filename;
+  char *FilenameBuf;
   FILE *File;
+  int Length;
 
-  Filename = __llvm_profile_CurrentFilename;
-  if (!Filename || !Filename[0])
+  Length = getCurFilenameLength();
+  FilenameBuf = (char *)COMPILER_RT_ALLOCA(Length + 1);
+  Filename = getCurFilename(FilenameBuf);
+  if (!Filename)
     return;
 
   /* Create the directory holding the file, if needed. */
-  if (strchr(Filename, '/')) {
-    char *Copy = malloc(strlen(Filename) + 1);
-    strcpy(Copy, Filename);
+  if (strchr(Filename, '/') || strchr(Filename, '\\')) {
+    char *Copy = (char *)COMPILER_RT_ALLOCA(Length + 1);
+    strncpy(Copy, Filename, Length + 1);
     __llvm_profile_recursive_mkdir(Copy);
-    free(Copy);
   }
 
   /* Truncate the file.  Later we'll reopen and append. */
@@ -82,128 +242,254 @@
   fclose(File);
 }
 
-static void setFilename(const char *Filename, int OwnsFilename) {
-  /* Check if this is a new filename and therefore needs truncation. */
-  int NewFile = !__llvm_profile_CurrentFilename ||
-      (Filename && strcmp(Filename, __llvm_profile_CurrentFilename));
-  if (__llvm_profile_OwnsFilename)
-    free(UNCONST(__llvm_profile_CurrentFilename));
-
-  __llvm_profile_CurrentFilename = Filename;
-  __llvm_profile_OwnsFilename = OwnsFilename;
-
-  /* If not a new file, append to support profiling multiple shared objects. */
-  if (NewFile)
-    truncateCurrentFile();
+static const char *DefaultProfileName = "default.profraw";
+static void resetFilenameToDefault(void) {
+  memset(&lprofCurFilename, 0, sizeof(lprofCurFilename));
+  lprofCurFilename.FilenamePat = DefaultProfileName;
+  lprofCurFilename.PNS = PNS_default;
 }
 
-static void resetFilenameToDefault(void) { setFilename("default.profraw", 0); }
+static int containsMergeSpecifier(const char *FilenamePat, int I) {
+  return (FilenamePat[I] == 'm' ||
+          (FilenamePat[I] >= '1' && FilenamePat[I] <= '9' &&
+           /* If FilenamePat[I] is not '\0', the next byte is guaranteed
+            * to be in-bound as the string is null terminated. */
+           FilenamePat[I + 1] == 'm'));
+}
 
-int getpid(void);
-static int setFilenamePossiblyWithPid(const char *Filename) {
-#define MAX_PID_SIZE 16
-  char PidChars[MAX_PID_SIZE] = {0};
-  int NumPids = 0, PidLength = 0;
-  char *Allocated;
-  int I, J;
+/* Parses the pattern string \p FilenamePat and stores the result to
+ * lprofcurFilename structure. */
+static int parseFilenamePattern(const char *FilenamePat) {
+  int NumPids = 0, NumHosts = 0, I;
+  char *PidChars = &lprofCurFilename.PidChars[0];
+  char *Hostname = &lprofCurFilename.Hostname[0];
+  int MergingEnabled = 0;
 
-  /* Reset filename on NULL, except with env var which is checked by caller. */
-  if (!Filename) {
-    resetFilenameToDefault();
-    return 0;
-  }
-
+  lprofCurFilename.FilenamePat = FilenamePat;
   /* Check the filename for "%p", which indicates a pid-substitution. */
-  for (I = 0; Filename[I]; ++I)
-    if (Filename[I] == '%' && Filename[++I] == 'p')
-      if (!NumPids++) {
-        PidLength = snprintf(PidChars, MAX_PID_SIZE, "%d", getpid());
-        if (PidLength <= 0)
+  for (I = 0; FilenamePat[I]; ++I)
+    if (FilenamePat[I] == '%') {
+      if (FilenamePat[++I] == 'p') {
+        if (!NumPids++) {
+          if (snprintf(PidChars, MAX_PID_SIZE, "%d", getpid()) <= 0) {
+            PROF_WARN(
+                "Unable to parse filename pattern %s. Using the default name.",
+                FilenamePat);
+            return -1;
+          }
+        }
+      } else if (FilenamePat[I] == 'h') {
+        if (!NumHosts++)
+          if (COMPILER_RT_GETHOSTNAME(Hostname, COMPILER_RT_MAX_HOSTLEN)) {
+            PROF_WARN(
+                "Unable to parse filename pattern %s. Using the default name.",
+                FilenamePat);
+            return -1;
+          }
+      } else if (containsMergeSpecifier(FilenamePat, I)) {
+        if (MergingEnabled) {
+          PROF_WARN("%%m specifier can only be specified once in %s.\n",
+                    FilenamePat);
           return -1;
+        }
+        MergingEnabled = 1;
+        if (FilenamePat[I] == 'm')
+          lprofCurFilename.MergePoolSize = 1;
+        else {
+          lprofCurFilename.MergePoolSize = FilenamePat[I] - '0';
+          I++; /* advance to 'm' */
+        }
       }
-  if (!NumPids) {
-    setFilename(Filename, 0);
-    return 0;
-  }
+    }
 
-  /* Allocate enough space for the substituted filename. */
-  Allocated = malloc(I + NumPids*(PidLength - 2) + 1);
-  if (!Allocated)
-    return -1;
-
-  /* Construct the new filename. */
-  for (I = 0, J = 0; Filename[I]; ++I)
-    if (Filename[I] == '%') {
-      if (Filename[++I] == 'p') {
-        memcpy(Allocated + J, PidChars, PidLength);
-        J += PidLength;
-      }
-      /* Drop any unknown substitutions. */
-    } else
-      Allocated[J++] = Filename[I];
-  Allocated[J] = 0;
-
-  /* Use the computed name. */
-  setFilename(Allocated, 1);
+  lprofCurFilename.NumPids = NumPids;
+  lprofCurFilename.NumHosts = NumHosts;
   return 0;
 }
 
-static int setFilenameFromEnvironment(void) {
-  const char *Filename = getenv("LLVM_PROFILE_FILE");
+static void parseAndSetFilename(const char *FilenamePat,
+                                ProfileNameSpecifier PNS) {
 
-  if (!Filename || !Filename[0])
-    return -1;
+  const char *OldFilenamePat = lprofCurFilename.FilenamePat;
+  ProfileNameSpecifier OldPNS = lprofCurFilename.PNS;
 
-  return setFilenamePossiblyWithPid(Filename);
-}
-
-static void setFilenameAutomatically(void) {
-  if (!setFilenameFromEnvironment())
+  if (PNS < OldPNS)
     return;
 
-  resetFilenameToDefault();
+  if (!FilenamePat)
+    FilenamePat = DefaultProfileName;
+
+  /* When -fprofile-instr-generate=<path> is specified on the
+   * command line, each module will be instrumented with runtime
+   * init call to __llvm_profile_init function which calls
+   * __llvm_profile_override_default_filename. In most of the cases,
+   * the path will be identical, so bypass the parsing completely.
+   */
+  if (OldFilenamePat && !strcmp(OldFilenamePat, FilenamePat)) {
+    lprofCurFilename.PNS = PNS;
+    return;
+  }
+
+  /* When PNS >= OldPNS, the last one wins. */
+  if (!FilenamePat || parseFilenamePattern(FilenamePat))
+    resetFilenameToDefault();
+  lprofCurFilename.PNS = PNS;
+
+  if (!OldFilenamePat) {
+    PROF_NOTE("Set profile file path to \"%s\" via %s.\n",
+              lprofCurFilename.FilenamePat, getPNSStr(PNS));
+  } else {
+    PROF_NOTE("Override old profile path \"%s\" via %s to \"%s\" via %s.\n",
+              OldFilenamePat, getPNSStr(OldPNS), lprofCurFilename.FilenamePat,
+              getPNSStr(PNS));
+  }
+
+  if (!lprofCurFilename.MergePoolSize)
+    truncateCurrentFile();
 }
 
+/* Return buffer length that is required to store the current profile
+ * filename with PID and hostname substitutions. */
+/* The length to hold uint64_t followed by 2 digit pool id including '_' */
+#define SIGLEN 24
+static int getCurFilenameLength() {
+  int Len;
+  if (!lprofCurFilename.FilenamePat || !lprofCurFilename.FilenamePat[0])
+    return 0;
+
+  if (!(lprofCurFilename.NumPids || lprofCurFilename.NumHosts ||
+        lprofCurFilename.MergePoolSize))
+    return strlen(lprofCurFilename.FilenamePat);
+
+  Len = strlen(lprofCurFilename.FilenamePat) +
+        lprofCurFilename.NumPids * (strlen(lprofCurFilename.PidChars) - 2) +
+        lprofCurFilename.NumHosts * (strlen(lprofCurFilename.Hostname) - 2);
+  if (lprofCurFilename.MergePoolSize)
+    Len += SIGLEN;
+  return Len;
+}
+
+/* Return the pointer to the current profile file name (after substituting
+ * PIDs and Hostnames in the filename pattern). \p FilenameBuf is the buffer
+ * to store the resulting filename. If no substitution is needed, the
+ * current filename pattern string is directly returned. */
+static const char *getCurFilename(char *FilenameBuf) {
+  int I, J, PidLength, HostNameLength;
+  const char *FilenamePat = lprofCurFilename.FilenamePat;
+
+  if (!lprofCurFilename.FilenamePat || !lprofCurFilename.FilenamePat[0])
+    return 0;
+
+  if (!(lprofCurFilename.NumPids || lprofCurFilename.NumHosts ||
+        lprofCurFilename.MergePoolSize))
+    return lprofCurFilename.FilenamePat;
+
+  PidLength = strlen(lprofCurFilename.PidChars);
+  HostNameLength = strlen(lprofCurFilename.Hostname);
+  /* Construct the new filename. */
+  for (I = 0, J = 0; FilenamePat[I]; ++I)
+    if (FilenamePat[I] == '%') {
+      if (FilenamePat[++I] == 'p') {
+        memcpy(FilenameBuf + J, lprofCurFilename.PidChars, PidLength);
+        J += PidLength;
+      } else if (FilenamePat[I] == 'h') {
+        memcpy(FilenameBuf + J, lprofCurFilename.Hostname, HostNameLength);
+        J += HostNameLength;
+      } else if (containsMergeSpecifier(FilenamePat, I)) {
+        char LoadModuleSignature[SIGLEN];
+        int S;
+        int ProfilePoolId = getpid() % lprofCurFilename.MergePoolSize;
+        S = snprintf(LoadModuleSignature, SIGLEN, "%" PRIu64 "_%d",
+                     lprofGetLoadModuleSignature(), ProfilePoolId);
+        if (S == -1 || S > SIGLEN)
+          S = SIGLEN;
+        memcpy(FilenameBuf + J, LoadModuleSignature, S);
+        J += S;
+        if (FilenamePat[I] != 'm')
+          I++;
+      }
+      /* Drop any unknown substitutions. */
+    } else
+      FilenameBuf[J++] = FilenamePat[I];
+  FilenameBuf[J] = 0;
+
+  return FilenameBuf;
+}
+
+/* Returns the pointer to the environment variable
+ * string. Returns null if the env var is not set. */
+static const char *getFilenamePatFromEnv(void) {
+  const char *Filename = getenv("LLVM_PROFILE_FILE");
+  if (!Filename || !Filename[0])
+    return 0;
+  return Filename;
+}
+
+/* This method is invoked by the runtime initialization hook
+ * InstrProfilingRuntime.o if it is linked in. Both user specified
+ * profile path via -fprofile-instr-generate= and LLVM_PROFILE_FILE
+ * environment variable can override this default value. */
 COMPILER_RT_VISIBILITY
 void __llvm_profile_initialize_file(void) {
-  /* Check if the filename has been initialized. */
-  if (__llvm_profile_CurrentFilename)
-    return;
+  const char *FilenamePat;
 
-  /* Detect the filename and truncate. */
-  setFilenameAutomatically();
+  FilenamePat = getFilenamePatFromEnv();
+  parseAndSetFilename(FilenamePat, FilenamePat ? PNS_environment : PNS_default);
 }
 
+/* This API is directly called by the user application code. It has the
+ * highest precedence compared with LLVM_PROFILE_FILE environment variable
+ * and command line option -fprofile-instr-generate=<profile_name>.
+ */
 COMPILER_RT_VISIBILITY
-void __llvm_profile_set_filename(const char *Filename) {
-  setFilenamePossiblyWithPid(Filename);
+void __llvm_profile_set_filename(const char *FilenamePat) {
+  parseAndSetFilename(FilenamePat, PNS_runtime_api);
 }
 
+/*
+ * This API is invoked by the global initializers emitted by Clang/LLVM when
+ * -fprofile-instr-generate=<..> is specified (vs -fprofile-instr-generate
+ *  without an argument). This option has lower precedence than the
+ *  LLVM_PROFILE_FILE environment variable.
+ */
 COMPILER_RT_VISIBILITY
-void __llvm_profile_override_default_filename(const char *Filename) {
-  /* If the env var is set, skip setting filename from argument. */
-  const char *Env_Filename = getenv("LLVM_PROFILE_FILE");
-  if (Env_Filename && Env_Filename[0])
-    return;
-  setFilenamePossiblyWithPid(Filename);
+void __llvm_profile_override_default_filename(const char *FilenamePat) {
+  parseAndSetFilename(FilenamePat, PNS_command_line);
 }
 
+/* The public API for writing profile data into the file with name
+ * set by previous calls to __llvm_profile_set_filename or
+ * __llvm_profile_override_default_filename or
+ * __llvm_profile_initialize_file. */
 COMPILER_RT_VISIBILITY
 int __llvm_profile_write_file(void) {
-  int rc;
+  int rc, Length;
+  const char *Filename;
+  char *FilenameBuf;
 
-  GetEnvHook = &getenv;
+  Length = getCurFilenameLength();
+  FilenameBuf = (char *)COMPILER_RT_ALLOCA(Length + 1);
+  Filename = getCurFilename(FilenameBuf);
+
   /* Check the filename. */
-  if (!__llvm_profile_CurrentFilename) {
-    PROF_ERR("LLVM Profile: Failed to write file : %s\n", "Filename not set");
+  if (!Filename) {
+    PROF_ERR("Failed to write file : %s\n", "Filename not set");
     return -1;
   }
 
-  /* Write the file. */
-  rc = writeFileWithName(__llvm_profile_CurrentFilename);
+  /* Check if there is llvm/runtime version mismatch.  */
+  if (GET_VERSION(__llvm_profile_get_version()) != INSTR_PROF_RAW_VERSION) {
+    PROF_ERR("Runtime and instrumentation version mismatch : "
+             "expected %d, but get %d\n",
+             INSTR_PROF_RAW_VERSION,
+             (int)GET_VERSION(__llvm_profile_get_version()));
+    return -1;
+  }
+
+  /* Write profile data to the file. */
+  rc = writeFile(Filename);
   if (rc)
-    PROF_ERR("LLVM Profile: Failed to write file \"%s\": %s\n",
-            __llvm_profile_CurrentFilename, strerror(errno));
+    PROF_ERR("Failed to write file \"%s\": %s\n", Filename, strerror(errno));
   return rc;
 }
 
@@ -216,6 +502,8 @@
   if (HasBeenRegistered)
     return 0;
 
+  lprofSetupValueProfiler();
+
   HasBeenRegistered = 1;
   return atexit(writeFileWithoutReturn);
 }
diff --git a/lib/profile/InstrProfilingInternal.h b/lib/profile/InstrProfilingInternal.h
index d247ca4..bcbe29a 100644
--- a/lib/profile/InstrProfilingInternal.h
+++ b/lib/profile/InstrProfilingInternal.h
@@ -39,7 +39,8 @@
     const uint64_t *CountersEnd, const char *NamesBegin, const char *NamesEnd);
 
 /*!
- * This is an internal function not intended to be used externally.
+ * The data structure describing the data to be written by the
+ * low level writer callback function.
  */
 typedef struct ProfDataIOVec {
   const void *Data;
@@ -49,18 +50,127 @@
 
 typedef uint32_t (*WriterCallback)(ProfDataIOVec *, uint32_t NumIOVecs,
                                    void **WriterCtx);
-int llvmWriteProfData(WriterCallback Writer, void *WriterCtx,
-                      const uint8_t *ValueDataBegin,
-                      const uint64_t ValueDataSize);
-int llvmWriteProfDataImpl(WriterCallback Writer, void *WriterCtx,
-                          const __llvm_profile_data *DataBegin,
-                          const __llvm_profile_data *DataEnd,
-                          const uint64_t *CountersBegin,
-                          const uint64_t *CountersEnd,
-                          const uint8_t *ValueDataBegin,
-                          const uint64_t ValueDataSize, const char *NamesBegin,
-                          const char *NamesEnd);
 
-extern char *(*GetEnvHook)(const char *);
+/*!
+ * The data structure for buffered IO of profile data.
+ */
+typedef struct ProfBufferIO {
+  /* File handle.  */
+  void *File;
+  /* Low level IO callback. */
+  WriterCallback FileWriter;
+  /* The start of the buffer. */
+  uint8_t *BufferStart;
+  /* Total size of the buffer. */
+  uint32_t BufferSz;
+  /* Current byte offset from the start of the buffer. */
+  uint32_t CurOffset;
+} ProfBufferIO;
+
+/* The creator interface used by testing.  */
+ProfBufferIO *lprofCreateBufferIOInternal(void *File, uint32_t BufferSz);
+
+/*!
+ * This is the interface to create a handle for buffered IO.
+ */
+ProfBufferIO *lprofCreateBufferIO(WriterCallback FileWriter, void *File);
+
+/*!
+ * The interface to destroy the bufferIO handle and reclaim
+ * the memory.
+ */
+void lprofDeleteBufferIO(ProfBufferIO *BufferIO);
+
+/*!
+ * This is the interface to write \c Data of \c Size bytes through
+ * \c BufferIO. Returns 0 if successful, otherwise return -1.
+ */
+int lprofBufferIOWrite(ProfBufferIO *BufferIO, const uint8_t *Data,
+                       uint32_t Size);
+/*!
+ * The interface to flush the remaining data in the buffer.
+ * through the low level writer callback.
+ */
+int lprofBufferIOFlush(ProfBufferIO *BufferIO);
+
+/* The low level interface to write data into a buffer. It is used as the
+ * callback by other high level writer methods such as buffered IO writer
+ * and profile data writer.  */
+uint32_t lprofBufferWriter(ProfDataIOVec *IOVecs, uint32_t NumIOVecs,
+                           void **WriterCtx);
+
+struct ValueProfData;
+struct ValueProfRecord;
+struct InstrProfValueData;
+struct ValueProfNode;
+
+/*!
+ * The class that defines a set of methods to read value profile
+ * data for streaming/serialization from the instrumentation runtime.
+ */
+typedef struct VPDataReaderType {
+  uint32_t (*InitRTRecord)(const __llvm_profile_data *Data,
+                           uint8_t *SiteCountArray[]);
+  /* Function pointer to getValueProfRecordHeader method. */
+  uint32_t (*GetValueProfRecordHeaderSize)(uint32_t NumSites);
+  /* Function pointer to the getFirstValueProfRecord method. */
+  struct ValueProfRecord *(*GetFirstValueProfRecord)(struct ValueProfData *);
+  /* Return the number of value data for site \p Site.  */
+  uint32_t (*GetNumValueDataForSite)(uint32_t VK, uint32_t Site);
+  /* Return the total size of the value profile data of the
+   * current function.  */
+  uint32_t (*GetValueProfDataSize)(void);
+  /*! 
+   * Read the next \p N value data for site \p Site and store the data
+   * in \p Dst. \p StartNode is the first value node to start with if
+   * it is not null. The function returns the pointer to the value
+   * node pointer to be used as the \p StartNode of the next batch reading.
+   * If there is nothing left, it returns NULL.
+   */
+  struct ValueProfNode *(*GetValueData)(uint32_t ValueKind, uint32_t Site,
+                                        struct InstrProfValueData *Dst,
+                                        struct ValueProfNode *StartNode,
+                                        uint32_t N);
+} VPDataReaderType;
+
+int lprofWriteData(WriterCallback Writer, void *WriterCtx,
+                   VPDataReaderType *VPDataReader);
+int lprofWriteDataImpl(WriterCallback Writer, void *WriterCtx,
+                       const __llvm_profile_data *DataBegin,
+                       const __llvm_profile_data *DataEnd,
+                       const uint64_t *CountersBegin,
+                       const uint64_t *CountersEnd,
+                       VPDataReaderType *VPDataReader, const char *NamesBegin,
+                       const char *NamesEnd);
+
+/* Merge value profile data pointed to by SrcValueProfData into
+ * in-memory profile counters pointed by to DstData.  */
+void lprofMergeValueProfData(struct ValueProfData *SrcValueProfData,
+                             __llvm_profile_data *DstData);
+
+VPDataReaderType *lprofGetVPDataReader();
+
+/* Internal interface used by tests to reset the max number of
+ * tracked values per value site to be \p MaxVals.
+ */
+void lprofSetMaxValsPerSite(uint32_t MaxVals);
+void lprofSetupValueProfiler();
+
+/* Return the profile header 'signature' value associated with the current
+ * executable or shared library. The signature value can be used to form
+ * a profile name that is unique to this load module so that it does not
+ * collide with profiles from other binaries. It also allows shared libraries
+ * to dump merged profile data into its own profile file. */
+uint64_t lprofGetLoadModuleSignature();
+
+COMPILER_RT_VISIBILITY extern char *(*GetEnvHook)(const char *);
+COMPILER_RT_VISIBILITY extern void (*FreeHook)(void *);
+COMPILER_RT_VISIBILITY extern uint8_t *DynamicBufferIOBuffer;
+COMPILER_RT_VISIBILITY extern uint32_t VPBufferSize;
+COMPILER_RT_VISIBILITY extern uint32_t VPMaxNumValsPerSite;
+/* Pointer to the start of static value counters to be allocated. */
+COMPILER_RT_VISIBILITY extern ValueProfNode *CurrentVNode;
+COMPILER_RT_VISIBILITY extern ValueProfNode *EndVNode;
+extern void (*VPMergeHook)(struct ValueProfData *, __llvm_profile_data *);
 
 #endif
diff --git a/lib/profile/InstrProfilingMerge.c b/lib/profile/InstrProfilingMerge.c
new file mode 100644
index 0000000..a202115
--- /dev/null
+++ b/lib/profile/InstrProfilingMerge.c
@@ -0,0 +1,132 @@
+/*===- InstrProfilingMerge.c - Profile in-process Merging  ---------------===*\
+|*
+|*                     The LLVM Compiler Infrastructure
+|*
+|* This file is distributed under the University of Illinois Open Source
+|* License. See LICENSE.TXT for details.
+|*
+|*===----------------------------------------------------------------------===*
+|* This file defines the API needed for in-process merging of profile data
+|* stored in memory buffer.
+\*===---------------------------------------------------------------------===*/
+
+#include "InstrProfiling.h"
+#include "InstrProfilingInternal.h"
+#include "InstrProfilingUtil.h"
+
+#define INSTR_PROF_VALUE_PROF_DATA
+#include "InstrProfData.inc"
+
+COMPILER_RT_WEAK void (*VPMergeHook)(ValueProfData *,
+                                     __llvm_profile_data *) = NULL;
+COMPILER_RT_VISIBILITY
+uint64_t lprofGetLoadModuleSignature() {
+  /* A very fast way to compute a module signature.  */
+  uint64_t CounterSize = (uint64_t)(__llvm_profile_end_counters() -
+                                    __llvm_profile_begin_counters());
+  uint64_t DataSize = __llvm_profile_get_data_size(__llvm_profile_begin_data(),
+                                                   __llvm_profile_end_data());
+  uint64_t NamesSize =
+      (uint64_t)(__llvm_profile_end_names() - __llvm_profile_begin_names());
+  uint64_t NumVnodes =
+      (uint64_t)(__llvm_profile_end_vnodes() - __llvm_profile_begin_vnodes());
+  const __llvm_profile_data *FirstD = __llvm_profile_begin_data();
+
+  return (NamesSize << 40) + (CounterSize << 30) + (DataSize << 20) +
+         (NumVnodes << 10) + (DataSize > 0 ? FirstD->NameRef : 0);
+}
+
+/* Returns 1 if profile is not structurally compatible.  */
+COMPILER_RT_VISIBILITY
+int __llvm_profile_check_compatibility(const char *ProfileData,
+                                       uint64_t ProfileSize) {
+  /* Check profile header only for now  */
+  __llvm_profile_header *Header = (__llvm_profile_header *)ProfileData;
+  __llvm_profile_data *SrcDataStart, *SrcDataEnd, *SrcData, *DstData;
+  SrcDataStart =
+      (__llvm_profile_data *)(ProfileData + sizeof(__llvm_profile_header));
+  SrcDataEnd = SrcDataStart + Header->DataSize;
+
+  if (ProfileSize < sizeof(__llvm_profile_header))
+    return 1;
+
+  /* Check the header first.  */
+  if (Header->Magic != __llvm_profile_get_magic() ||
+      Header->Version != __llvm_profile_get_version() ||
+      Header->DataSize !=
+          __llvm_profile_get_data_size(__llvm_profile_begin_data(),
+                                       __llvm_profile_end_data()) ||
+      Header->CountersSize != (uint64_t)(__llvm_profile_end_counters() -
+                                         __llvm_profile_begin_counters()) ||
+      Header->NamesSize != (uint64_t)(__llvm_profile_end_names() -
+                                      __llvm_profile_begin_names()) ||
+      Header->ValueKindLast != IPVK_Last)
+    return 1;
+
+  if (ProfileSize < sizeof(__llvm_profile_header) +
+                        Header->DataSize * sizeof(__llvm_profile_data) +
+                        Header->NamesSize + Header->CountersSize)
+    return 1;
+
+  for (SrcData = SrcDataStart,
+       DstData = (__llvm_profile_data *)__llvm_profile_begin_data();
+       SrcData < SrcDataEnd; ++SrcData, ++DstData) {
+    if (SrcData->NameRef != DstData->NameRef ||
+        SrcData->FuncHash != DstData->FuncHash ||
+        SrcData->NumCounters != DstData->NumCounters)
+      return 1;
+  }
+
+  /* Matched! */
+  return 0;
+}
+
+COMPILER_RT_VISIBILITY
+void __llvm_profile_merge_from_buffer(const char *ProfileData,
+                                      uint64_t ProfileSize) {
+  __llvm_profile_data *SrcDataStart, *SrcDataEnd, *SrcData, *DstData;
+  __llvm_profile_header *Header = (__llvm_profile_header *)ProfileData;
+  uint64_t *SrcCountersStart;
+  const char *SrcNameStart;
+  ValueProfData *SrcValueProfDataStart, *SrcValueProfData;
+
+  SrcDataStart =
+      (__llvm_profile_data *)(ProfileData + sizeof(__llvm_profile_header));
+  SrcDataEnd = SrcDataStart + Header->DataSize;
+  SrcCountersStart = (uint64_t *)SrcDataEnd;
+  SrcNameStart = (const char *)(SrcCountersStart + Header->CountersSize);
+  SrcValueProfDataStart =
+      (ValueProfData *)(SrcNameStart + Header->NamesSize +
+                        __llvm_profile_get_num_padding_bytes(
+                            Header->NamesSize));
+
+  for (SrcData = SrcDataStart,
+      DstData = (__llvm_profile_data *)__llvm_profile_begin_data(),
+      SrcValueProfData = SrcValueProfDataStart;
+       SrcData < SrcDataEnd; ++SrcData, ++DstData) {
+    uint64_t *SrcCounters;
+    uint64_t *DstCounters = (uint64_t *)DstData->CounterPtr;
+    unsigned I, NC, NVK = 0;
+
+    NC = SrcData->NumCounters;
+    SrcCounters = SrcCountersStart +
+                  ((size_t)SrcData->CounterPtr - Header->CountersDelta) /
+                      sizeof(uint64_t);
+    for (I = 0; I < NC; I++)
+      DstCounters[I] += SrcCounters[I];
+
+    /* Now merge value profile data.  */
+    if (!VPMergeHook)
+      continue;
+
+    for (I = 0; I <= IPVK_Last; I++)
+      NVK += (SrcData->NumValueSites[I] != 0);
+
+    if (!NVK)
+      continue;
+
+    VPMergeHook(SrcValueProfData, DstData);
+    SrcValueProfData = (ValueProfData *)((char *)SrcValueProfData +
+                                         SrcValueProfData->TotalSize);
+  }
+}
diff --git a/lib/profile/InstrProfilingMergeFile.c b/lib/profile/InstrProfilingMergeFile.c
new file mode 100644
index 0000000..ac5ee9f
--- /dev/null
+++ b/lib/profile/InstrProfilingMergeFile.c
@@ -0,0 +1,41 @@
+/*===- InstrProfilingMergeFile.c - Profile in-process Merging  ------------===*\
+|*
+|*                     The LLVM Compiler Infrastructure
+|*
+|* This file is distributed under the University of Illinois Open Source
+|* License. See LICENSE.TXT for details.
+|*
+|*===----------------------------------------------------------------------===
+|* This file defines APIs needed to support in-process merging for profile data
+|* stored in files.
+\*===----------------------------------------------------------------------===*/
+
+#include "InstrProfiling.h"
+#include "InstrProfilingInternal.h"
+#include "InstrProfilingUtil.h"
+
+#define INSTR_PROF_VALUE_PROF_DATA
+#include "InstrProfData.inc"
+
+void (*VPMergeHook)(ValueProfData *,
+                    __llvm_profile_data *) = &lprofMergeValueProfData;
+
+/* Merge value profile data pointed to by SrcValueProfData into
+ * in-memory profile counters pointed to by DstData.  */
+void lprofMergeValueProfData(ValueProfData *SrcValueProfData,
+                             __llvm_profile_data *DstData) {
+  unsigned I, S, V, C;
+  InstrProfValueData *VData;
+  ValueProfRecord *VR = getFirstValueProfRecord(SrcValueProfData);
+  for (I = 0; I < SrcValueProfData->NumValueKinds; I++) {
+    VData = getValueProfRecordValueData(VR);
+    for (S = 0; S < VR->NumValueSites; S++) {
+      uint8_t NV = VR->SiteCountArray[S];
+      for (V = 0; V < NV; V++) {
+        for (C = 0; C < VData[V].Count; C++)
+          __llvm_profile_instrument_target(VData[V].Value, DstData, S);
+      }
+    }
+    VR = getValueProfRecordNext(VR);
+  }
+}
diff --git a/lib/profile/InstrProfilingPlatformDarwin.c b/lib/profile/InstrProfilingPlatformDarwin.c
index 30ddbd2..8931aba 100644
--- a/lib/profile/InstrProfilingPlatformDarwin.c
+++ b/lib/profile/InstrProfilingPlatformDarwin.c
@@ -30,6 +30,13 @@
     CountersEnd __asm("section$end$__DATA$" INSTR_PROF_CNTS_SECT_NAME_STR);
 
 COMPILER_RT_VISIBILITY
+extern ValueProfNode
+    VNodesStart __asm("section$start$__DATA$" INSTR_PROF_VNODES_SECT_NAME_STR);
+COMPILER_RT_VISIBILITY
+extern ValueProfNode
+    VNodesEnd __asm("section$end$__DATA$" INSTR_PROF_VNODES_SECT_NAME_STR);
+
+COMPILER_RT_VISIBILITY
 const __llvm_profile_data *__llvm_profile_begin_data(void) {
   return &DataStart;
 }
@@ -43,4 +50,14 @@
 uint64_t *__llvm_profile_begin_counters(void) { return &CountersStart; }
 COMPILER_RT_VISIBILITY
 uint64_t *__llvm_profile_end_counters(void) { return &CountersEnd; }
+
+COMPILER_RT_VISIBILITY
+ValueProfNode *__llvm_profile_begin_vnodes(void) {
+  return &VNodesStart;
+}
+COMPILER_RT_VISIBILITY
+ValueProfNode *__llvm_profile_end_vnodes(void) { return &VNodesEnd; }
+
+COMPILER_RT_VISIBILITY ValueProfNode *CurrentVNode = &VNodesStart;
+COMPILER_RT_VISIBILITY ValueProfNode *EndVNode = &VNodesEnd;
 #endif
diff --git a/lib/profile/InstrProfilingPlatformLinux.c b/lib/profile/InstrProfilingPlatformLinux.c
index 7843f47..b6c780f 100644
--- a/lib/profile/InstrProfilingPlatformLinux.c
+++ b/lib/profile/InstrProfilingPlatformLinux.c
@@ -18,6 +18,8 @@
 #define PROF_NAME_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_NAME_SECT_NAME)
 #define PROF_CNTS_START INSTR_PROF_SECT_START(INSTR_PROF_CNTS_SECT_NAME)
 #define PROF_CNTS_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_CNTS_SECT_NAME)
+#define PROF_VNODES_START INSTR_PROF_SECT_START(INSTR_PROF_VNODES_SECT_NAME)
+#define PROF_VNODES_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_VNODES_SECT_NAME)
 
 /* Declare section start and stop symbols for various sections
  * generated by compiler instrumentation.
@@ -28,6 +30,8 @@
 extern uint64_t PROF_CNTS_STOP COMPILER_RT_VISIBILITY;
 extern char PROF_NAME_START COMPILER_RT_VISIBILITY;
 extern char PROF_NAME_STOP COMPILER_RT_VISIBILITY;
+extern ValueProfNode PROF_VNODES_START COMPILER_RT_VISIBILITY;
+extern ValueProfNode PROF_VNODES_STOP COMPILER_RT_VISIBILITY;
 
 /* Add dummy data to ensure the section is always created. */
 __llvm_profile_data
@@ -35,6 +39,7 @@
 uint64_t
     __prof_cnts_sect_data[0] COMPILER_RT_SECTION(INSTR_PROF_CNTS_SECT_NAME_STR);
 char __prof_nms_sect_data[0] COMPILER_RT_SECTION(INSTR_PROF_NAME_SECT_NAME_STR);
+ValueProfNode __prof_vnodes_sect_data[0] COMPILER_RT_SECTION(INSTR_PROF_VNODES_SECT_NAME_STR);
 
 COMPILER_RT_VISIBILITY const __llvm_profile_data *
 __llvm_profile_begin_data(void) {
@@ -56,4 +61,15 @@
 COMPILER_RT_VISIBILITY uint64_t *__llvm_profile_end_counters(void) {
   return &PROF_CNTS_STOP;
 }
+
+COMPILER_RT_VISIBILITY ValueProfNode *
+__llvm_profile_begin_vnodes(void) {
+  return &PROF_VNODES_START;
+}
+COMPILER_RT_VISIBILITY ValueProfNode *__llvm_profile_end_vnodes(void) {
+  return &PROF_VNODES_STOP;
+}
+COMPILER_RT_VISIBILITY ValueProfNode *CurrentVNode = &PROF_VNODES_START;
+COMPILER_RT_VISIBILITY ValueProfNode *EndVNode = &PROF_VNODES_STOP;
+
 #endif
diff --git a/lib/profile/InstrProfilingPlatformOther.c b/lib/profile/InstrProfilingPlatformOther.c
index 58ceb34..b259664 100644
--- a/lib/profile/InstrProfilingPlatformOther.c
+++ b/lib/profile/InstrProfilingPlatformOther.c
@@ -10,6 +10,7 @@
 #include "InstrProfiling.h"
 
 #if !defined(__APPLE__) && !defined(__linux__) && !defined(__FreeBSD__)
+
 #include <stdlib.h>
 
 static const __llvm_profile_data *DataFirst = NULL;
@@ -19,6 +20,14 @@
 static uint64_t *CountersFirst = NULL;
 static uint64_t *CountersLast = NULL;
 
+static const void *getMinAddr(const void *A1, const void *A2) {
+  return A1 < A2 ? A1 : A2;
+}
+
+static const void *getMaxAddr(const void *A1, const void *A2) {
+  return A1 > A2 ? A1 : A2;
+}
+
 /*!
  * \brief Register an instrumented function.
  *
@@ -33,24 +42,30 @@
   if (!DataFirst) {
     DataFirst = Data;
     DataLast = Data + 1;
-    NamesFirst = Data->NamePtr;
-    NamesLast = (const char *)Data->NamePtr + Data->NameSize;
     CountersFirst = Data->CounterPtr;
     CountersLast = (uint64_t *)Data->CounterPtr + Data->NumCounters;
     return;
   }
 
-#define UPDATE_FIRST(First, New) First = New < First ? New : First
-  UPDATE_FIRST(DataFirst, Data);
-  UPDATE_FIRST(NamesFirst, (const char *)Data->NamePtr);
-  UPDATE_FIRST(CountersFirst, (uint64_t *)Data->CounterPtr);
-#undef UPDATE_FIRST
+  DataFirst = (const __llvm_profile_data *)getMinAddr(DataFirst, Data);
+  CountersFirst = (uint64_t *)getMinAddr(CountersFirst, Data->CounterPtr);
 
-#define UPDATE_LAST(Last, New) Last = New > Last ? New : Last
-  UPDATE_LAST(DataLast, Data + 1);
-  UPDATE_LAST(NamesLast, (const char *)Data->NamePtr + Data->NameSize);
-  UPDATE_LAST(CountersLast, (uint64_t *)Data->CounterPtr + Data->NumCounters);
-#undef UPDATE_LAST
+  DataLast = (const __llvm_profile_data *)getMaxAddr(DataLast, Data + 1);
+  CountersLast = (uint64_t *)getMaxAddr(
+      CountersLast, (uint64_t *)Data->CounterPtr + Data->NumCounters);
+}
+
+COMPILER_RT_VISIBILITY
+void __llvm_profile_register_names_function(void *NamesStart,
+                                            uint64_t NamesSize) {
+  if (!NamesFirst) {
+    NamesFirst = (const char *)NamesStart;
+    NamesLast = (const char *)NamesStart + NamesSize;
+    return;
+  }
+  NamesFirst = (const char *)getMinAddr(NamesFirst, NamesStart);
+  NamesLast =
+      (const char *)getMaxAddr(NamesLast, (const char *)NamesStart + NamesSize);
 }
 
 COMPILER_RT_VISIBILITY
@@ -65,4 +80,15 @@
 uint64_t *__llvm_profile_begin_counters(void) { return CountersFirst; }
 COMPILER_RT_VISIBILITY
 uint64_t *__llvm_profile_end_counters(void) { return CountersLast; }
+
+COMPILER_RT_VISIBILITY
+ValueProfNode *__llvm_profile_begin_vnodes(void) {
+  return 0;
+}
+COMPILER_RT_VISIBILITY
+ValueProfNode *__llvm_profile_end_vnodes(void) { return 0; }
+
+COMPILER_RT_VISIBILITY ValueProfNode *CurrentVNode = 0;
+COMPILER_RT_VISIBILITY ValueProfNode *EndVNode = 0;
+
 #endif
diff --git a/lib/profile/InstrProfilingPort.h b/lib/profile/InstrProfilingPort.h
index da4f18f..4fd8aca 100644
--- a/lib/profile/InstrProfilingPort.h
+++ b/lib/profile/InstrProfilingPort.h
@@ -13,60 +13,92 @@
 #ifdef _MSC_VER
 #define COMPILER_RT_ALIGNAS(x) __declspec(align(x))
 #define COMPILER_RT_VISIBILITY
+/* FIXME: selectany does not have the same semantics as weak. */
 #define COMPILER_RT_WEAK __declspec(selectany)
+/* Need to include <windows.h> */
+#define COMPILER_RT_ALLOCA _alloca
+/* Need to include <stdio.h> and <io.h> */
+#define COMPILER_RT_FTRUNCATE(f,l) _chsize(_fileno(f),l)
 #elif __GNUC__
 #define COMPILER_RT_ALIGNAS(x) __attribute__((aligned(x)))
 #define COMPILER_RT_VISIBILITY __attribute__((visibility("hidden")))
 #define COMPILER_RT_WEAK __attribute__((weak))
+#define COMPILER_RT_ALLOCA __builtin_alloca
+#define COMPILER_RT_FTRUNCATE(f,l) ftruncate(fileno(f),l)
 #endif
 
+#if defined(__APPLE__)
+#define COMPILER_RT_SEG "__DATA,"
+#else
+#define COMPILER_RT_SEG ""
+#endif
+
+#ifdef _MSC_VER
+#define COMPILER_RT_SECTION(Sect) __declspec(allocate(Sect))
+#else
 #define COMPILER_RT_SECTION(Sect) __attribute__((section(Sect)))
+#endif
+
+#define COMPILER_RT_MAX_HOSTLEN 128
+#ifdef _MSC_VER
+#define COMPILER_RT_GETHOSTNAME(Name, Len) gethostname(Name, Len)
+#elif defined(__ORBIS__)
+#define COMPILER_RT_GETHOSTNAME(Name, Len) ((void)(Name), (void)(Len), (-1))
+#else
+#define COMPILER_RT_GETHOSTNAME(Name, Len) lprofGetHostName(Name, Len)
+#define COMPILER_RT_HAS_UNAME 1
+#endif
 
 #if COMPILER_RT_HAS_ATOMICS == 1
 #ifdef _MSC_VER
 #include <windows.h>
+#if _MSC_VER < 1900
+#define snprintf _snprintf
+#endif
 #if defined(_WIN64)
 #define COMPILER_RT_BOOL_CMPXCHG(Ptr, OldV, NewV)                              \
   (InterlockedCompareExchange64((LONGLONG volatile *)Ptr, (LONGLONG)NewV,      \
                                 (LONGLONG)OldV) == (LONGLONG)OldV)
-#else
+#define COMPILER_RT_PTR_FETCH_ADD(DomType, PtrVar, PtrIncr)                    \
+  (DomType *)InterlockedExchangeAdd64((LONGLONG volatile *)&PtrVar,            \
+                                      (LONGLONG)sizeof(DomType) * PtrIncr)
+#else /* !defined(_WIN64) */
 #define COMPILER_RT_BOOL_CMPXCHG(Ptr, OldV, NewV)                              \
   (InterlockedCompareExchange((LONG volatile *)Ptr, (LONG)NewV, (LONG)OldV) == \
    (LONG)OldV)
+#define COMPILER_RT_PTR_FETCH_ADD(DomType, PtrVar, PtrIncr)                    \
+  (DomType *)InterlockedExchangeAdd((LONG volatile *)&PtrVar,                  \
+                                    (LONG)sizeof(DomType) * PtrIncr)
 #endif
-#else
+#else /* !defined(_MSC_VER) */
 #define COMPILER_RT_BOOL_CMPXCHG(Ptr, OldV, NewV)                              \
   __sync_bool_compare_and_swap(Ptr, OldV, NewV)
+#define COMPILER_RT_PTR_FETCH_ADD(DomType, PtrVar, PtrIncr)                    \
+  (DomType *)__sync_fetch_and_add((long *)&PtrVar, sizeof(DomType) * PtrIncr)
 #endif
-#else
+#else /* COMPILER_RT_HAS_ATOMICS != 1 */
+#include "InstrProfilingUtil.h"
 #define COMPILER_RT_BOOL_CMPXCHG(Ptr, OldV, NewV)                              \
-  BoolCmpXchg((void **)Ptr, OldV, NewV)
+  lprofBoolCmpXchg((void **)Ptr, OldV, NewV)
+#define COMPILER_RT_PTR_FETCH_ADD(DomType, PtrVar, PtrIncr)                    \
+  (DomType *)lprofPtrFetchAdd((void **)&PtrVar, sizeof(DomType) * PtrIncr)
 #endif
 
 #define PROF_ERR(Format, ...)                                                  \
-  if (GetEnvHook && GetEnvHook("LLVM_PROFILE_VERBOSE_ERRORS"))                 \
-    fprintf(stderr, Format, __VA_ARGS__);
+  fprintf(stderr, "LLVM Profile Error: " Format, __VA_ARGS__);
 
-#if defined(__FreeBSD__) && defined(__i386__)
+#define PROF_WARN(Format, ...)                                                 \
+  fprintf(stderr, "LLVM Profile Warning: " Format, __VA_ARGS__);
 
-/* System headers define 'size_t' incorrectly on x64 FreeBSD (prior to
- * FreeBSD 10, r232261) when compiled in 32-bit mode.
- */
-#define PRIu64 "llu"
-typedef unsigned char uint8_t;
-typedef unsigned short uint16_t;
-typedef unsigned int uint32_t;
-typedef unsigned long long uint64_t;
-typedef uint32_t uintptr_t;
-#elif defined(__FreeBSD__) && defined(__x86_64__)
-#define PRIu64 "lu"
-typedef unsigned char uint8_t;
-typedef unsigned short uint16_t;
-typedef unsigned int uint32_t;
-typedef unsigned long long uint64_t;
-typedef unsigned long int uintptr_t;
+#define PROF_NOTE(Format, ...)                                                 \
+  fprintf(stderr, "LLVM Profile Note: " Format, __VA_ARGS__);
 
-#else /* defined(__FreeBSD__) && defined(__i386__) */
+#if defined(__FreeBSD__)
+
+#include <inttypes.h>
+#include <sys/types.h>
+
+#else /* defined(__FreeBSD__) */
 
 #include <inttypes.h>
 #include <stdint.h>
diff --git a/lib/profile/InstrProfilingUtil.c b/lib/profile/InstrProfilingUtil.c
index 6f0443d..dc58d73 100644
--- a/lib/profile/InstrProfilingUtil.c
+++ b/lib/profile/InstrProfilingUtil.c
@@ -12,25 +12,123 @@
 
 #ifdef _WIN32
 #include <direct.h>
-#elif I386_FREEBSD
-int mkdir(const char*, unsigned short);
+#include <io.h>
+#include <windows.h>
 #else
 #include <sys/stat.h>
 #include <sys/types.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <errno.h>
 #endif
 
+#ifdef COMPILER_RT_HAS_UNAME
+#include <sys/utsname.h>
+#endif
+
+#include <string.h>
+
 COMPILER_RT_VISIBILITY
 void __llvm_profile_recursive_mkdir(char *path) {
   int i;
 
   for (i = 1; path[i] != '\0'; ++i) {
-    if (path[i] != '/') continue;
+    char save = path[i];
+    if (!(path[i] == '/' || path[i] == '\\'))
+      continue;
     path[i] = '\0';
 #ifdef _WIN32
     _mkdir(path);
 #else
-    mkdir(path, 0755);  /* Some of these will fail, ignore it. */
+    mkdir(path, 0755); /* Some of these will fail, ignore it. */
 #endif
-    path[i] = '/';
+    path[i] = save;
   }
 }
+
+#if COMPILER_RT_HAS_ATOMICS != 1
+COMPILER_RT_VISIBILITY
+uint32_t lprofBoolCmpXchg(void **Ptr, void *OldV, void *NewV) {
+  void *R = *Ptr;
+  if (R == OldV) {
+    *Ptr = NewV;
+    return 1;
+  }
+  return 0;
+}
+COMPILER_RT_VISIBILITY
+void *lprofPtrFetchAdd(void **Mem, long ByteIncr) {
+  void *Old = *Mem;
+  *((char **)Mem) += ByteIncr;
+  return Old;
+}
+
+#endif
+
+#ifdef COMPILER_RT_HAS_UNAME
+COMPILER_RT_VISIBILITY int lprofGetHostName(char *Name, int Len) {
+  struct utsname N;
+  int R;
+  if (!(R = uname(&N)))
+    strncpy(Name, N.nodename, Len);
+  return R;
+}
+#endif
+
+COMPILER_RT_VISIBILITY FILE *lprofOpenFileEx(const char *ProfileName) {
+  FILE *f;
+  int fd;
+#ifdef COMPILER_RT_HAS_FCNTL_LCK
+  struct flock s_flock;
+
+  s_flock.l_whence = SEEK_SET;
+  s_flock.l_start = 0;
+  s_flock.l_len = 0; /* Until EOF.  */
+  s_flock.l_pid = getpid();
+
+  s_flock.l_type = F_WRLCK;
+  fd = open(ProfileName, O_RDWR | O_CREAT, 0666);
+  if (fd < 0)
+    return NULL;
+
+  while (fcntl(fd, F_SETLKW, &s_flock) == -1) {
+    if (errno != EINTR) {
+      if (errno == ENOLCK) {
+        PROF_WARN("Data may be corrupted during profile merging : %s\n",
+                  "Fail to obtain file lock due to system limit.");
+      }
+      break;
+    }
+  }
+
+  f = fdopen(fd, "r+b");
+#elif defined(_WIN32)
+  // FIXME: Use the wide variants to handle Unicode filenames.
+  HANDLE h = CreateFileA(ProfileName, GENERIC_READ | GENERIC_WRITE, 0, 0,
+                         OPEN_ALWAYS, FILE_ATTRIBUTE_NORMAL, 0);
+  if (h == INVALID_HANDLE_VALUE)
+    return NULL;
+
+  fd = _open_osfhandle((intptr_t)h, 0);
+  if (fd == -1) {
+    CloseHandle(h);
+    return NULL;
+  }
+
+  f = _fdopen(fd, "r+b");
+  if (f == 0) {
+    CloseHandle(h);
+    return NULL;
+  }
+#else
+  /* Worst case no locking applied.  */
+  PROF_WARN("Concurrent file access is not supported : %s\n",
+            "lack file locking");
+  fd = open(ProfileName, O_RDWR | O_CREAT, 0666);
+  if (fd < 0)
+    return NULL;
+  f = fdopen(fd, "r+b");
+#endif
+
+  return f;
+}
diff --git a/lib/profile/InstrProfilingUtil.h b/lib/profile/InstrProfilingUtil.h
index 756b18e..16d3fbf 100644
--- a/lib/profile/InstrProfilingUtil.h
+++ b/lib/profile/InstrProfilingUtil.h
@@ -10,7 +10,24 @@
 #ifndef PROFILE_INSTRPROFILINGUTIL_H
 #define PROFILE_INSTRPROFILINGUTIL_H
 
+#include <stddef.h>
+#include <stdio.h>
+
 /*! \brief Create a directory tree. */
 void __llvm_profile_recursive_mkdir(char *Pathname);
 
-#endif  /* PROFILE_INSTRPROFILINGUTIL_H */
+/*! Open file \c Filename for read+write with write
+ * lock for exclusive access. The caller will block
+ * if the lock is already held by another process. */
+FILE *lprofOpenFileEx(const char *Filename);
+/* PS4 doesn't have getenv. Define a shim. */
+#if __ORBIS__
+static inline char *getenv(const char *name) { return NULL; }
+#endif /* #if __ORBIS__ */
+
+int lprofGetHostName(char *Name, int Len);
+
+unsigned lprofBoolCmpXchg(void **Ptr, void *OldV, void *NewV);
+void *lprofPtrFetchAdd(void **Mem, long ByteIncr);
+
+#endif /* PROFILE_INSTRPROFILINGUTIL_H */
diff --git a/lib/profile/InstrProfilingValue.c b/lib/profile/InstrProfilingValue.c
index 4888eec..93957e3 100644
--- a/lib/profile/InstrProfilingValue.c
+++ b/lib/profile/InstrProfilingValue.c
@@ -9,6 +9,7 @@
 
 #include "InstrProfiling.h"
 #include "InstrProfilingInternal.h"
+#include "InstrProfilingUtil.h" /* For PS4 getenv shim. */
 #include <limits.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -17,25 +18,40 @@
 #define INSTR_PROF_COMMON_API_IMPL
 #include "InstrProfData.inc"
 
-#define PROF_OOM(Msg) PROF_ERR(Msg ":%s\n", "Out of memory");
-#define PROF_OOM_RETURN(Msg)                                                   \
-  {                                                                            \
-    PROF_OOM(Msg)                                                              \
-    return 0;                                                                  \
-  }
+static int hasStaticCounters = 1;
+static int OutOfNodesWarnings = 0;
+static int hasNonDefaultValsPerSite = 0;
+#define INSTR_PROF_MAX_VP_WARNS 10
+#define INSTR_PROF_DEFAULT_NUM_VAL_PER_SITE 8
+#define INSTR_PROF_VNODE_POOL_SIZE 1024
 
-#if COMPILER_RT_HAS_ATOMICS != 1
-COMPILER_RT_VISIBILITY
-uint32_t BoolCmpXchg(void **Ptr, void *OldV, void *NewV) {
-  void *R = *Ptr;
-  if (R == OldV) {
-    *Ptr = NewV;
-    return 1;
-  }
-  return 0;
-}
+#ifndef _MSC_VER
+/* A shared static pool in addition to the vnodes statically
+ * allocated by the compiler.  */
+COMPILER_RT_VISIBILITY ValueProfNode
+    lprofValueProfNodes[INSTR_PROF_VNODE_POOL_SIZE] COMPILER_RT_SECTION(
+       COMPILER_RT_SEG INSTR_PROF_VNODES_SECT_NAME_STR);
 #endif
 
+COMPILER_RT_VISIBILITY uint32_t VPMaxNumValsPerSite =
+    INSTR_PROF_DEFAULT_NUM_VAL_PER_SITE;
+
+COMPILER_RT_VISIBILITY void lprofSetupValueProfiler() {
+  const char *Str = 0;
+  Str = getenv("LLVM_VP_MAX_NUM_VALS_PER_SITE");
+  if (Str && Str[0]) {
+    VPMaxNumValsPerSite = atoi(Str);
+    hasNonDefaultValsPerSite = 1;
+  }
+  if (VPMaxNumValsPerSite > INSTR_PROF_MAX_NUM_VAL_PER_SITE)
+    VPMaxNumValsPerSite = INSTR_PROF_MAX_NUM_VAL_PER_SITE;
+}
+
+COMPILER_RT_VISIBILITY void lprofSetMaxValsPerSite(uint32_t MaxVals) {
+  VPMaxNumValsPerSite = MaxVals;
+  hasNonDefaultValsPerSite = 1;
+}
+
 /* This method is only used in value profiler mock testing.  */
 COMPILER_RT_VISIBILITY void
 __llvm_profile_set_num_value_sites(__llvm_profile_data *Data,
@@ -64,6 +80,15 @@
 static int allocateValueProfileCounters(__llvm_profile_data *Data) {
   uint64_t NumVSites = 0;
   uint32_t VKI;
+
+  /* This function will never be called when value site array is allocated
+     statically at compile time.  */
+  hasStaticCounters = 0;
+  /* When dynamic allocation is enabled, allow tracking the max number of
+   * values allowed.  */
+  if (!hasNonDefaultValsPerSite)
+    VPMaxNumValsPerSite = INSTR_PROF_MAX_NUM_VAL_PER_SITE;
+
   for (VKI = IPVK_First; VKI <= IPVK_Last; ++VKI)
     NumVSites += Data->NumValueSites[VKI];
 
@@ -78,28 +103,36 @@
   return 1;
 }
 
-static void deallocateValueProfileCounters(__llvm_profile_data *Data) {
-  uint64_t NumVSites = 0, I;
-  uint32_t VKI;
-  if (!Data->Values)
-    return;
-  for (VKI = IPVK_First; VKI <= IPVK_Last; ++VKI)
-    NumVSites += Data->NumValueSites[VKI];
-  for (I = 0; I < NumVSites; I++) {
-    ValueProfNode *Node = ((ValueProfNode **)Data->Values)[I];
-    while (Node) {
-      ValueProfNode *Next = Node->Next;
-      free(Node);
-      Node = Next;
+static ValueProfNode *allocateOneNode(__llvm_profile_data *Data, uint32_t Index,
+                                      uint64_t Value) {
+  ValueProfNode *Node;
+
+  if (!hasStaticCounters)
+    return (ValueProfNode *)calloc(1, sizeof(ValueProfNode));
+
+  /* Early check to avoid value wrapping around.  */
+  if (CurrentVNode + 1 > EndVNode) {
+    if (OutOfNodesWarnings++ < INSTR_PROF_MAX_VP_WARNS) {
+      PROF_WARN("Unable to track new values: %s. "
+                " Consider using option -mllvm -vp-counters-per-site=<n> to "
+                "allocate more"
+                " value profile counters at compile time. \n",
+                "Running out of static counters");
     }
+    return 0;
   }
-  free(Data->Values);
+  Node = COMPILER_RT_PTR_FETCH_ADD(ValueProfNode, CurrentVNode, 1);
+  /* Due to section padding, EndVNode points to a byte which is one past
+   * an incomplete VNode, so we need to skip the last incomplete node. */
+  if (Node + 1 > EndVNode)
+    return 0;
+
+  return Node;
 }
 
 COMPILER_RT_VISIBILITY void
 __llvm_profile_instrument_target(uint64_t TargetValue, void *Data,
                                  uint32_t CounterIndex) {
-
   __llvm_profile_data *PData = (__llvm_profile_data *)Data;
   if (!PData)
     return;
@@ -111,128 +144,184 @@
 
   ValueProfNode **ValueCounters = (ValueProfNode **)PData->Values;
   ValueProfNode *PrevVNode = NULL;
-  ValueProfNode *CurrentVNode = ValueCounters[CounterIndex];
+  ValueProfNode *MinCountVNode = NULL;
+  ValueProfNode *CurVNode = ValueCounters[CounterIndex];
+  uint64_t MinCount = UINT64_MAX;
 
   uint8_t VDataCount = 0;
-  while (CurrentVNode) {
-    if (TargetValue == CurrentVNode->VData.Value) {
-      CurrentVNode->VData.Count++;
+  while (CurVNode) {
+    if (TargetValue == CurVNode->Value) {
+      CurVNode->Count++;
       return;
     }
-    PrevVNode = CurrentVNode;
-    CurrentVNode = CurrentVNode->Next;
+    if (CurVNode->Count < MinCount) {
+      MinCount = CurVNode->Count;
+      MinCountVNode = CurVNode;
+    }
+    PrevVNode = CurVNode;
+    CurVNode = CurVNode->Next;
     ++VDataCount;
   }
 
-  if (VDataCount >= UCHAR_MAX)
+  if (VDataCount >= VPMaxNumValsPerSite) {
+    /* Bump down the min count node's count. If it reaches 0,
+     * evict it. This eviction/replacement policy makes hot
+     * targets more sticky while cold targets less so. In other
+     * words, it makes it less likely for the hot targets to be
+     * prematurely evicted during warmup/establishment period,
+     * when their counts are still low. In a special case when
+     * the number of values tracked is reduced to only one, this
+     * policy will guarantee that the dominating target with >50%
+     * total count will survive in the end. Note that this scheme
+     * allows the runtime to track the min count node in an adaptive
+     * manner. It can correct previous mistakes and eventually
+     * lock on a cold target that is already in a stable state.
+     *
+     * In very rare cases, this replacement scheme may still lead
+     * to target loss. For instance, out of \c N value slots, \c N-1
+     * slots are occupied by luke warm targets during the warmup
+     * period and the remaining one slot is competed by two or more
+     * very hot targets. If those hot targets occur in an interleaved
+     * way, none of them will survive (gain enough weight to throw out
+     * other established entries) due to the ping-pong effect.
+     * To handle this situation, user can choose to increase the max
+     * number of tracked values per value site. Alternatively, a more
+     * expensive eviction mechanism can be implemented. It requires
+     * the runtime to track the total number of evictions per-site.
+     * When the total number of evictions reaches certain threshold,
+     * the runtime can wipe out more than one lowest count entries
+     * to give space for hot targets.
+     */
+    if (!(--MinCountVNode->Count)) {
+      CurVNode = MinCountVNode;
+      CurVNode->Value = TargetValue;
+      CurVNode->Count++;
+    }
     return;
+  }
 
-  CurrentVNode = (ValueProfNode *)calloc(1, sizeof(ValueProfNode));
-  if (!CurrentVNode)
+  CurVNode = allocateOneNode(PData, CounterIndex, TargetValue);
+  if (!CurVNode)
     return;
-
-  CurrentVNode->VData.Value = TargetValue;
-  CurrentVNode->VData.Count++;
+  CurVNode->Value = TargetValue;
+  CurVNode->Count++;
 
   uint32_t Success = 0;
   if (!ValueCounters[CounterIndex])
     Success =
-        COMPILER_RT_BOOL_CMPXCHG(&ValueCounters[CounterIndex], 0, CurrentVNode);
+        COMPILER_RT_BOOL_CMPXCHG(&ValueCounters[CounterIndex], 0, CurVNode);
   else if (PrevVNode && !PrevVNode->Next)
-    Success = COMPILER_RT_BOOL_CMPXCHG(&(PrevVNode->Next), 0, CurrentVNode);
+    Success = COMPILER_RT_BOOL_CMPXCHG(&(PrevVNode->Next), 0, CurVNode);
 
-  if (!Success) {
-    free(CurrentVNode);
+  if (!Success && !hasStaticCounters) {
+    free(CurVNode);
     return;
   }
 }
 
-/* For multi-threaded programs, while the profile is being dumped, other
-   threads may still be updating the value profile data and creating new
-   value entries. To accommadate this, we need to add extra bytes to the
-   data buffer. The size of the extra space is controlled by an environment
-   variable. */
-static unsigned getVprofExtraBytes() {
-  const char *ExtraStr =
-      GetEnvHook ? GetEnvHook("LLVM_VALUE_PROF_BUFFER_EXTRA") : 0;
-  if (!ExtraStr || !ExtraStr[0])
-    return 1024;
-  return (unsigned)atoi(ExtraStr);
+/*
+ * A wrapper struct that represents value profile runtime data.
+ * Like InstrProfRecord class which is used by profiling host tools,
+ * ValueProfRuntimeRecord also implements the abstract interfaces defined in
+ * ValueProfRecordClosure so that the runtime data can be serialized using
+ * shared C implementation.
+ */
+typedef struct ValueProfRuntimeRecord {
+  const __llvm_profile_data *Data;
+  ValueProfNode **NodesKind[IPVK_Last + 1];
+  uint8_t **SiteCountArray;
+} ValueProfRuntimeRecord;
+
+/* ValueProfRecordClosure Interface implementation. */
+
+static uint32_t getNumValueSitesRT(const void *R, uint32_t VK) {
+  return ((const ValueProfRuntimeRecord *)R)->Data->NumValueSites[VK];
 }
 
-/* Extract the value profile data info from the runtime. */
-#define DEF_VALUE_RECORD(R, NS, V)                                             \
-  ValueProfRuntimeRecord R;                                                    \
-  if (initializeValueProfRuntimeRecord(&R, NS, V))                             \
-    PROF_OOM_RETURN("Failed to write value profile data ");
-
-#define DTOR_VALUE_RECORD(R) finalizeValueProfRuntimeRecord(&R);
-
-COMPILER_RT_VISIBILITY uint64_t
-__llvm_profile_gather_value_data(uint8_t **VDataArray) {
-  size_t S = 0, RealSize = 0, BufferCapacity = 0, Extra = 0;
-  __llvm_profile_data *I;
-  if (!VDataArray)
-    PROF_OOM_RETURN("Failed to write value profile data ");
-
-  const __llvm_profile_data *DataEnd = __llvm_profile_end_data();
-  const __llvm_profile_data *DataBegin = __llvm_profile_begin_data();
-
-  /*
-   * Compute the total Size of the buffer to hold ValueProfData
-   * structures for functions with value profile data.
-   */
-  for (I = (__llvm_profile_data *)DataBegin; I != DataEnd; ++I) {
-
-    DEF_VALUE_RECORD(R, I->NumValueSites, I->Values);
-
-    /* Compute the size of ValueProfData from this runtime record.  */
-    if (getNumValueKindsRT(&R) != 0)
-      S += getValueProfDataSizeRT(&R);
-
-    DTOR_VALUE_RECORD(R);
-  }
-  /* No value sites or no value profile data is collected. */
-  if (!S)
+static uint32_t getNumValueDataRT(const void *R, uint32_t VK) {
+  uint32_t S = 0, I;
+  const ValueProfRuntimeRecord *Record = (const ValueProfRuntimeRecord *)R;
+  if (Record->SiteCountArray[VK] == INSTR_PROF_NULLPTR)
     return 0;
+  for (I = 0; I < Record->Data->NumValueSites[VK]; I++)
+    S += Record->SiteCountArray[VK][I];
+  return S;
+}
 
-  Extra = getVprofExtraBytes();
-  BufferCapacity = S + Extra;
-  *VDataArray = calloc(BufferCapacity, sizeof(uint8_t));
-  if (!*VDataArray)
-    PROF_OOM_RETURN("Failed to write value profile data ");
+static uint32_t getNumValueDataForSiteRT(const void *R, uint32_t VK,
+                                         uint32_t S) {
+  const ValueProfRuntimeRecord *Record = (const ValueProfRuntimeRecord *)R;
+  return Record->SiteCountArray[VK][S];
+}
 
-  ValueProfData *VD = (ValueProfData *)(*VDataArray);
-  /*
-   * Extract value profile data and write into ValueProfData structure
-   * one by one. Note that new value profile data added to any value
-   * site (from another thread) after the ValueProfRuntimeRecord is
-   * initialized (when the profile data snapshot is taken) won't be
-   * collected. This is not a problem as those dropped value will have
-   * very low taken count.
-   */
-  for (I = (__llvm_profile_data *)DataBegin; I != DataEnd; ++I) {
-    DEF_VALUE_RECORD(R, I->NumValueSites, I->Values);
-    if (getNumValueKindsRT(&R) == 0)
+static ValueProfRuntimeRecord RTRecord;
+static ValueProfRecordClosure RTRecordClosure = {
+    &RTRecord,          INSTR_PROF_NULLPTR, /* GetNumValueKinds */
+    getNumValueSitesRT, getNumValueDataRT,  getNumValueDataForSiteRT,
+    INSTR_PROF_NULLPTR, /* RemapValueData */
+    INSTR_PROF_NULLPTR, /* GetValueForSite, */
+    INSTR_PROF_NULLPTR  /* AllocValueProfData */
+};
+
+static uint32_t
+initializeValueProfRuntimeRecord(const __llvm_profile_data *Data,
+                                 uint8_t *SiteCountArray[]) {
+  unsigned I, J, S = 0, NumValueKinds = 0;
+  ValueProfNode **Nodes = (ValueProfNode **)Data->Values;
+  RTRecord.Data = Data;
+  RTRecord.SiteCountArray = SiteCountArray;
+  for (I = 0; I <= IPVK_Last; I++) {
+    uint16_t N = Data->NumValueSites[I];
+    if (!N)
       continue;
 
-    /* Record R has taken a snapshot of the VP data at this point. Newly
-       added VP data for this function will be dropped.  */
-    /* Check if there is enough space.  */
-    if (BufferCapacity - RealSize < getValueProfDataSizeRT(&R)) {
-      PROF_ERR("Value profile data is dropped :%s \n",
-               "Out of buffer space. Use environment "
-               " LLVM_VALUE_PROF_BUFFER_EXTRA to allocate more");
-      I->Values = 0;
+    NumValueKinds++;
+
+    RTRecord.NodesKind[I] = Nodes ? &Nodes[S] : INSTR_PROF_NULLPTR;
+    for (J = 0; J < N; J++) {
+      /* Compute value count for each site. */
+      uint32_t C = 0;
+      ValueProfNode *Site =
+          Nodes ? RTRecord.NodesKind[I][J] : INSTR_PROF_NULLPTR;
+      while (Site) {
+        C++;
+        Site = Site->Next;
+      }
+      if (C > UCHAR_MAX)
+        C = UCHAR_MAX;
+      RTRecord.SiteCountArray[I][J] = C;
     }
-
-    serializeValueProfDataFromRT(&R, VD);
-    deallocateValueProfileCounters(I);
-    I->Values = VD;
-    RealSize += VD->TotalSize;
-    VD = (ValueProfData *)((char *)VD + VD->TotalSize);
-    DTOR_VALUE_RECORD(R);
+    S += N;
   }
+  return NumValueKinds;
+}
 
-  return RealSize;
+static ValueProfNode *getNextNValueData(uint32_t VK, uint32_t Site,
+                                        InstrProfValueData *Dst,
+                                        ValueProfNode *StartNode, uint32_t N) {
+  unsigned I;
+  ValueProfNode *VNode = StartNode ? StartNode : RTRecord.NodesKind[VK][Site];
+  for (I = 0; I < N; I++) {
+    Dst[I].Value = VNode->Value;
+    Dst[I].Count = VNode->Count;
+    VNode = VNode->Next;
+  }
+  return VNode;
+}
+
+static uint32_t getValueProfDataSizeWrapper(void) {
+  return getValueProfDataSize(&RTRecordClosure);
+}
+
+static uint32_t getNumValueDataForSiteWrapper(uint32_t VK, uint32_t S) {
+  return getNumValueDataForSiteRT(&RTRecord, VK, S);
+}
+
+static VPDataReaderType TheVPDataReader = {
+    initializeValueProfRuntimeRecord, getValueProfRecordHeaderSize,
+    getFirstValueProfRecord,          getNumValueDataForSiteWrapper,
+    getValueProfDataSizeWrapper,      getNextNValueData};
+
+COMPILER_RT_VISIBILITY VPDataReaderType *lprofGetVPDataReader() {
+  return &TheVPDataReader;
 }
diff --git a/lib/profile/InstrProfilingWriter.c b/lib/profile/InstrProfilingWriter.c
index 4c9e679..95f37e8 100644
--- a/lib/profile/InstrProfilingWriter.c
+++ b/lib/profile/InstrProfilingWriter.c
@@ -9,11 +9,225 @@
 
 #include "InstrProfiling.h"
 #include "InstrProfilingInternal.h"
+#ifdef _MSC_VER
+/* For _alloca */
+#include <malloc.h>
+#endif
+#include <string.h>
 
-COMPILER_RT_VISIBILITY int llvmWriteProfData(WriterCallback Writer,
-                                             void *WriterCtx,
-                                             const uint8_t *ValueDataBegin,
-                                             const uint64_t ValueDataSize) {
+#define INSTR_PROF_VALUE_PROF_DATA
+#include "InstrProfData.inc"
+
+COMPILER_RT_VISIBILITY void (*FreeHook)(void *) = NULL;
+static ProfBufferIO TheBufferIO;
+#define VP_BUFFER_SIZE 8 * 1024
+static uint8_t BufferIOBuffer[VP_BUFFER_SIZE];
+static InstrProfValueData VPDataArray[16];
+static uint32_t VPDataArraySize = sizeof(VPDataArray) / sizeof(*VPDataArray);
+
+COMPILER_RT_VISIBILITY uint8_t *DynamicBufferIOBuffer = 0;
+COMPILER_RT_VISIBILITY uint32_t VPBufferSize = 0;
+
+/* The buffer writer is responsible for keeping writer state
+ * across the call.
+ */
+COMPILER_RT_VISIBILITY uint32_t lprofBufferWriter(ProfDataIOVec *IOVecs,
+                                                  uint32_t NumIOVecs,
+                                                  void **WriterCtx) {
+  uint32_t I;
+  char **Buffer = (char **)WriterCtx;
+  for (I = 0; I < NumIOVecs; I++) {
+    size_t Length = IOVecs[I].ElmSize * IOVecs[I].NumElm;
+    memcpy(*Buffer, IOVecs[I].Data, Length);
+    *Buffer += Length;
+  }
+  return 0;
+}
+
+static void llvmInitBufferIO(ProfBufferIO *BufferIO, WriterCallback FileWriter,
+                             void *File, uint8_t *Buffer, uint32_t BufferSz) {
+  BufferIO->File = File;
+  BufferIO->FileWriter = FileWriter;
+  BufferIO->BufferStart = Buffer;
+  BufferIO->BufferSz = BufferSz;
+  BufferIO->CurOffset = 0;
+}
+
+COMPILER_RT_VISIBILITY ProfBufferIO *
+lprofCreateBufferIO(WriterCallback FileWriter, void *File) {
+  uint8_t *Buffer = DynamicBufferIOBuffer;
+  uint32_t BufferSize = VPBufferSize;
+  if (!Buffer) {
+    Buffer = &BufferIOBuffer[0];
+    BufferSize = sizeof(BufferIOBuffer);
+  }
+  llvmInitBufferIO(&TheBufferIO, FileWriter, File, Buffer, BufferSize);
+  return &TheBufferIO;
+}
+
+COMPILER_RT_VISIBILITY void lprofDeleteBufferIO(ProfBufferIO *BufferIO) {
+  if (DynamicBufferIOBuffer) {
+    FreeHook(DynamicBufferIOBuffer);
+    DynamicBufferIOBuffer = 0;
+    VPBufferSize = 0;
+  }
+}
+
+COMPILER_RT_VISIBILITY int
+lprofBufferIOWrite(ProfBufferIO *BufferIO, const uint8_t *Data, uint32_t Size) {
+  /* Buffer is not large enough, it is time to flush.  */
+  if (Size + BufferIO->CurOffset > BufferIO->BufferSz) {
+    if (lprofBufferIOFlush(BufferIO) != 0)
+      return -1;
+  }
+  /* Special case, bypass the buffer completely. */
+  ProfDataIOVec IO[] = {{Data, sizeof(uint8_t), Size}};
+  if (Size > BufferIO->BufferSz) {
+    if (BufferIO->FileWriter(IO, 1, &BufferIO->File))
+      return -1;
+  } else {
+    /* Write the data to buffer */
+    uint8_t *Buffer = BufferIO->BufferStart + BufferIO->CurOffset;
+    lprofBufferWriter(IO, 1, (void **)&Buffer);
+    BufferIO->CurOffset = Buffer - BufferIO->BufferStart;
+  }
+  return 0;
+}
+
+COMPILER_RT_VISIBILITY int lprofBufferIOFlush(ProfBufferIO *BufferIO) {
+  if (BufferIO->CurOffset) {
+    ProfDataIOVec IO[] = {
+        {BufferIO->BufferStart, sizeof(uint8_t), BufferIO->CurOffset}};
+    if (BufferIO->FileWriter(IO, 1, &BufferIO->File))
+      return -1;
+    BufferIO->CurOffset = 0;
+  }
+  return 0;
+}
+
+/* Write out value profile data for function specified with \c Data.
+ * The implementation does not use the method \c serializeValueProfData
+ * which depends on dynamic memory allocation. In this implementation,
+ * value profile data is written out to \c BufferIO piecemeal.
+ */
+static int writeOneValueProfData(ProfBufferIO *BufferIO,
+                                 VPDataReaderType *VPDataReader,
+                                 const __llvm_profile_data *Data) {
+  unsigned I, NumValueKinds = 0;
+  ValueProfData VPHeader;
+  uint8_t *SiteCountArray[IPVK_Last + 1];
+
+  for (I = 0; I <= IPVK_Last; I++) {
+    if (!Data->NumValueSites[I])
+      SiteCountArray[I] = 0;
+    else {
+      uint32_t Sz =
+          VPDataReader->GetValueProfRecordHeaderSize(Data->NumValueSites[I]) -
+          offsetof(ValueProfRecord, SiteCountArray);
+      /* Only use alloca for this small byte array to avoid excessive
+       * stack growth.  */
+      SiteCountArray[I] = (uint8_t *)COMPILER_RT_ALLOCA(Sz);
+      memset(SiteCountArray[I], 0, Sz);
+    }
+  }
+
+  /* If NumValueKinds returned is 0, there is nothing to write, report
+     success and return. This should match the raw profile reader's behavior. */
+  if (!(NumValueKinds = VPDataReader->InitRTRecord(Data, SiteCountArray)))
+    return 0;
+
+  /* First write the header structure. */
+  VPHeader.TotalSize = VPDataReader->GetValueProfDataSize();
+  VPHeader.NumValueKinds = NumValueKinds;
+  if (lprofBufferIOWrite(BufferIO, (const uint8_t *)&VPHeader,
+                         sizeof(ValueProfData)))
+    return -1;
+
+  /* Make sure nothing else needs to be written before value profile
+   * records. */
+  if ((void *)VPDataReader->GetFirstValueProfRecord(&VPHeader) !=
+      (void *)(&VPHeader + 1))
+    return -1;
+
+  /* Write out the value profile record for each value kind
+   * one by one. */
+  for (I = 0; I <= IPVK_Last; I++) {
+    uint32_t J;
+    ValueProfRecord RecordHeader;
+    /* The size of the value prof record header without counting the
+     * site count array. */
+    uint32_t RecordHeaderSize = offsetof(ValueProfRecord, SiteCountArray);
+    uint32_t SiteCountArraySize;
+
+    if (!Data->NumValueSites[I])
+      continue;
+
+    /* Write out the record header.  */
+    RecordHeader.Kind = I;
+    RecordHeader.NumValueSites = Data->NumValueSites[I];
+    if (lprofBufferIOWrite(BufferIO, (const uint8_t *)&RecordHeader,
+                           RecordHeaderSize))
+      return -1;
+
+    /* Write out the site value count array including padding space. */
+    SiteCountArraySize =
+        VPDataReader->GetValueProfRecordHeaderSize(Data->NumValueSites[I]) -
+        RecordHeaderSize;
+    if (lprofBufferIOWrite(BufferIO, SiteCountArray[I], SiteCountArraySize))
+      return -1;
+
+    /* Write out the value profile data for each value site.  */
+    for (J = 0; J < Data->NumValueSites[I]; J++) {
+      uint32_t NRead, NRemain;
+      ValueProfNode *NextStartNode = 0;
+      NRemain = VPDataReader->GetNumValueDataForSite(I, J);
+      if (!NRemain)
+        continue;
+      /* Read and write out value data in small chunks till it is done. */
+      do {
+        NRead = (NRemain > VPDataArraySize ? VPDataArraySize : NRemain);
+        NextStartNode =
+            VPDataReader->GetValueData(I, /* ValueKind */
+                                       J, /* Site */
+                                       &VPDataArray[0], NextStartNode, NRead);
+        if (lprofBufferIOWrite(BufferIO, (const uint8_t *)&VPDataArray[0],
+                               NRead * sizeof(InstrProfValueData)))
+          return -1;
+        NRemain -= NRead;
+      } while (NRemain != 0);
+    }
+  }
+  /* All done report success.  */
+  return 0;
+}
+
+static int writeValueProfData(WriterCallback Writer, void *WriterCtx,
+                              VPDataReaderType *VPDataReader,
+                              const __llvm_profile_data *DataBegin,
+                              const __llvm_profile_data *DataEnd) {
+  ProfBufferIO *BufferIO;
+  const __llvm_profile_data *DI = 0;
+
+  if (!VPDataReader)
+    return 0;
+
+  BufferIO = lprofCreateBufferIO(Writer, WriterCtx);
+
+  for (DI = DataBegin; DI < DataEnd; DI++) {
+    if (writeOneValueProfData(BufferIO, VPDataReader, DI))
+      return -1;
+  }
+
+  if (lprofBufferIOFlush(BufferIO) != 0)
+    return -1;
+  lprofDeleteBufferIO(BufferIO);
+
+  return 0;
+}
+
+COMPILER_RT_VISIBILITY int lprofWriteData(WriterCallback Writer,
+                                          void *WriterCtx,
+                                          VPDataReaderType *VPDataReader) {
   /* Match logic in __llvm_profile_write_buffer(). */
   const __llvm_profile_data *DataBegin = __llvm_profile_begin_data();
   const __llvm_profile_data *DataEnd = __llvm_profile_end_data();
@@ -21,20 +235,21 @@
   const uint64_t *CountersEnd = __llvm_profile_end_counters();
   const char *NamesBegin = __llvm_profile_begin_names();
   const char *NamesEnd = __llvm_profile_end_names();
-  return llvmWriteProfDataImpl(Writer, WriterCtx, DataBegin, DataEnd,
-                               CountersBegin, CountersEnd, ValueDataBegin,
-                               ValueDataSize, NamesBegin, NamesEnd);
+  return lprofWriteDataImpl(Writer, WriterCtx, DataBegin, DataEnd,
+                            CountersBegin, CountersEnd, VPDataReader,
+                            NamesBegin, NamesEnd);
 }
 
-COMPILER_RT_VISIBILITY int llvmWriteProfDataImpl(
-    WriterCallback Writer, void *WriterCtx,
-    const __llvm_profile_data *DataBegin, const __llvm_profile_data *DataEnd,
-    const uint64_t *CountersBegin, const uint64_t *CountersEnd,
-    const uint8_t *ValueDataBegin, const uint64_t ValueDataSize,
-    const char *NamesBegin, const char *NamesEnd) {
+COMPILER_RT_VISIBILITY int
+lprofWriteDataImpl(WriterCallback Writer, void *WriterCtx,
+                   const __llvm_profile_data *DataBegin,
+                   const __llvm_profile_data *DataEnd,
+                   const uint64_t *CountersBegin, const uint64_t *CountersEnd,
+                   VPDataReaderType *VPDataReader, const char *NamesBegin,
+                   const char *NamesEnd) {
 
   /* Calculate size of sections. */
-  const uint64_t DataSize = DataEnd - DataBegin;
+  const uint64_t DataSize = __llvm_profile_get_data_size(DataBegin, DataEnd);
   const uint64_t CountersSize = CountersEnd - CountersBegin;
   const uint64_t NamesSize = NamesEnd - NamesBegin;
   const uint64_t Padding = __llvm_profile_get_num_padding_bytes(NamesSize);
@@ -48,23 +263,19 @@
   if (!DataSize)
     return 0;
 
-  /* Initialize header struture.  */
+/* Initialize header structure.  */
 #define INSTR_PROF_RAW_HEADER(Type, Name, Init) Header.Name = Init;
 #include "InstrProfData.inc"
 
   /* Write the data. */
-  ProfDataIOVec IOVec[] = {
-      {&Header, sizeof(__llvm_profile_header), 1},
-      {DataBegin, sizeof(__llvm_profile_data), DataSize},
-      {CountersBegin, sizeof(uint64_t), CountersSize},
-      {NamesBegin, sizeof(char), NamesSize},
-      {Zeroes, sizeof(char), Padding}};
+  ProfDataIOVec IOVec[] = {{&Header, sizeof(__llvm_profile_header), 1},
+                           {DataBegin, sizeof(__llvm_profile_data), DataSize},
+                           {CountersBegin, sizeof(uint64_t), CountersSize},
+                           {NamesBegin, sizeof(uint8_t), NamesSize},
+                           {Zeroes, sizeof(uint8_t), Padding}};
   if (Writer(IOVec, sizeof(IOVec) / sizeof(*IOVec), &WriterCtx))
     return -1;
-  if (ValueDataBegin) {
-    ProfDataIOVec IOVec2[] = {{ValueDataBegin, sizeof(char), ValueDataSize}};
-    if (Writer(IOVec2, sizeof(IOVec2) / sizeof(*IOVec2), &WriterCtx))
-      return -1;
-  }
-  return 0;
+
+  return writeValueProfData(Writer, WriterCtx, VPDataReader, DataBegin,
+                            DataEnd);
 }
diff --git a/lib/profile/Makefile.mk b/lib/profile/Makefile.mk
deleted file mode 100644
index dd3a36f..0000000
--- a/lib/profile/Makefile.mk
+++ /dev/null
@@ -1,18 +0,0 @@
-#===- lib/profile/Makefile.mk ------------------------------*- Makefile -*--===#
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-#===------------------------------------------------------------------------===#
-
-ModuleName := profile
-SubDirs :=
-
-Sources := $(foreach file,$(wildcard $(Dir)/*.c $(Dir)/*.cc),$(notdir $(file)))
-ObjNames := $(patsubst %.c,%.o,$(patsubst %.cc,%.o,$(Sources)))
-Implementation := Generic
-
-# FIXME: use automatic dependencies?
-Dependencies := $(wildcard $(Dir)/*.h)
diff --git a/lib/profile/WindowsMMap.c b/lib/profile/WindowsMMap.c
new file mode 100644
index 0000000..1f73420
--- /dev/null
+++ b/lib/profile/WindowsMMap.c
@@ -0,0 +1,128 @@
+/*
+ * This code is derived from uClibc (original license follows).
+ * https://git.uclibc.org/uClibc/tree/utils/mmap-windows.c
+ */
+ /* mmap() replacement for Windows
+ *
+ * Author: Mike Frysinger <vapier@gentoo.org>
+ * Placed into the public domain
+ */
+
+/* References:
+ * CreateFileMapping: http://msdn.microsoft.com/en-us/library/aa366537(VS.85).aspx
+ * CloseHandle:       http://msdn.microsoft.com/en-us/library/ms724211(VS.85).aspx
+ * MapViewOfFile:     http://msdn.microsoft.com/en-us/library/aa366761(VS.85).aspx
+ * UnmapViewOfFile:   http://msdn.microsoft.com/en-us/library/aa366882(VS.85).aspx
+ */
+
+#if defined(_WIN32)
+
+#include "WindowsMMap.h"
+#include "InstrProfiling.h"
+
+#ifdef __USE_FILE_OFFSET64
+# define DWORD_HI(x) (x >> 32)
+# define DWORD_LO(x) ((x) & 0xffffffff)
+#else
+# define DWORD_HI(x) (0)
+# define DWORD_LO(x) (x)
+#endif
+
+COMPILER_RT_VISIBILITY
+void *mmap(void *start, size_t length, int prot, int flags, int fd, off_t offset)
+{
+  if (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC))
+    return MAP_FAILED;
+  if (fd == -1) {
+    if (!(flags & MAP_ANON) || offset)
+      return MAP_FAILED;
+  } else if (flags & MAP_ANON)
+    return MAP_FAILED;
+
+  DWORD flProtect;
+  if (prot & PROT_WRITE) {
+    if (prot & PROT_EXEC)
+      flProtect = PAGE_EXECUTE_READWRITE;
+    else
+      flProtect = PAGE_READWRITE;
+  } else if (prot & PROT_EXEC) {
+    if (prot & PROT_READ)
+      flProtect = PAGE_EXECUTE_READ;
+    else if (prot & PROT_EXEC)
+      flProtect = PAGE_EXECUTE;
+  } else
+    flProtect = PAGE_READONLY;
+
+  off_t end = length + offset;
+  HANDLE mmap_fd, h;
+  if (fd == -1)
+    mmap_fd = INVALID_HANDLE_VALUE;
+  else
+    mmap_fd = (HANDLE)_get_osfhandle(fd);
+  h = CreateFileMapping(mmap_fd, NULL, flProtect, DWORD_HI(end), DWORD_LO(end), NULL);
+  if (h == NULL)
+    return MAP_FAILED;
+
+  DWORD dwDesiredAccess;
+  if (prot & PROT_WRITE)
+    dwDesiredAccess = FILE_MAP_WRITE;
+  else
+    dwDesiredAccess = FILE_MAP_READ;
+  if (prot & PROT_EXEC)
+    dwDesiredAccess |= FILE_MAP_EXECUTE;
+  if (flags & MAP_PRIVATE)
+    dwDesiredAccess |= FILE_MAP_COPY;
+  void *ret = MapViewOfFile(h, dwDesiredAccess, DWORD_HI(offset), DWORD_LO(offset), length);
+  if (ret == NULL) {
+    CloseHandle(h);
+    ret = MAP_FAILED;
+  }
+  return ret;
+}
+
+COMPILER_RT_VISIBILITY
+void munmap(void *addr, size_t length)
+{
+  UnmapViewOfFile(addr);
+  /* ruh-ro, we leaked handle from CreateFileMapping() ... */
+}
+
+COMPILER_RT_VISIBILITY
+int msync(void *addr, size_t length, int flags)
+{
+  if (flags & MS_INVALIDATE)
+    return -1; /* Not supported. */
+
+  /* Exactly one of MS_ASYNC or MS_SYNC must be specified. */
+  switch (flags & (MS_ASYNC | MS_SYNC)) {
+    case MS_SYNC:
+    case MS_ASYNC:
+      break;
+    default:
+      return -1;
+  }
+
+  if (!FlushViewOfFile(addr, length))
+    return -1;
+
+  if (flags & MS_SYNC) {
+    /* FIXME: No longer have access to handle from CreateFileMapping(). */
+    /*
+     * if (!FlushFileBuffers(h))
+     *   return -1;
+     */
+  }
+
+  return 0;
+}
+
+COMPILER_RT_VISIBILITY
+int flock(int fd, int operation)
+{
+  return -1; /* Not supported. */
+}
+
+#undef DWORD_HI
+#undef DWORD_LO
+
+#endif /* _WIN32 */
diff --git a/lib/profile/WindowsMMap.h b/lib/profile/WindowsMMap.h
new file mode 100644
index 0000000..271619a
--- /dev/null
+++ b/lib/profile/WindowsMMap.h
@@ -0,0 +1,59 @@
+/*===- WindowsMMap.h - Support library for PGO instrumentation ------------===*\
+|*
+|*                     The LLVM Compiler Infrastructure
+|*
+|* This file is distributed under the University of Illinois Open Source
+|* License. See LICENSE.TXT for details.
+|*
+\*===----------------------------------------------------------------------===*/
+
+#ifndef PROFILE_INSTRPROFILING_WINDOWS_MMAP_H
+#define PROFILE_INSTRPROFILING_WINDOWS_MMAP_H
+
+#if defined(_WIN32)
+
+#include <BaseTsd.h>
+#include <io.h>
+#include <sys/types.h>
+
+/*
+ * mmap() flags
+ */
+#define PROT_READ     0x1
+#define PROT_WRITE    0x2
+#define PROT_EXEC     0x0
+
+#define MAP_FILE      0x00
+#define MAP_SHARED    0x01
+#define MAP_PRIVATE   0x02
+#define MAP_ANONYMOUS 0x20
+#define MAP_ANON      MAP_ANONYMOUS
+#define MAP_FAILED    ((void *) -1)
+
+/*
+ * msync() flags
+ */
+#define MS_ASYNC        0x0001  /* return immediately */
+#define MS_INVALIDATE   0x0002  /* invalidate all cached data */
+#define MS_SYNC         0x0010  /* msync synchronously */
+
+/*
+ * flock() operations
+ */
+#define   LOCK_SH   1    /* shared lock */
+#define   LOCK_EX   2    /* exclusive lock */
+#define   LOCK_NB   4    /* don't block when locking */
+#define   LOCK_UN   8    /* unlock */
+
+void *mmap(void *start, size_t length, int prot, int flags, int fd,
+           off_t offset);
+
+void munmap(void *addr, size_t length);
+
+int msync(void *addr, size_t length, int flags);
+
+int flock(int fd, int operation);
+
+#endif /* _WIN32 */
+
+#endif /* PROFILE_INSTRPROFILING_WINDOWS_MMAP_H */
diff --git a/lib/safestack/CMakeLists.txt b/lib/safestack/CMakeLists.txt
index 9c11bb6..a3870ab 100644
--- a/lib/safestack/CMakeLists.txt
+++ b/lib/safestack/CMakeLists.txt
@@ -1,4 +1,6 @@
 add_custom_target(safestack)
+set_target_properties(safestack PROPERTIES
+  FOLDER "Compiler-RT Misc")
 
 set(SAFESTACK_SOURCES safestack.cc)
 
diff --git a/lib/sanitizer_common/.clang-tidy b/lib/sanitizer_common/.clang-tidy
new file mode 100644
index 0000000..aa695cc
--- /dev/null
+++ b/lib/sanitizer_common/.clang-tidy
@@ -0,0 +1,12 @@
+Checks: '-*,clang-diagnostic-*,llvm-*,misc-*,readability-identifier-naming'
+CheckOptions:
+  - key:             readability-identifier-naming.ClassCase
+    value:           CamelCase
+  - key:             readability-identifier-naming.EnumCase
+    value:           CamelCase
+  - key:             readability-identifier-naming.FunctionCase
+    value:           CamelCase
+  - key:             readability-identifier-naming.UnionCase
+    value:           CamelCase
+  - key:             readability-identifier-naming.VariableCase
+    value:           lower_case
diff --git a/lib/sanitizer_common/Android.bp b/lib/sanitizer_common/Android.bp
index 2de4ff1..687e3f8 100644
--- a/lib/sanitizer_common/Android.bp
+++ b/lib/sanitizer_common/Android.bp
@@ -28,6 +28,8 @@
     cppflags: [
         "-fvisibility=hidden",
         "-fno-exceptions",
+        "-fno-rtti",
+        "-fno-builtin",
         "-std=c++11",
         "-Wall",
         "-Werror",
@@ -35,49 +37,18 @@
         "-Wno-unused-parameter",
     ],
     srcs: [
-        // rtl
-        "sanitizer_allocator.cc",
-        "sanitizer_common.cc",
-        "sanitizer_deadlock_detector1.cc",
-        "sanitizer_deadlock_detector2.cc",
-        "sanitizer_flags.cc",
-        "sanitizer_flag_parser.cc",
-        "sanitizer_libc.cc",
-        "sanitizer_libignore.cc",
-        "sanitizer_linux.cc",
-        "sanitizer_mac.cc",
-        "sanitizer_persistent_allocator.cc",
-        "sanitizer_platform_limits_linux.cc",
-        "sanitizer_platform_limits_posix.cc",
-        "sanitizer_posix.cc",
-        "sanitizer_printf.cc",
-        "sanitizer_procmaps_common.cc",
-        "sanitizer_procmaps_freebsd.cc",
-        "sanitizer_procmaps_linux.cc",
-        "sanitizer_procmaps_mac.cc",
-        "sanitizer_stackdepot.cc",
-        "sanitizer_stacktrace.cc",
-        "sanitizer_stacktrace_printer.cc",
-        "sanitizer_suppressions.cc",
-        "sanitizer_symbolizer.cc",
-        "sanitizer_symbolizer_libbacktrace.cc",
-        "sanitizer_symbolizer_win.cc",
-        "sanitizer_tls_get_addr.cc",
-        "sanitizer_thread_registry.cc",
-        "sanitizer_win.cc",
-
-        // cdep
-        "sanitizer_common_libcdep.cc",
-        "sanitizer_coverage_libcdep.cc",
-        "sanitizer_coverage_mapping_libcdep.cc",
-        "sanitizer_linux_libcdep.cc",
-        "sanitizer_posix_libcdep.cc",
-        "sanitizer_stacktrace_libcdep.cc",
-        "sanitizer_stoptheworld_linux_libcdep.cc",
-        "sanitizer_symbolizer_libcdep.cc",
-        "sanitizer_symbolizer_posix_libcdep.cc",
-        "sanitizer_unwind_linux_libcdep.cc",
+        "*.cc",
     ],
+    exclude_srcs: [
+        "sanitizer_common_nolibc.cc",
+    ],
+    arch: {
+        x86_64: {
+            srcs: [
+                "sanitizer_linux_x86_64.S",
+            ]
+        }
+    },
     stl: "none",
     sanitize: {
         never: true,
diff --git a/lib/sanitizer_common/CMakeLists.txt b/lib/sanitizer_common/CMakeLists.txt
index 6a20f02..4af0009 100644
--- a/lib/sanitizer_common/CMakeLists.txt
+++ b/lib/sanitizer_common/CMakeLists.txt
@@ -1,7 +1,7 @@
 # Build system for the common Sanitizer runtime support library components.
 # These components are shared between AddressSanitizer and ThreadSanitizer.
 
-set(SANITIZER_SOURCES
+set(SANITIZER_SOURCES_NOTERMINATION
   sanitizer_allocator.cc
   sanitizer_common.cc
   sanitizer_deadlock_detector1.cc
@@ -11,6 +11,7 @@
   sanitizer_libc.cc
   sanitizer_libignore.cc
   sanitizer_linux.cc
+  sanitizer_linux_s390.cc
   sanitizer_mac.cc
   sanitizer_persistent_allocator.cc
   sanitizer_platform_limits_linux.cc
@@ -33,6 +34,14 @@
   sanitizer_thread_registry.cc
   sanitizer_win.cc)
 
+if(UNIX AND NOT APPLE)
+  list(APPEND SANITIZER_SOURCES_NOTERMINATION
+    sanitizer_linux_x86_64.S)
+endif()
+
+set(SANITIZER_SOURCES
+  ${SANITIZER_SOURCES_NOTERMINATION} sanitizer_termination.cc)
+
 # Libc functions stubs. These sources should be linked instead of
 # SANITIZER_LIBCDEP_SOURCES when sanitizer_common library must not depend on
 # libc.
@@ -122,13 +131,24 @@
 append_have_file_definition(tirpc/rpc/xdr.h HAVE_TIRPC_RPC_XDR_H SANITIZER_COMMON_DEFINITIONS)
 
 set(SANITIZER_CFLAGS ${SANITIZER_COMMON_CFLAGS})
-append_no_rtti_flag(SANITIZER_CFLAGS)
+append_rtti_flag(OFF SANITIZER_CFLAGS)
 
 append_list_if(SANITIZER_LIMIT_FRAME_SIZE -Wframe-larger-than=570
                SANITIZER_CFLAGS)
 append_list_if(COMPILER_RT_HAS_WGLOBAL_CONSTRUCTORS_FLAG -Wglobal-constructors
                SANITIZER_CFLAGS)
 
+if (LLVM_ENABLE_PEDANTIC AND UNIX AND NOT APPLE)
+  # With -pedantic, our .S files raise warnings about empty macro arguments
+  # from __USER_LABEL_PREFIX__ being an empty arg to GLUE().  Unfortunately,
+  # there is no simple way to test for an empty define, nor to disable just
+  # that warning or to disable -pedantic.  There is also no simple way to
+  # remove -pedantic from just this file (we'd have to remove from
+  # CMAKE_C*_FLAGS and re-add as a source property to all the non-.S files).
+  set_source_files_properties(sanitizer_linux_x86_64.S
+    PROPERTIES COMPILE_FLAGS "-w")
+endif ()
+
 if(APPLE)
   set(OS_OPTION OS ${SANITIZER_COMMON_SUPPORTED_OS})
 endif()
@@ -139,6 +159,12 @@
   SOURCES ${SANITIZER_SOURCES}
   CFLAGS ${SANITIZER_CFLAGS}
   DEFS ${SANITIZER_COMMON_DEFINITIONS})
+add_compiler_rt_object_libraries(RTSanitizerCommonNoTermination
+  ${OS_OPTION}
+  ARCHS ${SANITIZER_COMMON_SUPPORTED_ARCH}
+  SOURCES ${SANITIZER_SOURCES_NOTERMINATION}
+  CFLAGS ${SANITIZER_CFLAGS}
+  DEFS ${SANITIZER_COMMON_DEFINITIONS})
 add_compiler_rt_object_libraries(RTSanitizerCommonNoLibc
   ${OS_OPTION}
   ARCHS ${SANITIZER_COMMON_SUPPORTED_ARCH}
diff --git a/lib/sanitizer_common/Makefile.mk b/lib/sanitizer_common/Makefile.mk
deleted file mode 100644
index 5bb20d0..0000000
--- a/lib/sanitizer_common/Makefile.mk
+++ /dev/null
@@ -1,24 +0,0 @@
-#===- lib/sanitizer_common/Makefile.mk ---------------------*- Makefile -*--===#
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-#===------------------------------------------------------------------------===#
-
-ModuleName := sanitizer_common
-SubDirs :=
-
-Sources := $(foreach file,$(wildcard $(Dir)/*.cc),$(notdir $(file)))
-NolibcSources := $(foreach file,$(wildcard $(Dir)/*_nolibc.cc),$(notdir $(file)))
-Sources := $(filter-out $(NolibcSources),$(Sources))
-ObjNames := $(Sources:%.cc=%.o)
-
-Implementation := Generic
-
-# FIXME: use automatic dependencies?
-Dependencies := $(wildcard $(Dir)/*.h)
-
-# Define a convenience variable for all the sanitizer_common functions.
-SanitizerCommonFunctions := $(Sources:%.cc=%)
diff --git a/lib/sanitizer_common/sanitizer_allocator.cc b/lib/sanitizer_common/sanitizer_allocator.cc
index 538e2db..df298c6 100644
--- a/lib/sanitizer_common/sanitizer_allocator.cc
+++ b/lib/sanitizer_common/sanitizer_allocator.cc
@@ -22,30 +22,47 @@
 #if defined(SANITIZER_GO) || defined(SANITIZER_USE_MALLOC)
 # if SANITIZER_LINUX && !SANITIZER_ANDROID
 extern "C" void *__libc_malloc(uptr size);
+extern "C" void *__libc_memalign(uptr alignment, uptr size);
+extern "C" void *__libc_realloc(void *ptr, uptr size);
 extern "C" void __libc_free(void *ptr);
-#  define LIBC_MALLOC __libc_malloc
-#  define LIBC_FREE __libc_free
 # else
 #  include <stdlib.h>
-#  define LIBC_MALLOC malloc
-#  define LIBC_FREE free
+#  define __libc_malloc malloc
+static void *__libc_memalign(uptr alignment, uptr size) {
+  void *p;
+  uptr error = posix_memalign(&p, alignment, size);
+  if (error) return nullptr;
+  return p;
+}
+#  define __libc_realloc realloc
+#  define __libc_free free
 # endif
 
-static void *RawInternalAlloc(uptr size, InternalAllocatorCache *cache) {
+static void *RawInternalAlloc(uptr size, InternalAllocatorCache *cache,
+                              uptr alignment) {
   (void)cache;
-  return LIBC_MALLOC(size);
+  if (alignment == 0)
+    return __libc_malloc(size);
+  else
+    return __libc_memalign(alignment, size);
+}
+
+static void *RawInternalRealloc(void *ptr, uptr size,
+                                InternalAllocatorCache *cache) {
+  (void)cache;
+  return __libc_realloc(ptr, size);
 }
 
 static void RawInternalFree(void *ptr, InternalAllocatorCache *cache) {
   (void)cache;
-  LIBC_FREE(ptr);
+  __libc_free(ptr);
 }
 
 InternalAllocator *internal_allocator() {
   return 0;
 }
 
-#else // SANITIZER_GO
+#else  // defined(SANITIZER_GO) || defined(SANITIZER_USE_MALLOC)
 
 static ALIGNED(64) char internal_alloc_placeholder[sizeof(InternalAllocator)];
 static atomic_uint8_t internal_allocator_initialized;
@@ -68,13 +85,26 @@
   return internal_allocator_instance;
 }
 
-static void *RawInternalAlloc(uptr size, InternalAllocatorCache *cache) {
+static void *RawInternalAlloc(uptr size, InternalAllocatorCache *cache,
+                              uptr alignment) {
+  if (alignment == 0) alignment = 8;
   if (cache == 0) {
     SpinMutexLock l(&internal_allocator_cache_mu);
-    return internal_allocator()->Allocate(&internal_allocator_cache, size, 8,
-                                          false);
+    return internal_allocator()->Allocate(&internal_allocator_cache, size,
+                                          alignment, false);
   }
-  return internal_allocator()->Allocate(cache, size, 8, false);
+  return internal_allocator()->Allocate(cache, size, alignment, false);
+}
+
+static void *RawInternalRealloc(void *ptr, uptr size,
+                                InternalAllocatorCache *cache) {
+  uptr alignment = 8;
+  if (cache == 0) {
+    SpinMutexLock l(&internal_allocator_cache_mu);
+    return internal_allocator()->Reallocate(&internal_allocator_cache, ptr,
+                                            size, alignment);
+  }
+  return internal_allocator()->Reallocate(cache, ptr, size, alignment);
 }
 
 static void RawInternalFree(void *ptr, InternalAllocatorCache *cache) {
@@ -85,20 +115,42 @@
   internal_allocator()->Deallocate(cache, ptr);
 }
 
-#endif // SANITIZER_GO
+#endif  // defined(SANITIZER_GO) || defined(SANITIZER_USE_MALLOC)
 
 const u64 kBlockMagic = 0x6A6CB03ABCEBC041ull;
 
-void *InternalAlloc(uptr size, InternalAllocatorCache *cache) {
+void *InternalAlloc(uptr size, InternalAllocatorCache *cache, uptr alignment) {
   if (size + sizeof(u64) < size)
     return nullptr;
-  void *p = RawInternalAlloc(size + sizeof(u64), cache);
+  void *p = RawInternalAlloc(size + sizeof(u64), cache, alignment);
   if (!p)
     return nullptr;
   ((u64*)p)[0] = kBlockMagic;
   return (char*)p + sizeof(u64);
 }
 
+void *InternalRealloc(void *addr, uptr size, InternalAllocatorCache *cache) {
+  if (!addr)
+    return InternalAlloc(size, cache);
+  if (size + sizeof(u64) < size)
+    return nullptr;
+  addr = (char*)addr - sizeof(u64);
+  size = size + sizeof(u64);
+  CHECK_EQ(kBlockMagic, ((u64*)addr)[0]);
+  void *p = RawInternalRealloc(addr, size, cache);
+  if (!p)
+    return nullptr;
+  return (char*)p + sizeof(u64);
+}
+
+void *InternalCalloc(uptr count, uptr size, InternalAllocatorCache *cache) {
+  if (CallocShouldReturnNullDueToOverflow(count, size))
+    return internal_allocator()->ReturnNullOrDie();
+  void *p = InternalAlloc(count * size, cache);
+  if (p) internal_memset(p, 0, count * size);
+  return p;
+}
+
 void InternalFree(void *addr, InternalAllocatorCache *cache) {
   if (!addr)
     return;
diff --git a/lib/sanitizer_common/sanitizer_allocator.h b/lib/sanitizer_common/sanitizer_allocator.h
index 44d6fce..f0f0020 100644
--- a/lib/sanitizer_common/sanitizer_allocator.h
+++ b/lib/sanitizer_common/sanitizer_allocator.h
@@ -297,9 +297,10 @@
 
 // SizeClassAllocator64 -- allocator for 64-bit address space.
 //
-// Space: a portion of address space of kSpaceSize bytes starting at
-// a fixed address (kSpaceBeg). Both constants are powers of two and
-// kSpaceBeg is kSpaceSize-aligned.
+// Space: a portion of address space of kSpaceSize bytes starting at SpaceBeg.
+// If kSpaceBeg is ~0 then SpaceBeg is chosen dynamically by mmap.
+// Otherwise SpaceBeg=kSpaceBeg (fixed address).
+// kSpaceSize is a power of two.
 // At the beginning the entire space is mprotect-ed, then small parts of it
 // are mapped on demand.
 //
@@ -322,9 +323,16 @@
   typedef SizeClassAllocatorLocalCache<ThisT> AllocatorCache;
 
   void Init() {
-    CHECK_EQ(kSpaceBeg,
-             reinterpret_cast<uptr>(MmapNoAccess(kSpaceBeg, kSpaceSize)));
-    MapWithCallback(kSpaceEnd, AdditionalSize());
+    uptr TotalSpaceSize = kSpaceSize + AdditionalSize();
+    if (kUsingConstantSpaceBeg) {
+      CHECK_EQ(kSpaceBeg, reinterpret_cast<uptr>(
+                              MmapFixedNoAccess(kSpaceBeg, TotalSpaceSize)));
+    } else {
+      NonConstSpaceBeg =
+          reinterpret_cast<uptr>(MmapNoAccess(TotalSpaceSize));
+      CHECK_NE(NonConstSpaceBeg, ~(uptr)0);
+    }
+    MapWithCallback(SpaceEnd(), AdditionalSize());
   }
 
   void MapWithCallback(uptr beg, uptr size) {
@@ -360,12 +368,18 @@
     region->n_freed += b->count;
   }
 
-  static bool PointerIsMine(const void *p) {
-    return reinterpret_cast<uptr>(p) / kSpaceSize == kSpaceBeg / kSpaceSize;
+  bool PointerIsMine(const void *p) {
+    uptr P = reinterpret_cast<uptr>(p);
+    if (kUsingConstantSpaceBeg && (kSpaceBeg % kSpaceSize) == 0)
+      return P / kSpaceSize == kSpaceBeg / kSpaceSize;
+    return P >= SpaceBeg() && P < SpaceEnd();
   }
 
-  static uptr GetSizeClass(const void *p) {
-    return (reinterpret_cast<uptr>(p) / kRegionSize) % kNumClassesRounded;
+  uptr GetSizeClass(const void *p) {
+    if (kUsingConstantSpaceBeg && (kSpaceBeg % kSpaceSize) == 0)
+      return ((reinterpret_cast<uptr>(p)) / kRegionSize) % kNumClassesRounded;
+    return ((reinterpret_cast<uptr>(p) - SpaceBeg()) / kRegionSize) %
+           kNumClassesRounded;
   }
 
   void *GetBlockBegin(const void *p) {
@@ -383,7 +397,7 @@
     return nullptr;
   }
 
-  static uptr GetActuallyAllocatedSize(void *p) {
+  uptr GetActuallyAllocatedSize(void *p) {
     CHECK(PointerIsMine(p));
     return SizeClassMap::Size(GetSizeClass(p));
   }
@@ -394,8 +408,9 @@
     uptr class_id = GetSizeClass(p);
     uptr size = SizeClassMap::Size(class_id);
     uptr chunk_idx = GetChunkIdx(reinterpret_cast<uptr>(p), size);
-    return reinterpret_cast<void*>(kSpaceBeg + (kRegionSize * (class_id + 1)) -
-                                   (1 + chunk_idx) * kMetadataSize);
+    return reinterpret_cast<void *>(SpaceBeg() +
+                                    (kRegionSize * (class_id + 1)) -
+                                    (1 + chunk_idx) * kMetadataSize);
   }
 
   uptr TotalMemoryUsed() {
@@ -407,7 +422,7 @@
 
   // Test-only.
   void TestOnlyUnmap() {
-    UnmapWithCallback(kSpaceBeg, kSpaceSize + AdditionalSize());
+    UnmapWithCallback(SpaceBeg(), kSpaceSize + AdditionalSize());
   }
 
   void PrintStats() {
@@ -455,7 +470,7 @@
     for (uptr class_id = 1; class_id < kNumClasses; class_id++) {
       RegionInfo *region = GetRegionInfo(class_id);
       uptr chunk_size = SizeClassMap::Size(class_id);
-      uptr region_beg = kSpaceBeg + class_id * kRegionSize;
+      uptr region_beg = SpaceBeg() + class_id * kRegionSize;
       for (uptr chunk = region_beg;
            chunk < region_beg + region->allocated_user;
            chunk += chunk_size) {
@@ -476,8 +491,13 @@
 
  private:
   static const uptr kRegionSize = kSpaceSize / kNumClassesRounded;
-  static const uptr kSpaceEnd = kSpaceBeg + kSpaceSize;
-  COMPILER_CHECK(kSpaceBeg % kSpaceSize == 0);
+
+  static const bool kUsingConstantSpaceBeg = kSpaceBeg != ~(uptr)0;
+  uptr NonConstSpaceBeg;
+  uptr SpaceBeg() const {
+    return kUsingConstantSpaceBeg ? kSpaceBeg : NonConstSpaceBeg;
+  }
+  uptr SpaceEnd() const { return  SpaceBeg() + kSpaceSize; }
   // kRegionSize must be >= 2^32.
   COMPILER_CHECK((kRegionSize) >= (1ULL << (SANITIZER_WORDSIZE / 2)));
   // Populate the free list with at most this number of bytes at once
@@ -501,7 +521,8 @@
 
   RegionInfo *GetRegionInfo(uptr class_id) {
     CHECK_LT(class_id, kNumClasses);
-    RegionInfo *regions = reinterpret_cast<RegionInfo*>(kSpaceBeg + kSpaceSize);
+    RegionInfo *regions =
+        reinterpret_cast<RegionInfo *>(SpaceBeg() + kSpaceSize);
     return &regions[class_id];
   }
 
@@ -524,7 +545,7 @@
     uptr count = size < kPopulateSize ? SizeClassMap::MaxCached(class_id) : 1;
     uptr beg_idx = region->allocated_user;
     uptr end_idx = beg_idx + count * size;
-    uptr region_beg = kSpaceBeg + kRegionSize * class_id;
+    uptr region_beg = SpaceBeg() + kRegionSize * class_id;
     if (end_idx + size > region->mapped_user) {
       // Do the mmap for the user memory.
       uptr map_size = kUserMapSize;
@@ -749,6 +770,9 @@
   }
 
   bool PointerIsMine(const void *p) {
+    uptr mem = reinterpret_cast<uptr>(p);
+    if (mem < kSpaceBeg || mem >= kSpaceBeg + kSpaceSize)
+      return false;
     return GetSizeClass(p) != 0;
   }
 
diff --git a/lib/sanitizer_common/sanitizer_allocator_interface.h b/lib/sanitizer_common/sanitizer_allocator_interface.h
index 2cd924c..797c38a 100644
--- a/lib/sanitizer_common/sanitizer_allocator_interface.h
+++ b/lib/sanitizer_common/sanitizer_allocator_interface.h
@@ -29,6 +29,10 @@
 SANITIZER_INTERFACE_ATTRIBUTE uptr __sanitizer_get_free_bytes();
 SANITIZER_INTERFACE_ATTRIBUTE uptr __sanitizer_get_unmapped_bytes();
 
+SANITIZER_INTERFACE_ATTRIBUTE int __sanitizer_install_malloc_and_free_hooks(
+    void (*malloc_hook)(const void *, uptr),
+    void (*free_hook)(const void *));
+
 SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE
     /* OPTIONAL */ void __sanitizer_malloc_hook(void *ptr, uptr size);
 SANITIZER_INTERFACE_ATTRIBUTE SANITIZER_WEAK_ATTRIBUTE
diff --git a/lib/sanitizer_common/sanitizer_allocator_internal.h b/lib/sanitizer_common/sanitizer_allocator_internal.h
index 3dcfccd..a7ea454 100644
--- a/lib/sanitizer_common/sanitizer_allocator_internal.h
+++ b/lib/sanitizer_common/sanitizer_allocator_internal.h
@@ -45,7 +45,12 @@
 typedef CombinedAllocator<PrimaryInternalAllocator, InternalAllocatorCache,
                           LargeMmapAllocator<> > InternalAllocator;
 
-void *InternalAlloc(uptr size, InternalAllocatorCache *cache = nullptr);
+void *InternalAlloc(uptr size, InternalAllocatorCache *cache = nullptr,
+                    uptr alignment = 0);
+void *InternalRealloc(void *p, uptr size,
+                      InternalAllocatorCache *cache = nullptr);
+void *InternalCalloc(uptr countr, uptr size,
+                     InternalAllocatorCache *cache = nullptr);
 void InternalFree(void *p, InternalAllocatorCache *cache = nullptr);
 InternalAllocator *internal_allocator();
 
diff --git a/lib/sanitizer_common/sanitizer_atomic_msvc.h b/lib/sanitizer_common/sanitizer_atomic_msvc.h
index 24d6f0f..6d94056 100644
--- a/lib/sanitizer_common/sanitizer_atomic_msvc.h
+++ b/lib/sanitizer_common/sanitizer_atomic_msvc.h
@@ -33,6 +33,10 @@
 extern "C" long _InterlockedExchangeAdd(  // NOLINT
     long volatile * Addend, long Value);  // NOLINT
 #pragma intrinsic(_InterlockedExchangeAdd)
+extern "C" char _InterlockedCompareExchange8(  // NOLINT
+    char volatile *Destination,                // NOLINT
+    char Exchange, char Comparand);            // NOLINT
+#pragma intrinsic(_InterlockedCompareExchange8)
 extern "C" short _InterlockedCompareExchange16(  // NOLINT
     short volatile *Destination,                 // NOLINT
     short Exchange, short Comparand);            // NOLINT
@@ -171,8 +175,6 @@
   return (u32)_InterlockedExchange((volatile long*)&a->val_dont_use, v);
 }
 
-#ifndef _WIN64
-
 INLINE bool atomic_compare_exchange_strong(volatile atomic_uint8_t *a,
                                            u8 *cmp,
                                            u8 xchgv,
@@ -180,6 +182,10 @@
   (void)mo;
   DCHECK(!((uptr)a % sizeof(*a)));
   u8 cmpv = *cmp;
+#ifdef _WIN64
+  u8 prev = (u8)_InterlockedCompareExchange8(
+      (volatile char*)&a->val_dont_use, (char)xchgv, (char)cmpv);
+#else
   u8 prev;
   __asm {
     mov al, cmpv
@@ -188,14 +194,13 @@
     lock cmpxchg [ecx], dl
     mov prev, al
   }
+#endif
   if (prev == cmpv)
     return true;
   *cmp = prev;
   return false;
 }
 
-#endif
-
 INLINE bool atomic_compare_exchange_strong(volatile atomic_uintptr_t *a,
                                            uptr *cmp,
                                            uptr xchg,
diff --git a/lib/sanitizer_common/sanitizer_common.cc b/lib/sanitizer_common/sanitizer_common.cc
index 9b41a3a..79fcbb1 100644
--- a/lib/sanitizer_common/sanitizer_common.cc
+++ b/lib/sanitizer_common/sanitizer_common.cc
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "sanitizer_common.h"
+#include "sanitizer_allocator_interface.h"
 #include "sanitizer_allocator_internal.h"
 #include "sanitizer_flags.h"
 #include "sanitizer_libc.h"
@@ -24,13 +25,7 @@
 const char *SanitizerToolName = "SanitizerTool";
 
 atomic_uint32_t current_verbosity;
-
-uptr GetPageSizeCached() {
-  static uptr PageSize;
-  if (!PageSize)
-    PageSize = GetPageSize();
-  return PageSize;
-}
+uptr PageSizeCached;
 
 StaticSpinMutex report_file_mu;
 ReportFile report_file = {&report_file_mu, kStderrFd, "", "", 0};
@@ -105,64 +100,6 @@
 // writing to the same log file.
 uptr stoptheworld_tracer_ppid = 0;
 
-static const int kMaxNumOfInternalDieCallbacks = 5;
-static DieCallbackType InternalDieCallbacks[kMaxNumOfInternalDieCallbacks];
-
-bool AddDieCallback(DieCallbackType callback) {
-  for (int i = 0; i < kMaxNumOfInternalDieCallbacks; i++) {
-    if (InternalDieCallbacks[i] == nullptr) {
-      InternalDieCallbacks[i] = callback;
-      return true;
-    }
-  }
-  return false;
-}
-
-bool RemoveDieCallback(DieCallbackType callback) {
-  for (int i = 0; i < kMaxNumOfInternalDieCallbacks; i++) {
-    if (InternalDieCallbacks[i] == callback) {
-      internal_memmove(&InternalDieCallbacks[i], &InternalDieCallbacks[i + 1],
-                       sizeof(InternalDieCallbacks[0]) *
-                           (kMaxNumOfInternalDieCallbacks - i - 1));
-      InternalDieCallbacks[kMaxNumOfInternalDieCallbacks - 1] = nullptr;
-      return true;
-    }
-  }
-  return false;
-}
-
-static DieCallbackType UserDieCallback;
-void SetUserDieCallback(DieCallbackType callback) {
-  UserDieCallback = callback;
-}
-
-void NORETURN Die() {
-  if (UserDieCallback)
-    UserDieCallback();
-  for (int i = kMaxNumOfInternalDieCallbacks - 1; i >= 0; i--) {
-    if (InternalDieCallbacks[i])
-      InternalDieCallbacks[i]();
-  }
-  if (common_flags()->abort_on_error)
-    Abort();
-  internal__exit(common_flags()->exitcode);
-}
-
-static CheckFailedCallbackType CheckFailedCallback;
-void SetCheckFailedCallback(CheckFailedCallbackType callback) {
-  CheckFailedCallback = callback;
-}
-
-void NORETURN CheckFailed(const char *file, int line, const char *cond,
-                          u64 v1, u64 v2) {
-  if (CheckFailedCallback) {
-    CheckFailedCallback(file, line, cond, v1, v2);
-  }
-  Report("Sanitizer CHECK failed: %s:%d %s (%lld, %lld)\n", file, line, cond,
-                                                            v1, v2);
-  Die();
-}
-
 void NORETURN ReportMmapFailureAndDie(uptr size, const char *mem_type,
                                       const char *mmap_type, error_t err,
                                       bool raw_report) {
@@ -230,27 +167,6 @@
   InternalSort<uptr*, UptrComparisonFunction>(&array, size, CompareLess);
 }
 
-// We want to map a chunk of address space aligned to 'alignment'.
-// We do it by maping a bit more and then unmaping redundant pieces.
-// We probably can do it with fewer syscalls in some OS-dependent way.
-void *MmapAlignedOrDie(uptr size, uptr alignment, const char *mem_type) {
-// uptr PageSize = GetPageSizeCached();
-  CHECK(IsPowerOfTwo(size));
-  CHECK(IsPowerOfTwo(alignment));
-  uptr map_size = size + alignment;
-  uptr map_res = (uptr)MmapOrDie(map_size, mem_type);
-  uptr map_end = map_res + map_size;
-  uptr res = map_res;
-  if (res & (alignment - 1))  // Not aligned.
-    res = (map_res + alignment) & ~(alignment - 1);
-  uptr end = res + size;
-  if (res != map_res)
-    UnmapOrDie((void*)map_res, res - map_res);
-  if (end != map_end)
-    UnmapOrDie((void*)end, map_end - end);
-  return (void*)res;
-}
-
 const char *StripPathPrefix(const char *filepath,
                             const char *strip_path_prefix) {
   if (!filepath) return nullptr;
@@ -355,9 +271,8 @@
 }
 
 bool LoadedModule::containsAddress(uptr address) const {
-  for (Iterator iter = ranges(); iter.hasNext();) {
-    const AddressRange *r = iter.next();
-    if (r->beg <= address && address < r->end)
+  for (const AddressRange &r : ranges()) {
+    if (r.beg <= address && address < r.end)
       return true;
   }
   return false;
@@ -424,6 +339,10 @@
 static const char kPathSeparator = SANITIZER_WINDOWS ? ';' : ':';
 
 char *FindPathToBinary(const char *name) {
+  if (FileExists(name)) {
+    return internal_strdup(name);
+  }
+
   const char *path = GetEnv("PATH");
   if (!path)
     return nullptr;
@@ -488,6 +407,53 @@
   return name_len;
 }
 
+void PrintCmdline() {
+  char **argv = GetArgv();
+  if (!argv) return;
+  Printf("\nCommand: ");
+  for (uptr i = 0; argv[i]; ++i)
+    Printf("%s ", argv[i]);
+  Printf("\n\n");
+}
+
+// Malloc hooks.
+static const int kMaxMallocFreeHooks = 5;
+struct MallocFreeHook {
+  void (*malloc_hook)(const void *, uptr);
+  void (*free_hook)(const void *);
+};
+
+static MallocFreeHook MFHooks[kMaxMallocFreeHooks];
+
+void RunMallocHooks(const void *ptr, uptr size) {
+  for (int i = 0; i < kMaxMallocFreeHooks; i++) {
+    auto hook = MFHooks[i].malloc_hook;
+    if (!hook) return;
+    hook(ptr, size);
+  }
+}
+
+void RunFreeHooks(const void *ptr) {
+  for (int i = 0; i < kMaxMallocFreeHooks; i++) {
+    auto hook = MFHooks[i].free_hook;
+    if (!hook) return;
+    hook(ptr);
+  }
+}
+
+static int InstallMallocFreeHooks(void (*malloc_hook)(const void *, uptr),
+                                  void (*free_hook)(const void *)) {
+  if (!malloc_hook || !free_hook) return 0;
+  for (int i = 0; i < kMaxMallocFreeHooks; i++) {
+    if (MFHooks[i].malloc_hook == nullptr) {
+      MFHooks[i].malloc_hook = malloc_hook;
+      MFHooks[i].free_hook = free_hook;
+      return i + 1;
+    }
+  }
+  return 0;
+}
+
 } // namespace __sanitizer
 
 using namespace __sanitizer;  // NOLINT
@@ -497,6 +463,11 @@
   report_file.SetReportPath(path);
 }
 
+void __sanitizer_set_report_fd(void *fd) {
+  report_file.fd = (fd_t)reinterpret_cast<uptr>(fd);
+  report_file.fd_pid = internal_getpid();
+}
+
 void __sanitizer_report_error_summary(const char *error_summary) {
   Printf("%s\n", error_summary);
 }
@@ -505,4 +476,11 @@
 void __sanitizer_set_death_callback(void (*callback)(void)) {
   SetUserDieCallback(callback);
 }
+
+SANITIZER_INTERFACE_ATTRIBUTE
+int __sanitizer_install_malloc_and_free_hooks(void (*malloc_hook)(const void *,
+                                                                  uptr),
+                                              void (*free_hook)(const void *)) {
+  return InstallMallocFreeHooks(malloc_hook, free_hook);
+}
 } // extern "C"
diff --git a/lib/sanitizer_common/sanitizer_common.h b/lib/sanitizer_common/sanitizer_common.h
index 0585f6b..6c1d6a0 100644
--- a/lib/sanitizer_common/sanitizer_common.h
+++ b/lib/sanitizer_common/sanitizer_common.h
@@ -23,7 +23,7 @@
 #include "sanitizer_list.h"
 #include "sanitizer_mutex.h"
 
-#ifdef _MSC_VER
+#if defined(_MSC_VER) && !defined(__clang__)
 extern "C" void _ReadWriteBarrier();
 #pragma intrinsic(_ReadWriteBarrier)
 #endif
@@ -44,9 +44,6 @@
 
 const uptr kMaxPathLength = 4096;
 
-// 16K loaded modules should be enough for everyone.
-static const uptr kMaxNumberOfModules = 1 << 14;
-
 const uptr kMaxThreadStackSize = 1 << 30;  // 1Gb
 
 static const uptr kErrorMessageBufferSize = 1 << 16;
@@ -66,7 +63,12 @@
 }
 
 uptr GetPageSize();
-uptr GetPageSizeCached();
+extern uptr PageSizeCached;
+INLINE uptr GetPageSizeCached() {
+  if (!PageSizeCached)
+    PageSizeCached = GetPageSize();
+  return PageSizeCached;
+}
 uptr GetMmapGranularity();
 uptr GetMaxVirtualAddress();
 // Threads
@@ -87,12 +89,14 @@
                          const char *name = nullptr);
 void *MmapNoReserveOrDie(uptr size, const char *mem_type);
 void *MmapFixedOrDie(uptr fixed_addr, uptr size);
-void *MmapNoAccess(uptr fixed_addr, uptr size, const char *name = nullptr);
+void *MmapFixedNoAccess(uptr fixed_addr, uptr size, const char *name = nullptr);
+void *MmapNoAccess(uptr size);
 // Map aligned chunk of address space; size and alignment are powers of two.
 void *MmapAlignedOrDie(uptr size, uptr alignment, const char *mem_type);
-// Disallow access to a memory range.  Use MmapNoAccess to allocate an
+// Disallow access to a memory range.  Use MmapFixedNoAccess to allocate an
 // unaccessible memory.
 bool MprotectNoAccess(uptr addr, uptr size);
+bool MprotectReadOnly(uptr addr, uptr size);
 
 // Used to check if we can map shadow memory to a fixed location.
 bool MemoryRangeIsAvailable(uptr range_start, uptr range_end);
@@ -104,6 +108,8 @@
 void DontDumpShadowMemory(uptr addr, uptr length);
 // Check if the built VMA size matches the runtime one.
 void CheckVMASize();
+void RunMallocHooks(const void *ptr, uptr size);
+void RunFreeHooks(const void *ptr);
 
 // InternalScopedBuffer can be used instead of large stack arrays to
 // keep frame size low.
@@ -279,10 +285,27 @@
 char *FindPathToBinary(const char *name);
 bool IsPathSeparator(const char c);
 bool IsAbsolutePath(const char *path);
+// Starts a subprocess and returns its pid.
+// If *_fd parameters are not kInvalidFd their corresponding input/output
+// streams will be redirected to the file. The files will always be closed
+// in parent process even in case of an error.
+// The child process will close all fds after STDERR_FILENO
+// before passing control to a program.
+pid_t StartSubprocess(const char *filename, const char *const argv[],
+                      fd_t stdin_fd = kInvalidFd, fd_t stdout_fd = kInvalidFd,
+                      fd_t stderr_fd = kInvalidFd);
+// Checks if specified process is still running
+bool IsProcessRunning(pid_t pid);
+// Waits for the process to finish and returns its exit code.
+// Returns -1 in case of an error.
+int WaitForProcess(pid_t pid);
 
 u32 GetUid();
 void ReExec();
+char **GetArgv();
+void PrintCmdline();
 bool StackSizeIsUnlimited();
+uptr GetStackSizeLimitInBytes();
 void SetStackSizeLimitInBytes(uptr limit);
 bool AddressSpaceIsUnlimited();
 void SetAddressSpaceUnlimited();
@@ -350,7 +373,7 @@
 
 // Functions related to signal handling.
 typedef void (*SignalHandlerType)(int, void *, void *);
-bool IsDeadlySignal(int signum);
+bool IsHandledDeadlySignal(int signum);
 void InstallDeadlySignalHandlers(SignalHandlerType handler);
 // Alternative signal stack (POSIX-only).
 void SetAlternateSignalStack();
@@ -496,7 +519,7 @@
       uptr new_capacity = RoundUpToPowerOfTwo(size_ + 1);
       Resize(new_capacity);
     }
-    data_[size_++] = element;
+    internal_memcpy(&data_[size_++], &element, sizeof(T));
   }
   T &back() {
     CHECK_GT(size_, 0);
@@ -522,6 +545,19 @@
   void clear() { size_ = 0; }
   bool empty() const { return size() == 0; }
 
+  const T *begin() const {
+    return data();
+  }
+  T *begin() {
+    return data();
+  }
+  const T *end() const {
+    return data() + size();
+  }
+  T *end() {
+    return data() + size();
+  }
+
  private:
   void Resize(uptr new_capacity) {
     CHECK_GT(new_capacity, 0);
@@ -628,8 +664,7 @@
         : next(nullptr), beg(beg), end(end), executable(executable) {}
   };
 
-  typedef IntrusiveList<AddressRange>::ConstIterator Iterator;
-  Iterator ranges() const { return Iterator(&ranges_); }
+  const IntrusiveList<AddressRange> &ranges() const { return ranges_; }
 
  private:
   char *full_name_;  // Owned.
@@ -637,13 +672,33 @@
   IntrusiveList<AddressRange> ranges_;
 };
 
-// OS-dependent function that fills array with descriptions of at most
-// "max_modules" currently loaded modules. Returns the number of
-// initialized modules. If filter is nonzero, ignores modules for which
-// filter(full_name) is false.
-typedef bool (*string_predicate_t)(const char *);
-uptr GetListOfModules(LoadedModule *modules, uptr max_modules,
-                      string_predicate_t filter);
+// List of LoadedModules. OS-dependent implementation is responsible for
+// filling this information.
+class ListOfModules {
+ public:
+  ListOfModules() : modules_(kInitialCapacity) {}
+  ~ListOfModules() { clear(); }
+  void init();
+  const LoadedModule *begin() const { return modules_.begin(); }
+  LoadedModule *begin() { return modules_.begin(); }
+  const LoadedModule *end() const { return modules_.end(); }
+  LoadedModule *end() { return modules_.end(); }
+  uptr size() const { return modules_.size(); }
+  const LoadedModule &operator[](uptr i) const {
+    CHECK_LT(i, modules_.size());
+    return modules_[i];
+  }
+
+ private:
+  void clear() {
+    for (auto &module : modules_) module.clear();
+    modules_.clear();
+  }
+
+  InternalMmapVector<LoadedModule> modules_;
+  // We rarely have more than 16K loaded modules.
+  static const uptr kInitialCapacity = 1 << 14;
+};
 
 // Callback type for iterating over a set of memory ranges.
 typedef void (*RangeIteratorCallback)(uptr begin, uptr end, void *arg);
@@ -665,17 +720,17 @@
 
 #if SANITIZER_LINUX || SANITIZER_MAC
 void WriteOneLineToSyslog(const char *s);
+void LogMessageOnPrintf(const char *str);
 #else
 INLINE void WriteOneLineToSyslog(const char *s) {}
+INLINE void LogMessageOnPrintf(const char *str) {}
 #endif
 
 #if SANITIZER_LINUX
 // Initialize Android logging. Any writes before this are silently lost.
 void AndroidLogInit();
-bool ShouldLogAfterPrintf();
 #else
 INLINE void AndroidLogInit() {}
-INLINE bool ShouldLogAfterPrintf() { return false; }
 #endif
 
 #if SANITIZER_ANDROID
@@ -707,7 +762,7 @@
 // compiler from recognising it and turning it into an actual call to
 // memset/memcpy/etc.
 static inline void SanitizerBreakOptimization(void *arg) {
-#if _MSC_VER && !defined(__clang__)
+#if defined(_MSC_VER) && !defined(__clang__)
   _ReadWriteBarrier();
 #else
   __asm__ __volatile__("" : : "r" (arg) : "memory");
@@ -720,20 +775,58 @@
   uptr pc;
   uptr sp;
   uptr bp;
+  bool is_memory_access;
 
-  SignalContext(void *context, uptr addr, uptr pc, uptr sp, uptr bp) :
-      context(context), addr(addr), pc(pc), sp(sp), bp(bp) {
-  }
+  enum WriteFlag { UNKNOWN, READ, WRITE } write_flag;
+
+  SignalContext(void *context, uptr addr, uptr pc, uptr sp, uptr bp,
+                bool is_memory_access, WriteFlag write_flag)
+      : context(context),
+        addr(addr),
+        pc(pc),
+        sp(sp),
+        bp(bp),
+        is_memory_access(is_memory_access),
+        write_flag(write_flag) {}
 
   // Creates signal context in a platform-specific manner.
   static SignalContext Create(void *siginfo, void *context);
+
+  // Returns true if the "context" indicates a memory write.
+  static WriteFlag GetWriteFlag(void *context);
 };
 
 void GetPcSpBp(void *context, uptr *pc, uptr *sp, uptr *bp);
 
-void DisableReexec();
 void MaybeReexec();
 
+template <typename Fn>
+class RunOnDestruction {
+ public:
+  explicit RunOnDestruction(Fn fn) : fn_(fn) {}
+  ~RunOnDestruction() { fn_(); }
+
+ private:
+  Fn fn_;
+};
+
+// A simple scope guard. Usage:
+// auto cleanup = at_scope_exit([]{ do_cleanup; });
+template <typename Fn>
+RunOnDestruction<Fn> at_scope_exit(Fn fn) {
+  return RunOnDestruction<Fn>(fn);
+}
+
+// Linux on 64-bit s390 had a nasty bug that crashes the whole machine
+// if a process uses virtual memory over 4TB (as many sanitizers like
+// to do).  This function will abort the process if running on a kernel
+// that looks vulnerable.
+#if SANITIZER_LINUX && SANITIZER_S390_64
+void AvoidCVE_2016_2143();
+#else
+INLINE void AvoidCVE_2016_2143() {}
+#endif
+
 }  // namespace __sanitizer
 
 inline void *operator new(__sanitizer::operator_new_size_type size,
diff --git a/lib/sanitizer_common/sanitizer_common_interceptors.inc b/lib/sanitizer_common/sanitizer_common_interceptors.inc
index 4639ddc..9559505 100644
--- a/lib/sanitizer_common/sanitizer_common_interceptors.inc
+++ b/lib/sanitizer_common/sanitizer_common_interceptors.inc
@@ -11,7 +11,7 @@
 // ThreadSanitizer, MemorySanitizer, etc.
 //
 // This file should be included into the tool's interceptor file,
-// which has to define it's own macros:
+// which has to define its own macros:
 //   COMMON_INTERCEPTOR_ENTER
 //   COMMON_INTERCEPTOR_ENTER_NOIGNORE
 //   COMMON_INTERCEPTOR_READ_RANGE
@@ -91,6 +91,10 @@
 #define COMMON_INTERCEPTOR_MUTEX_REPAIR(ctx, m) {}
 #endif
 
+#ifndef COMMON_INTERCEPTOR_MUTEX_INVALID
+#define COMMON_INTERCEPTOR_MUTEX_INVALID(ctx, m) {}
+#endif
+
 #ifndef COMMON_INTERCEPTOR_HANDLE_RECVMSG
 #define COMMON_INTERCEPTOR_HANDLE_RECVMSG(ctx, msg) ((void)(msg))
 #endif
@@ -143,6 +147,22 @@
 #define COMMON_INTERCEPTOR_RELEASE(ctx, u) {}
 #endif
 
+#ifndef COMMON_INTERCEPTOR_USER_CALLBACK_START
+#define COMMON_INTERCEPTOR_USER_CALLBACK_START() {}
+#endif
+
+#ifndef COMMON_INTERCEPTOR_USER_CALLBACK_END
+#define COMMON_INTERCEPTOR_USER_CALLBACK_END() {}
+#endif
+
+#ifdef SANITIZER_NLDBL_VERSION
+#define COMMON_INTERCEPT_FUNCTION_LDBL(fn)                          \
+    COMMON_INTERCEPT_FUNCTION_VER(fn, SANITIZER_NLDBL_VERSION)
+#else
+#define COMMON_INTERCEPT_FUNCTION_LDBL(fn)                          \
+    COMMON_INTERCEPT_FUNCTION(fn)
+#endif
+
 struct FileMetadata {
   // For open_memstream().
   char **addr;
@@ -192,6 +212,40 @@
 }
 #endif  // SI_NOT_WINDOWS
 
+#if SANITIZER_INTERCEPT_STRLEN
+INTERCEPTOR(SIZE_T, strlen, const char *s) {
+  // Sometimes strlen is called prior to InitializeCommonInterceptors,
+  // in which case the REAL(strlen) typically used in
+  // COMMON_INTERCEPTOR_ENTER will fail.  We use internal_strlen here
+  // to handle that.
+  if (COMMON_INTERCEPTOR_NOTHING_IS_INITIALIZED)
+    return internal_strlen(s);
+  void *ctx;
+  COMMON_INTERCEPTOR_ENTER(ctx, strlen, s);
+  SIZE_T result = REAL(strlen)(s);
+  if (common_flags()->intercept_strlen)
+    COMMON_INTERCEPTOR_READ_RANGE(ctx, s, result + 1);
+  return result;
+}
+#define INIT_STRLEN COMMON_INTERCEPT_FUNCTION(strlen)
+#else
+#define INIT_STRLEN
+#endif
+
+#if SANITIZER_INTERCEPT_STRNLEN
+INTERCEPTOR(SIZE_T, strnlen, const char *s, SIZE_T maxlen) {
+  void *ctx;
+  COMMON_INTERCEPTOR_ENTER(ctx, strnlen, s, maxlen);
+  SIZE_T length = REAL(strnlen)(s, maxlen);
+  if (common_flags()->intercept_strlen)
+    COMMON_INTERCEPTOR_READ_RANGE(ctx, s, Min(length + 1, maxlen));
+  return length;
+}
+#define INIT_STRNLEN COMMON_INTERCEPT_FUNCTION(strnlen)
+#else
+#define INIT_STRNLEN
+#endif
+
 #if SANITIZER_INTERCEPT_TEXTDOMAIN
 INTERCEPTOR(char*, textdomain, const char *domainname) {
   void *ctx;
@@ -214,13 +268,11 @@
 }
 
 DECLARE_WEAK_INTERCEPTOR_HOOK(__sanitizer_weak_hook_strcmp, uptr called_pc,
-                              const char *s1, const char *s2)
+                              const char *s1, const char *s2, int result)
 
 INTERCEPTOR(int, strcmp, const char *s1, const char *s2) {
   void *ctx;
   COMMON_INTERCEPTOR_ENTER(ctx, strcmp, s1, s2);
-  CALL_WEAK_INTERCEPTOR_HOOK(__sanitizer_weak_hook_strcmp, GET_CALLER_PC(), s1,
-                             s2);
   unsigned char c1, c2;
   uptr i;
   for (i = 0;; i++) {
@@ -230,19 +282,21 @@
   }
   COMMON_INTERCEPTOR_READ_STRING(ctx, s1, i + 1);
   COMMON_INTERCEPTOR_READ_STRING(ctx, s2, i + 1);
-  return CharCmpX(c1, c2);
+  int result = CharCmpX(c1, c2);
+  CALL_WEAK_INTERCEPTOR_HOOK(__sanitizer_weak_hook_strcmp, GET_CALLER_PC(), s1,
+                             s2, result);
+  return result;
 }
 
 DECLARE_WEAK_INTERCEPTOR_HOOK(__sanitizer_weak_hook_strncmp, uptr called_pc,
-                              const char *s1, const char *s2, uptr n)
+                              const char *s1, const char *s2, uptr n,
+                              int result)
 
 INTERCEPTOR(int, strncmp, const char *s1, const char *s2, uptr size) {
   if (COMMON_INTERCEPTOR_NOTHING_IS_INITIALIZED)
     return internal_strncmp(s1, s2, size);
   void *ctx;
   COMMON_INTERCEPTOR_ENTER(ctx, strncmp, s1, s2, size);
-  CALL_WEAK_INTERCEPTOR_HOOK(__sanitizer_weak_hook_strncmp, GET_CALLER_PC(), s1,
-                             s2, size);
   unsigned char c1 = 0, c2 = 0;
   uptr i;
   for (i = 0; i < size; i++) {
@@ -252,7 +306,10 @@
   }
   COMMON_INTERCEPTOR_READ_RANGE(ctx, s1, Min(i + 1, size));
   COMMON_INTERCEPTOR_READ_RANGE(ctx, s2, Min(i + 1, size));
-  return CharCmpX(c1, c2);
+  int result = CharCmpX(c1, c2);
+  CALL_WEAK_INTERCEPTOR_HOOK(__sanitizer_weak_hook_strncmp, GET_CALLER_PC(), s1,
+                             s2, size, result);
+  return result;
 }
 
 #define INIT_STRCMP COMMON_INTERCEPT_FUNCTION(strcmp)
@@ -349,6 +406,55 @@
 #define INIT_STRCASESTR
 #endif
 
+#if SANITIZER_INTERCEPT_STRCHR
+INTERCEPTOR(char*, strchr, const char *s, int c) {
+  void *ctx;
+  if (COMMON_INTERCEPTOR_NOTHING_IS_INITIALIZED)
+    return internal_strchr(s, c);
+  COMMON_INTERCEPTOR_ENTER(ctx, strchr, s, c);
+  char *result = REAL(strchr)(s, c);
+  uptr len = internal_strlen(s);
+  uptr n = result ? result - s + 1 : len + 1;
+  if (common_flags()->intercept_strchr)
+    COMMON_INTERCEPTOR_READ_STRING_OF_LEN(ctx, s, len, n);
+  return result;
+}
+#define INIT_STRCHR COMMON_INTERCEPT_FUNCTION(strchr)
+#else
+#define INIT_STRCHR
+#endif
+
+#if SANITIZER_INTERCEPT_STRCHRNUL
+INTERCEPTOR(char*, strchrnul, const char *s, int c) {
+  void *ctx;
+  COMMON_INTERCEPTOR_ENTER(ctx, strchrnul, s, c);
+  char *result = REAL(strchrnul)(s, c);
+  uptr len = result - s + 1;
+  if (common_flags()->intercept_strchr)
+    COMMON_INTERCEPTOR_READ_STRING(ctx, s, len);
+  return result;
+}
+#define INIT_STRCHRNUL COMMON_INTERCEPT_FUNCTION(strchrnul)
+#else
+#define INIT_STRCHRNUL
+#endif
+
+#if SANITIZER_INTERCEPT_STRRCHR
+INTERCEPTOR(char*, strrchr, const char *s, int c) {
+  void *ctx;
+  if (COMMON_INTERCEPTOR_NOTHING_IS_INITIALIZED)
+    return internal_strrchr(s, c);
+  COMMON_INTERCEPTOR_ENTER(ctx, strrchr, s, c);
+  uptr len = internal_strlen(s);
+  if (common_flags()->intercept_strchr)
+    COMMON_INTERCEPTOR_READ_STRING_OF_LEN(ctx, s, len, len + 1);
+  return REAL(strrchr)(s, c);
+}
+#define INIT_STRRCHR COMMON_INTERCEPT_FUNCTION(strrchr)
+#else
+#define INIT_STRRCHR
+#endif
+
 #if SANITIZER_INTERCEPT_STRSPN
 INTERCEPTOR(SIZE_T, strspn, const char *s1, const char *s2) {
   void *ctx;
@@ -397,18 +503,75 @@
 #define INIT_STRPBRK
 #endif
 
+#if SANITIZER_INTERCEPT_MEMSET
+INTERCEPTOR(void*, memset, void *dst, int v, uptr size) {
+  if (COMMON_INTERCEPTOR_NOTHING_IS_INITIALIZED)
+    return internal_memset(dst, v, size);
+  void *ctx;
+  COMMON_INTERCEPTOR_ENTER(ctx, memset, dst, v, size);
+  if (common_flags()->intercept_intrin)
+    COMMON_INTERCEPTOR_WRITE_RANGE(ctx, dst, size);
+  return REAL(memset)(dst, v, size);
+}
+
+#define INIT_MEMSET COMMON_INTERCEPT_FUNCTION(memset)
+#else
+#define INIT_MEMSET
+#endif
+
+#if SANITIZER_INTERCEPT_MEMMOVE
+INTERCEPTOR(void*, memmove, void *dst, const void *src, uptr size) {
+  if (COMMON_INTERCEPTOR_NOTHING_IS_INITIALIZED)
+    return internal_memmove(dst, src, size);
+  void *ctx;
+  COMMON_INTERCEPTOR_ENTER(ctx, memmove, dst, src, size);
+  if (common_flags()->intercept_intrin) {
+    COMMON_INTERCEPTOR_WRITE_RANGE(ctx, dst, size);
+    COMMON_INTERCEPTOR_READ_RANGE(ctx, src, size);
+  }
+  return REAL(memmove)(dst, src, size);
+}
+
+#define INIT_MEMMOVE COMMON_INTERCEPT_FUNCTION(memmove)
+#else
+#define INIT_MEMMOVE
+#endif
+
+#if SANITIZER_INTERCEPT_MEMCPY
+INTERCEPTOR(void*, memcpy, void *dst, const void *src, uptr size) {
+  if (COMMON_INTERCEPTOR_NOTHING_IS_INITIALIZED) {
+    // On OS X, calling internal_memcpy here will cause memory corruptions,
+    // because memcpy and memmove are actually aliases of the same
+    // implementation.  We need to use internal_memmove here.
+    return internal_memmove(dst, src, size);
+  }
+  void *ctx;
+  COMMON_INTERCEPTOR_ENTER(ctx, memcpy, dst, src, size);
+  if (common_flags()->intercept_intrin) {
+    COMMON_INTERCEPTOR_WRITE_RANGE(ctx, dst, size);
+    COMMON_INTERCEPTOR_READ_RANGE(ctx, src, size);
+  }
+  // N.B.: If we switch this to internal_ we'll have to use internal_memmove
+  // due to memcpy being an alias of memmove on OS X.
+  return REAL(memcpy)(dst, src, size);
+}
+
+#define INIT_MEMCPY COMMON_INTERCEPT_FUNCTION(memcpy)
+#else
+#define INIT_MEMCPY
+#endif
+
 #if SANITIZER_INTERCEPT_MEMCMP
 
 DECLARE_WEAK_INTERCEPTOR_HOOK(__sanitizer_weak_hook_memcmp, uptr called_pc,
-                              const void *s1, const void *s2, uptr n)
+                              const void *s1, const void *s2, uptr n,
+                              int result)
 
 INTERCEPTOR(int, memcmp, const void *a1, const void *a2, uptr size) {
   if (COMMON_INTERCEPTOR_NOTHING_IS_INITIALIZED)
     return internal_memcmp(a1, a2, size);
   void *ctx;
   COMMON_INTERCEPTOR_ENTER(ctx, memcmp, a1, a2, size);
-  CALL_WEAK_INTERCEPTOR_HOOK(__sanitizer_weak_hook_memcmp, GET_CALLER_PC(), a1,
-                             a2, size);
   if (common_flags()->intercept_memcmp) {
     if (common_flags()->strict_memcmp) {
       // Check the entire regions even if the first bytes of the buffers are
@@ -428,10 +591,16 @@
       }
       COMMON_INTERCEPTOR_READ_RANGE(ctx, s1, Min(i + 1, size));
       COMMON_INTERCEPTOR_READ_RANGE(ctx, s2, Min(i + 1, size));
-      return CharCmpX(c1, c2);
+      int r = CharCmpX(c1, c2);
+      CALL_WEAK_INTERCEPTOR_HOOK(__sanitizer_weak_hook_memcmp, GET_CALLER_PC(),
+                                 a1, a2, size, r);
+      return r;
     }
   }
-  return REAL(memcmp(a1, a2, size));
+  int result = REAL(memcmp(a1, a2, size));
+  CALL_WEAK_INTERCEPTOR_HOOK(__sanitizer_weak_hook_memcmp, GET_CALLER_PC(), a1,
+                             a2, size, result);
+  return result;
 }
 
 #define INIT_MEMCMP COMMON_INTERCEPT_FUNCTION(memcmp)
@@ -509,7 +678,7 @@
 
 #define INIT_FREXPF_FREXPL           \
   COMMON_INTERCEPT_FUNCTION(frexpf); \
-  COMMON_INTERCEPT_FUNCTION(frexpl)
+  COMMON_INTERCEPT_FUNCTION_LDBL(frexpl)
 #else
 #define INIT_FREXPF_FREXPL
 #endif  // SANITIZER_INTERCEPT_FREXPF_FREXPL
@@ -2288,7 +2457,7 @@
 #define INIT_MODF                   \
   COMMON_INTERCEPT_FUNCTION(modf);  \
   COMMON_INTERCEPT_FUNCTION(modff); \
-  COMMON_INTERCEPT_FUNCTION(modfl);
+  COMMON_INTERCEPT_FUNCTION_LDBL(modfl);
 #else
 #define INIT_MODF
 #endif
@@ -2329,6 +2498,75 @@
 #define INIT_RECVMSG
 #endif
 
+#if SANITIZER_INTERCEPT_SENDMSG
+static void read_msghdr_control(void *ctx, void *control, uptr controllen) {
+  const unsigned kCmsgDataOffset =
+      RoundUpTo(sizeof(__sanitizer_cmsghdr), sizeof(uptr));
+
+  char *p = (char *)control;
+  char *const control_end = p + controllen;
+  while (true) {
+    if (p + sizeof(__sanitizer_cmsghdr) > control_end) break;
+    __sanitizer_cmsghdr *cmsg = (__sanitizer_cmsghdr *)p;
+    COMMON_INTERCEPTOR_READ_RANGE(ctx, &cmsg->cmsg_len, sizeof(cmsg->cmsg_len));
+
+    if (p + RoundUpTo(cmsg->cmsg_len, sizeof(uptr)) > control_end) break;
+
+    COMMON_INTERCEPTOR_READ_RANGE(ctx, &cmsg->cmsg_level,
+                                  sizeof(cmsg->cmsg_level));
+    COMMON_INTERCEPTOR_READ_RANGE(ctx, &cmsg->cmsg_type,
+                                  sizeof(cmsg->cmsg_type));
+
+    if (cmsg->cmsg_len > kCmsgDataOffset) {
+      char *data = p + kCmsgDataOffset;
+      unsigned data_len = cmsg->cmsg_len - kCmsgDataOffset;
+      if (data_len > 0) COMMON_INTERCEPTOR_READ_RANGE(ctx, data, data_len);
+    }
+
+    p += RoundUpTo(cmsg->cmsg_len, sizeof(uptr));
+  }
+}
+
+static void read_msghdr(void *ctx, struct __sanitizer_msghdr *msg,
+                        SSIZE_T maxlen) {
+#define R(f) \
+  COMMON_INTERCEPTOR_READ_RANGE(ctx, &msg->msg_##f, sizeof(msg->msg_##f))
+  R(name);
+  R(namelen);
+  R(iov);
+  R(iovlen);
+  R(control);
+  R(controllen);
+  R(flags);
+#undef R
+  if (msg->msg_name && msg->msg_namelen)
+    COMMON_INTERCEPTOR_READ_RANGE(ctx, msg->msg_name, msg->msg_namelen);
+  if (msg->msg_iov && msg->msg_iovlen)
+    COMMON_INTERCEPTOR_READ_RANGE(ctx, msg->msg_iov,
+                                  sizeof(*msg->msg_iov) * msg->msg_iovlen);
+  read_iovec(ctx, msg->msg_iov, msg->msg_iovlen, maxlen);
+  if (msg->msg_control && msg->msg_controllen)
+    read_msghdr_control(ctx, msg->msg_control, msg->msg_controllen);
+}
+
+INTERCEPTOR(SSIZE_T, sendmsg, int fd, struct __sanitizer_msghdr *msg,
+            int flags) {
+  void *ctx;
+  COMMON_INTERCEPTOR_ENTER(ctx, sendmsg, fd, msg, flags);
+  if (fd >= 0) {
+    COMMON_INTERCEPTOR_FD_ACCESS(ctx, fd);
+    COMMON_INTERCEPTOR_FD_RELEASE(ctx, fd);
+  }
+  SSIZE_T res = REAL(sendmsg)(fd, msg, flags);
+  if (common_flags()->intercept_send && res >= 0 && msg)
+    read_msghdr(ctx, msg, res);
+  return res;
+}
+#define INIT_SENDMSG COMMON_INTERCEPT_FUNCTION(sendmsg);
+#else
+#define INIT_SENDMSG
+#endif
+
 #if SANITIZER_INTERCEPT_GETPEERNAME
 INTERCEPTOR(int, getpeername, int sockfd, void *addr, unsigned *addrlen) {
   void *ctx;
@@ -3296,7 +3534,9 @@
 INTERCEPTOR(void, _exit, int status) {
   void *ctx;
   COMMON_INTERCEPTOR_ENTER(ctx, _exit, status);
+  COMMON_INTERCEPTOR_USER_CALLBACK_START();
   int status1 = COMMON_INTERCEPTOR_ON_EXIT(ctx);
+  COMMON_INTERCEPTOR_USER_CALLBACK_END();
   if (status == 0) status = status1;
   REAL(_exit)(status);
 }
@@ -3314,6 +3554,8 @@
     COMMON_INTERCEPTOR_MUTEX_REPAIR(ctx, m);
   if (res == 0 || res == errno_EOWNERDEAD)
     COMMON_INTERCEPTOR_MUTEX_LOCK(ctx, m);
+  if (res == errno_EINVAL)
+    COMMON_INTERCEPTOR_MUTEX_INVALID(ctx, m);
   return res;
 }
 
@@ -3321,7 +3563,10 @@
   void *ctx;
   COMMON_INTERCEPTOR_ENTER(ctx, pthread_mutex_unlock, m);
   COMMON_INTERCEPTOR_MUTEX_UNLOCK(ctx, m);
-  return REAL(pthread_mutex_unlock)(m);
+  int res = REAL(pthread_mutex_unlock)(m);
+  if (res == errno_EINVAL)
+    COMMON_INTERCEPTOR_MUTEX_INVALID(ctx, m);
+  return res;
 }
 
 #define INIT_PTHREAD_MUTEX_LOCK COMMON_INTERCEPT_FUNCTION(pthread_mutex_lock)
@@ -3942,7 +4187,7 @@
 #define INIT_SINCOS                   \
   COMMON_INTERCEPT_FUNCTION(sincos);  \
   COMMON_INTERCEPT_FUNCTION(sincosf); \
-  COMMON_INTERCEPT_FUNCTION(sincosl);
+  COMMON_INTERCEPT_FUNCTION_LDBL(sincosl);
 #else
 #define INIT_SINCOS
 #endif
@@ -3981,7 +4226,7 @@
 #define INIT_REMQUO                   \
   COMMON_INTERCEPT_FUNCTION(remquo);  \
   COMMON_INTERCEPT_FUNCTION(remquof); \
-  COMMON_INTERCEPT_FUNCTION(remquol);
+  COMMON_INTERCEPT_FUNCTION_LDBL(remquol);
 #else
 #define INIT_REMQUO
 #endif
@@ -4012,7 +4257,7 @@
 #define INIT_LGAMMA                   \
   COMMON_INTERCEPT_FUNCTION(lgamma);  \
   COMMON_INTERCEPT_FUNCTION(lgammaf); \
-  COMMON_INTERCEPT_FUNCTION(lgammal);
+  COMMON_INTERCEPT_FUNCTION_LDBL(lgammal);
 #else
 #define INIT_LGAMMA
 #endif
@@ -4056,7 +4301,7 @@
   if (signp) COMMON_INTERCEPTOR_WRITE_RANGE(ctx, signp, sizeof(*signp));
   return res;
 }
-#define INIT_LGAMMAL_R COMMON_INTERCEPT_FUNCTION(lgammal_r);
+#define INIT_LGAMMAL_R COMMON_INTERCEPT_FUNCTION_LDBL(lgammal_r);
 #else
 #define INIT_LGAMMAL_R
 #endif
@@ -4197,6 +4442,7 @@
 #endif
 
 #if SANITIZER_INTERCEPT_TLS_GET_ADDR
+#if !SANITIZER_S390
 #define INIT_TLS_GET_ADDR COMMON_INTERCEPT_FUNCTION(__tls_get_addr)
 // If you see any crashes around this functions, there are 2 known issues with
 // it: 1. __tls_get_addr can be called with mis-aligned stack due to:
@@ -4217,6 +4463,67 @@
   }
   return res;
 }
+#if SANITIZER_PPC
+// On PowerPC, we also need to intercept __tls_get_addr_opt, which has
+// mostly the same semantics as __tls_get_addr, but its presence enables
+// some optimizations in the linker (which are safe to ignore here).
+extern "C" __attribute__((alias("__interceptor___tls_get_addr"),
+                          visibility("default")))
+void *__tls_get_addr_opt(void *arg);
+#endif
+#else // SANITIZER_S390
+// On s390, we have to intercept two functions here:
+// - __tls_get_addr_internal, which is a glibc-internal function that is like
+//   the usual __tls_get_addr, but returns a TP-relative offset instead of
+//   a proper pointer.  It is used by dlsym for TLS symbols.
+// - __tls_get_offset, which is like the above, but also takes a GOT-relative
+//   descriptor offset as an argument instead of a pointer.  GOT address
+//   is passed in r12, so it's necessary to write it in assembly.  This is
+//   the function used by the compiler.
+#define INIT_TLS_GET_ADDR COMMON_INTERCEPT_FUNCTION(__tls_get_addr_internal)
+INTERCEPTOR(uptr, __tls_get_addr_internal, void *arg) {
+  void *ctx;
+  COMMON_INTERCEPTOR_ENTER(ctx, __tls_get_addr_internal, arg);
+  uptr res = REAL(__tls_get_addr_internal)(arg);
+  uptr tp = reinterpret_cast<uptr>(__builtin_thread_pointer());
+  void *ptr = reinterpret_cast<void *>(res + tp);
+  uptr tls_begin, tls_end;
+  COMMON_INTERCEPTOR_GET_TLS_RANGE(&tls_begin, &tls_end);
+  DTLS::DTV *dtv = DTLS_on_tls_get_addr(arg, ptr, tls_begin, tls_end);
+  if (dtv) {
+    // New DTLS block has been allocated.
+    COMMON_INTERCEPTOR_INITIALIZE_RANGE((void *)dtv->beg, dtv->size);
+  }
+  return res;
+}
+// We need a protected symbol aliasing the above, so that we can jump
+// directly to it from the assembly below.
+extern "C" __attribute__((alias("__interceptor___tls_get_addr_internal"),
+                          visibility("protected")))
+uptr __interceptor___tls_get_addr_internal_protected(void *arg);
+// Now carefully intercept __tls_get_offset.
+asm(
+  ".text\n"
+  ".global __tls_get_offset\n"
+  "__tls_get_offset:\n"
+// The __interceptor_ version has to exist, so that gen_dynamic_list.py
+// exports our symbol.
+  ".global __interceptor___tls_get_offset\n"
+  "__interceptor___tls_get_offset:\n"
+#ifdef __s390x__
+  "la %r2, 0(%r2,%r12)\n"
+  "jg __interceptor___tls_get_addr_internal_protected\n"
+#else
+  "basr %r3,0\n"
+  "0: la %r2,0(%r2,%r12)\n"
+  "l %r4,1f-0b(%r3)\n"
+  "b 0(%r4,%r3)\n"
+  "1: .long __interceptor___tls_get_addr_internal_protected - 0b\n"
+#endif
+  ".type __tls_get_offset, @function\n"
+  ".size __tls_get_offset, .-__tls_get_offset\n"
+);
+#endif // SANITIZER_S390
 #else
 #define INIT_TLS_GET_ADDR
 #endif
@@ -5315,19 +5622,211 @@
 #define INIT_CTERMID_R
 #endif
 
+#if SANITIZER_INTERCEPT_RECV_RECVFROM
+INTERCEPTOR(SSIZE_T, recv, int fd, void *buf, SIZE_T len, int flags) {
+  void *ctx;
+  COMMON_INTERCEPTOR_ENTER(ctx, recv, fd, buf, len, flags);
+  COMMON_INTERCEPTOR_FD_ACCESS(ctx, fd);
+  SSIZE_T res = REAL(recv)(fd, buf, len, flags);
+  if (res > 0) {
+    COMMON_INTERCEPTOR_WRITE_RANGE(ctx, buf, Min((SIZE_T)res, len));
+  }
+  if (res >= 0 && fd >= 0) COMMON_INTERCEPTOR_FD_ACQUIRE(ctx, fd);
+  return res;
+}
+
+INTERCEPTOR(SSIZE_T, recvfrom, int fd, void *buf, SIZE_T len, int flags,
+            void *srcaddr, int *addrlen) {
+  void *ctx;
+  COMMON_INTERCEPTOR_ENTER(ctx, recvfrom, fd, buf, len, flags, srcaddr,
+                           addrlen);
+  COMMON_INTERCEPTOR_FD_ACCESS(ctx, fd);
+  SIZE_T srcaddr_sz;
+  if (srcaddr) srcaddr_sz = *addrlen;
+  (void)srcaddr_sz;  // prevent "set but not used" warning
+  SSIZE_T res = REAL(recvfrom)(fd, buf, len, flags, srcaddr, addrlen);
+  if (res > 0) {
+    COMMON_INTERCEPTOR_WRITE_RANGE(ctx, buf, Min((SIZE_T)res, len));
+    if (srcaddr)
+      COMMON_INTERCEPTOR_INITIALIZE_RANGE(srcaddr,
+                                          Min((SIZE_T)*addrlen, srcaddr_sz));
+  }
+  return res;
+}
+#define INIT_RECV_RECVFROM          \
+  COMMON_INTERCEPT_FUNCTION(recv);  \
+  COMMON_INTERCEPT_FUNCTION(recvfrom);
+#else
+#define INIT_RECV_RECVFROM
+#endif
+
+#if SANITIZER_INTERCEPT_SEND_SENDTO
+INTERCEPTOR(SSIZE_T, send, int fd, void *buf, SIZE_T len, int flags) {
+  void *ctx;
+  COMMON_INTERCEPTOR_ENTER(ctx, send, fd, buf, len, flags);
+  if (fd >= 0) {
+    COMMON_INTERCEPTOR_FD_ACCESS(ctx, fd);
+    COMMON_INTERCEPTOR_FD_RELEASE(ctx, fd);
+  }
+  SSIZE_T res = REAL(send)(fd, buf, len, flags);
+  if (common_flags()->intercept_send && res > 0)
+    COMMON_INTERCEPTOR_READ_RANGE(ctx, buf, Min((SIZE_T)res, len));
+  return res;
+}
+
+INTERCEPTOR(SSIZE_T, sendto, int fd, void *buf, SIZE_T len, int flags,
+            void *dstaddr, int addrlen) {
+  void *ctx;
+  COMMON_INTERCEPTOR_ENTER(ctx, sendto, fd, buf, len, flags, dstaddr, addrlen);
+  if (fd >= 0) {
+    COMMON_INTERCEPTOR_FD_ACCESS(ctx, fd);
+    COMMON_INTERCEPTOR_FD_RELEASE(ctx, fd);
+  }
+  // Can't check dstaddr as it may have uninitialized padding at the end.
+  SSIZE_T res = REAL(sendto)(fd, buf, len, flags, dstaddr, addrlen);
+  if (common_flags()->intercept_send && res > 0)
+    COMMON_INTERCEPTOR_READ_RANGE(ctx, buf, Min((SIZE_T)res, len));
+  return res;
+}
+#define INIT_SEND_SENDTO           \
+  COMMON_INTERCEPT_FUNCTION(send); \
+  COMMON_INTERCEPT_FUNCTION(sendto);
+#else
+#define INIT_SEND_SENDTO
+#endif
+
+#if SANITIZER_INTERCEPT_EVENTFD_READ_WRITE
+INTERCEPTOR(int, eventfd_read, int fd, u64 *value) {
+  void *ctx;
+  COMMON_INTERCEPTOR_ENTER(ctx, eventfd_read, fd, value);
+  COMMON_INTERCEPTOR_FD_ACCESS(ctx, fd);
+  int res = REAL(eventfd_read)(fd, value);
+  if (res == 0) {
+    COMMON_INTERCEPTOR_WRITE_RANGE(ctx, value, sizeof(*value));
+    if (fd >= 0) COMMON_INTERCEPTOR_FD_ACQUIRE(ctx, fd);
+  }
+  return res;
+}
+INTERCEPTOR(int, eventfd_write, int fd, u64 value) {
+  void *ctx;
+  COMMON_INTERCEPTOR_ENTER(ctx, eventfd_write, fd, value);
+  if (fd >= 0) {
+    COMMON_INTERCEPTOR_FD_ACCESS(ctx, fd);
+    COMMON_INTERCEPTOR_FD_RELEASE(ctx, fd);
+  }
+  int res = REAL(eventfd_write)(fd, value);
+  return res;
+}
+#define INIT_EVENTFD_READ_WRITE            \
+  COMMON_INTERCEPT_FUNCTION(eventfd_read); \
+  COMMON_INTERCEPT_FUNCTION(eventfd_write)
+#else
+#define INIT_EVENTFD_READ_WRITE
+#endif
+
+#if SANITIZER_INTERCEPT_STAT
+INTERCEPTOR(int, stat, const char *path, void *buf) {
+  void *ctx;
+  COMMON_INTERCEPTOR_ENTER(ctx, stat, path, buf);
+  if (common_flags()->intercept_stat)
+    COMMON_INTERCEPTOR_READ_STRING(ctx, path, 0);
+  int res = REAL(stat)(path, buf);
+  if (!res)
+    COMMON_INTERCEPTOR_WRITE_RANGE(ctx, buf, __sanitizer::struct_stat_sz);
+  return res;
+}
+#define INIT_STAT COMMON_INTERCEPT_FUNCTION(stat)
+#else
+#define INIT_STAT
+#endif
+
+#if SANITIZER_INTERCEPT___XSTAT
+INTERCEPTOR(int, __xstat, int version, const char *path, void *buf) {
+  void *ctx;
+  COMMON_INTERCEPTOR_ENTER(ctx, __xstat, version, path, buf);
+  if (common_flags()->intercept_stat)
+    COMMON_INTERCEPTOR_READ_STRING(ctx, path, 0);
+  int res = REAL(__xstat)(version, path, buf);
+  if (!res)
+    COMMON_INTERCEPTOR_WRITE_RANGE(ctx, buf, __sanitizer::struct_stat_sz);
+  return res;
+}
+#define INIT___XSTAT COMMON_INTERCEPT_FUNCTION(__xstat)
+#else
+#define INIT___XSTAT
+#endif
+
+#if SANITIZER_INTERCEPT___XSTAT64
+INTERCEPTOR(int, __xstat64, int version, const char *path, void *buf) {
+  void *ctx;
+  COMMON_INTERCEPTOR_ENTER(ctx, __xstat64, version, path, buf);
+  if (common_flags()->intercept_stat)
+    COMMON_INTERCEPTOR_READ_STRING(ctx, path, 0);
+  int res = REAL(__xstat64)(version, path, buf);
+  if (!res)
+    COMMON_INTERCEPTOR_WRITE_RANGE(ctx, buf, __sanitizer::struct_stat64_sz);
+  return res;
+}
+#define INIT___XSTAT64 COMMON_INTERCEPT_FUNCTION(__xstat64)
+#else
+#define INIT___XSTAT64
+#endif
+
+#if SANITIZER_INTERCEPT___LXSTAT
+INTERCEPTOR(int, __lxstat, int version, const char *path, void *buf) {
+  void *ctx;
+  COMMON_INTERCEPTOR_ENTER(ctx, __lxstat, version, path, buf);
+  if (common_flags()->intercept_stat)
+    COMMON_INTERCEPTOR_READ_STRING(ctx, path, 0);
+  int res = REAL(__lxstat)(version, path, buf);
+  if (!res)
+    COMMON_INTERCEPTOR_WRITE_RANGE(ctx, buf, __sanitizer::struct_stat_sz);
+  return res;
+}
+#define INIT___LXSTAT COMMON_INTERCEPT_FUNCTION(__lxstat)
+#else
+#define INIT___LXSTAT
+#endif
+
+#if SANITIZER_INTERCEPT___LXSTAT64
+INTERCEPTOR(int, __lxstat64, int version, const char *path, void *buf) {
+  void *ctx;
+  COMMON_INTERCEPTOR_ENTER(ctx, __lxstat64, version, path, buf);
+  if (common_flags()->intercept_stat)
+    COMMON_INTERCEPTOR_READ_STRING(ctx, path, 0);
+  int res = REAL(__lxstat64)(version, path, buf);
+  if (!res)
+    COMMON_INTERCEPTOR_WRITE_RANGE(ctx, buf, __sanitizer::struct_stat64_sz);
+  return res;
+}
+#define INIT___LXSTAT64 COMMON_INTERCEPT_FUNCTION(__lxstat64)
+#else
+#define INIT___LXSTAT64
+#endif
+
+// FIXME: add other *stat interceptors.
+
 static void InitializeCommonInterceptors() {
   static u64 metadata_mem[sizeof(MetadataHashMap) / sizeof(u64) + 1];
   interceptor_metadata_map = new((void *)&metadata_mem) MetadataHashMap();
 
   INIT_TEXTDOMAIN;
+  INIT_STRLEN;
+  INIT_STRNLEN;
   INIT_STRCMP;
   INIT_STRNCMP;
   INIT_STRCASECMP;
   INIT_STRNCASECMP;
   INIT_STRSTR;
   INIT_STRCASESTR;
+  INIT_STRCHR;
+  INIT_STRCHRNUL;
+  INIT_STRRCHR;
   INIT_STRSPN;
   INIT_STRPBRK;
+  INIT_MEMSET;
+  INIT_MEMMOVE;
+  INIT_MEMCPY;
   INIT_MEMCHR;
   INIT_MEMCMP;
   INIT_MEMRCHR;
@@ -5380,6 +5879,7 @@
   INIT_ACCEPT4;
   INIT_MODF;
   INIT_RECVMSG;
+  INIT_SENDMSG;
   INIT_GETPEERNAME;
   INIT_IOCTL;
   INIT_INET_ATON;
@@ -5491,4 +5991,13 @@
   INIT_PROCESS_VM_READV;
   INIT_CTERMID;
   INIT_CTERMID_R;
+  INIT_RECV_RECVFROM;
+  INIT_SEND_SENDTO;
+  INIT_STAT;
+  INIT_EVENTFD_READ_WRITE;
+  INIT___XSTAT;
+  INIT___XSTAT64;
+  INIT___LXSTAT;
+  INIT___LXSTAT64;
+  // FIXME: add other *stat interceptors.
 }
diff --git a/lib/sanitizer_common/sanitizer_common_interceptors_ioctl.inc b/lib/sanitizer_common/sanitizer_common_interceptors_ioctl.inc
index fcd0a3d..959c622 100644
--- a/lib/sanitizer_common/sanitizer_common_interceptors_ioctl.inc
+++ b/lib/sanitizer_common/sanitizer_common_interceptors_ioctl.inc
@@ -53,25 +53,9 @@
   _(FIONBIO, READ, sizeof(int));
   _(FIONCLEX, NONE, 0);
   _(FIOSETOWN, READ, sizeof(int));
-  _(SIOCADDMULTI, READ, struct_ifreq_sz);
   _(SIOCATMARK, WRITE, sizeof(int));
-  _(SIOCDELMULTI, READ, struct_ifreq_sz);
-  _(SIOCGIFADDR, WRITE, struct_ifreq_sz);
-  _(SIOCGIFBRDADDR, WRITE, struct_ifreq_sz);
   _(SIOCGIFCONF, CUSTOM, 0);
-  _(SIOCGIFDSTADDR, WRITE, struct_ifreq_sz);
-  _(SIOCGIFFLAGS, WRITE, struct_ifreq_sz);
-  _(SIOCGIFMETRIC, WRITE, struct_ifreq_sz);
-  _(SIOCGIFMTU, WRITE, struct_ifreq_sz);
-  _(SIOCGIFNETMASK, WRITE, struct_ifreq_sz);
   _(SIOCGPGRP, WRITE, sizeof(int));
-  _(SIOCSIFADDR, READ, struct_ifreq_sz);
-  _(SIOCSIFBRDADDR, READ, struct_ifreq_sz);
-  _(SIOCSIFDSTADDR, READ, struct_ifreq_sz);
-  _(SIOCSIFFLAGS, READ, struct_ifreq_sz);
-  _(SIOCSIFMETRIC, READ, struct_ifreq_sz);
-  _(SIOCSIFMTU, READ, struct_ifreq_sz);
-  _(SIOCSIFNETMASK, READ, struct_ifreq_sz);
   _(SIOCSPGRP, READ, sizeof(int));
   _(TIOCCONS, NONE, 0);
   _(TIOCEXCL, NONE, 0);
@@ -92,6 +76,25 @@
   _(TIOCSTI, READ, sizeof(char));
   _(TIOCSWINSZ, READ, struct_winsize_sz);
 
+#if !SANITIZER_IOS
+  _(SIOCADDMULTI, READ, struct_ifreq_sz);
+  _(SIOCDELMULTI, READ, struct_ifreq_sz);
+  _(SIOCGIFADDR, WRITE, struct_ifreq_sz);
+  _(SIOCGIFBRDADDR, WRITE, struct_ifreq_sz);
+  _(SIOCGIFDSTADDR, WRITE, struct_ifreq_sz);
+  _(SIOCGIFFLAGS, WRITE, struct_ifreq_sz);
+  _(SIOCGIFMETRIC, WRITE, struct_ifreq_sz);
+  _(SIOCGIFMTU, WRITE, struct_ifreq_sz);
+  _(SIOCGIFNETMASK, WRITE, struct_ifreq_sz);
+  _(SIOCSIFADDR, READ, struct_ifreq_sz);
+  _(SIOCSIFBRDADDR, READ, struct_ifreq_sz);
+  _(SIOCSIFDSTADDR, READ, struct_ifreq_sz);
+  _(SIOCSIFFLAGS, READ, struct_ifreq_sz);
+  _(SIOCSIFMETRIC, READ, struct_ifreq_sz);
+  _(SIOCSIFMTU, READ, struct_ifreq_sz);
+  _(SIOCSIFNETMASK, READ, struct_ifreq_sz);
+#endif
+
 #if (SANITIZER_LINUX && !SANITIZER_ANDROID)
   _(SIOCGETSGCNT, WRITE, struct_sioc_sg_req_sz);
   _(SIOCGETVIFCNT, WRITE, struct_sioc_vif_req_sz);
diff --git a/lib/sanitizer_common/sanitizer_common_libcdep.cc b/lib/sanitizer_common/sanitizer_common_libcdep.cc
index b5d46f2..596f5bc 100644
--- a/lib/sanitizer_common/sanitizer_common_libcdep.cc
+++ b/lib/sanitizer_common/sanitizer_common_libcdep.cc
@@ -125,9 +125,6 @@
   char *p = msg_copy.data();
   char *q;
 
-  // Remove color sequences since syslogs cannot print them.
-  RemoveANSIEscapeSequencesFromString(p);
-
   // Print one line at a time.
   // syslog, at least on Android, has an implicit message length limit.
   do {
diff --git a/lib/sanitizer_common/sanitizer_common_nolibc.cc b/lib/sanitizer_common/sanitizer_common_nolibc.cc
index 89c17e0..e24cf99 100644
--- a/lib/sanitizer_common/sanitizer_common_nolibc.cc
+++ b/lib/sanitizer_common/sanitizer_common_nolibc.cc
@@ -19,8 +19,10 @@
 
 #if SANITIZER_LINUX
 bool ShouldLogAfterPrintf() { return false; }
+void LogMessageOnPrintf(const char *str) {}
 #endif
 void WriteToSyslog(const char *buffer) {}
 void Abort() { internal__exit(1); }
+void SleepForSeconds(int seconds) { internal_sleep(seconds); }
 
 }  // namespace __sanitizer
diff --git a/lib/sanitizer_common/sanitizer_common_syscalls.inc b/lib/sanitizer_common/sanitizer_common_syscalls.inc
index 008e577..469c8eb 100644
--- a/lib/sanitizer_common/sanitizer_common_syscalls.inc
+++ b/lib/sanitizer_common/sanitizer_common_syscalls.inc
@@ -1237,17 +1237,15 @@
 PRE_SYSCALL(pipe)(void *fildes) {}
 
 POST_SYSCALL(pipe)(long res, void *fildes) {
-  if (res >= 0) {
-    if (fildes) POST_WRITE(fildes, sizeof(int));
-  }
+  if (res >= 0)
+    if (fildes) POST_WRITE(fildes, sizeof(int) * 2);
 }
 
 PRE_SYSCALL(pipe2)(void *fildes, long flags) {}
 
 POST_SYSCALL(pipe2)(long res, void *fildes, long flags) {
-  if (res >= 0) {
-    if (fildes) POST_WRITE(fildes, sizeof(int));
-  }
+  if (res >= 0)
+    if (fildes) POST_WRITE(fildes, sizeof(int) * 2);
 }
 
 PRE_SYSCALL(dup)(long fildes) {}
@@ -1880,13 +1878,11 @@
 
 POST_SYSCALL(socket)(long res, long arg0, long arg1, long arg2) {}
 
-PRE_SYSCALL(socketpair)(long arg0, long arg1, long arg2, void *arg3) {}
+PRE_SYSCALL(socketpair)(long arg0, long arg1, long arg2, int *sv) {}
 
-POST_SYSCALL(socketpair)(long res, long arg0, long arg1, long arg2,
-                         void *arg3) {
-  if (res >= 0) {
-    if (arg3) POST_WRITE(arg3, sizeof(int));
-  }
+POST_SYSCALL(socketpair)(long res, long arg0, long arg1, long arg2, int *sv) {
+  if (res >= 0)
+    if (sv) POST_WRITE(sv, sizeof(int) * 2);
 }
 
 PRE_SYSCALL(socketcall)(long call, void *args) {}
@@ -2301,7 +2297,7 @@
 PRE_SYSCALL(ptrace)(long request, long pid, long addr, long data) {
 #if !SANITIZER_ANDROID && \
     (defined(__i386) || defined(__x86_64) || defined(__mips64) || \
-     defined(__powerpc64__) || defined(__aarch64__))
+     defined(__powerpc64__) || defined(__aarch64__) || defined(__s390__))
   if (data) {
     if (request == ptrace_setregs) {
       PRE_READ((void *)data, struct_user_regs_struct_sz);
@@ -2322,7 +2318,7 @@
 POST_SYSCALL(ptrace)(long res, long request, long pid, long addr, long data) {
 #if !SANITIZER_ANDROID && \
     (defined(__i386) || defined(__x86_64) || defined(__mips64) || \
-     defined(__powerpc64__) || defined(__aarch64__))
+     defined(__powerpc64__) || defined(__aarch64__) || defined(__s390__))
   if (res >= 0 && data) {
     // Note that this is different from the interceptor in
     // sanitizer_common_interceptors.inc.
@@ -2844,6 +2840,40 @@
 POST_SYSCALL(vfork)(long res) {
   COMMON_SYSCALL_POST_FORK(res);
 }
+
+PRE_SYSCALL(sigaction)(long signum, const __sanitizer_kernel_sigaction_t *act,
+                       __sanitizer_kernel_sigaction_t *oldact) {
+  if (act) {
+    PRE_READ(&act->sigaction, sizeof(act->sigaction));
+    PRE_READ(&act->sa_flags, sizeof(act->sa_flags));
+    PRE_READ(&act->sa_mask, sizeof(act->sa_mask));
+  }
+}
+
+POST_SYSCALL(sigaction)(long res, long signum,
+                        const __sanitizer_kernel_sigaction_t *act,
+                        __sanitizer_kernel_sigaction_t *oldact) {
+  if (res >= 0 && oldact) POST_WRITE(oldact, sizeof(*oldact));
+}
+
+PRE_SYSCALL(rt_sigaction)(long signum,
+                          const __sanitizer_kernel_sigaction_t *act,
+                          __sanitizer_kernel_sigaction_t *oldact, SIZE_T sz) {
+  if (act) {
+    PRE_READ(&act->sigaction, sizeof(act->sigaction));
+    PRE_READ(&act->sa_flags, sizeof(act->sa_flags));
+    PRE_READ(&act->sa_mask, sz);
+  }
+}
+
+POST_SYSCALL(rt_sigaction)(long res, long signum,
+                           const __sanitizer_kernel_sigaction_t *act,
+                           __sanitizer_kernel_sigaction_t *oldact, SIZE_T sz) {
+  if (res >= 0 && oldact) {
+    SIZE_T oldact_sz = ((char *)&oldact->sa_mask) - ((char *)oldact) + sz;
+    POST_WRITE(oldact, oldact_sz);
+  }
+}
 }  // extern "C"
 
 #undef PRE_SYSCALL
diff --git a/lib/sanitizer_common/sanitizer_coverage_libcdep.cc b/lib/sanitizer_common/sanitizer_coverage_libcdep.cc
index b9833c5..51b53d3 100644
--- a/lib/sanitizer_common/sanitizer_coverage_libcdep.cc
+++ b/lib/sanitizer_common/sanitizer_coverage_libcdep.cc
@@ -49,6 +49,8 @@
 
 static const u64 kMagic64 = 0xC0BFFFFFFFFFFF64ULL;
 static const u64 kMagic32 = 0xC0BFFFFFFFFFFF32ULL;
+static const uptr kNumWordsForMagic = SANITIZER_WORDSIZE == 64 ? 1 : 2;
+static const u64 kMagic = SANITIZER_WORDSIZE == 64 ? kMagic64 : kMagic32;
 
 static atomic_uint32_t dump_once_guard;  // Ensure that CovDump runs only once.
 
@@ -107,11 +109,19 @@
   uptr Update8bitCounterBitsetAndClearCounters(u8 *bitset);
 
   uptr *data();
-  uptr size();
+  uptr size() const;
+  uptr *buffer() const { return pc_buffer; }
 
  private:
+  struct NamedPcRange {
+    const char *copied_module_name;
+    uptr beg, end; // elements [beg,end) in pc_array.
+  };
+
   void DirectOpen();
   void UpdateModuleNameVec(uptr caller_pc, uptr range_beg, uptr range_end);
+  void GetRangeOffsets(const NamedPcRange& r, Symbolizer* s,
+      InternalMmapVector<uptr>* offsets) const;
 
   // Maximal size pc array may ever grow.
   // We MmapNoReserve this space to ensure that the array is contiguous.
@@ -133,14 +143,11 @@
   // Descriptor of the file mapped pc array.
   fd_t pc_fd;
 
+  uptr *pc_buffer;
+
   // Vector of coverage guard arrays, protected by mu.
   InternalMmapVectorNoCtor<s32*> guard_array_vec;
 
-  struct NamedPcRange {
-    const char *copied_module_name;
-    uptr beg, end; // elements [beg,end) in pc_array.
-  };
-
   // Vector of module and compilation unit pc ranges.
   InternalMmapVectorNoCtor<NamedPcRange> comp_unit_name_vec;
   InternalMmapVectorNoCtor<NamedPcRange> module_name_vec;
@@ -209,6 +216,11 @@
     atomic_store(&pc_array_size, kPcArrayMaxSize, memory_order_relaxed);
   }
 
+  pc_buffer = nullptr;
+  if (common_flags()->coverage_pc_buffer)
+    pc_buffer = reinterpret_cast<uptr *>(MmapNoReserveOrDie(
+        sizeof(uptr) * kPcArrayMaxSize, "CovInit::pc_buffer"));
+
   cc_array = reinterpret_cast<uptr **>(MmapNoReserveOrDie(
       sizeof(uptr *) * kCcArrayMaxSize, "CovInit::cc_array"));
   atomic_store(&cc_array_size, kCcArrayMaxSize, memory_order_relaxed);
@@ -246,6 +258,10 @@
     UnmapOrDie(cc_array, sizeof(uptr *) * kCcArrayMaxSize);
     cc_array = nullptr;
   }
+  if (pc_buffer) {
+    UnmapOrDie(pc_buffer, sizeof(uptr) * kPcArrayMaxSize);
+    pc_buffer = nullptr;
+  }
   if (tr_event_array) {
     UnmapOrDie(tr_event_array,
                sizeof(tr_event_array[0]) * kTrEventArrayMaxSize +
@@ -414,6 +430,7 @@
            atomic_load(&pc_array_size, memory_order_acquire));
   uptr counter = atomic_fetch_add(&coverage_counter, 1, memory_order_relaxed);
   pc_array[idx] = BundlePcAndCounter(pc, counter);
+  if (pc_buffer) pc_buffer[counter] = pc;
 }
 
 // Registers a pair caller=>callee.
@@ -512,7 +529,7 @@
   return pc_array;
 }
 
-uptr CoverageData::size() {
+uptr CoverageData::size() const {
   return atomic_load(&pc_array_index, memory_order_relaxed);
 }
 
@@ -742,41 +759,96 @@
   }
 }
 
+
+void CoverageData::GetRangeOffsets(const NamedPcRange& r, Symbolizer* sym,
+    InternalMmapVector<uptr>* offsets) const {
+  offsets->clear();
+  for (uptr i = 0; i < kNumWordsForMagic; i++)
+    offsets->push_back(0);
+  CHECK(r.copied_module_name);
+  CHECK_LE(r.beg, r.end);
+  CHECK_LE(r.end, size());
+  for (uptr i = r.beg; i < r.end; i++) {
+    uptr pc = UnbundlePc(pc_array[i]);
+    uptr counter = UnbundleCounter(pc_array[i]);
+    if (!pc) continue; // Not visited.
+    uptr offset = 0;
+    sym->GetModuleNameAndOffsetForPC(pc, nullptr, &offset);
+    offsets->push_back(BundlePcAndCounter(offset, counter));
+  }
+
+  CHECK_GE(offsets->size(), kNumWordsForMagic);
+  SortArray(offsets->data(), offsets->size());
+  for (uptr i = 0; i < offsets->size(); i++)
+    (*offsets)[i] = UnbundlePc((*offsets)[i]);
+}
+
+static void GenerateHtmlReport(const InternalMmapVector<char *> &cov_files) {
+  if (!common_flags()->html_cov_report) {
+    return;
+  }
+  char *sancov_path = FindPathToBinary(common_flags()->sancov_path);
+  if (sancov_path == nullptr) {
+    return;
+  }
+
+  InternalMmapVector<char *> sancov_argv(cov_files.size() * 2 + 3);
+  sancov_argv.push_back(sancov_path);
+  sancov_argv.push_back(internal_strdup("-html-report"));
+  auto argv_deleter = at_scope_exit([&] {
+    for (uptr i = 0; i < sancov_argv.size(); ++i) {
+      InternalFree(sancov_argv[i]);
+    }
+  });
+
+  for (const auto &cov_file : cov_files) {
+    sancov_argv.push_back(internal_strdup(cov_file));
+  }
+
+  {
+    ListOfModules modules;
+    modules.init();
+    for (const LoadedModule &module : modules) {
+      sancov_argv.push_back(internal_strdup(module.full_name()));
+    }
+  }
+
+  InternalScopedString report_path(kMaxPathLength);
+  fd_t report_fd =
+      CovOpenFile(&report_path, false /* packed */, GetProcessName(), "html");
+  int pid = StartSubprocess(sancov_argv[0], sancov_argv.data(),
+                            kInvalidFd /* stdin */, report_fd /* std_out */);
+  if (pid > 0) {
+    int result = WaitForProcess(pid);
+    if (result == 0)
+      Printf("coverage report generated to %s\n", report_path.data());
+  }
+}
+
 void CoverageData::DumpOffsets() {
   auto sym = Symbolizer::GetOrInit();
   if (!common_flags()->coverage_pcs) return;
   CHECK_NE(sym, nullptr);
   InternalMmapVector<uptr> offsets(0);
   InternalScopedString path(kMaxPathLength);
-  for (uptr m = 0; m < module_name_vec.size(); m++) {
-    offsets.clear();
-    uptr num_words_for_magic = SANITIZER_WORDSIZE == 64 ? 1 : 2;
-    for (uptr i = 0; i < num_words_for_magic; i++)
-      offsets.push_back(0);
-    auto r = module_name_vec[m];
-    CHECK(r.copied_module_name);
-    CHECK_LE(r.beg, r.end);
-    CHECK_LE(r.end, size());
-    for (uptr i = r.beg; i < r.end; i++) {
-      uptr pc = UnbundlePc(pc_array[i]);
-      uptr counter = UnbundleCounter(pc_array[i]);
-      if (!pc) continue; // Not visited.
-      uptr offset = 0;
-      sym->GetModuleNameAndOffsetForPC(pc, nullptr, &offset);
-      offsets.push_back(BundlePcAndCounter(offset, counter));
+
+  InternalMmapVector<char *> cov_files(module_name_vec.size());
+  auto cov_files_deleter = at_scope_exit([&] {
+    for (uptr i = 0; i < cov_files.size(); ++i) {
+      InternalFree(cov_files[i]);
     }
+  });
 
-    CHECK_GE(offsets.size(), num_words_for_magic);
-    SortArray(offsets.data(), offsets.size());
-    for (uptr i = 0; i < offsets.size(); i++)
-      offsets[i] = UnbundlePc(offsets[i]);
+  for (uptr m = 0; m < module_name_vec.size(); m++) {
+    auto r = module_name_vec[m];
+    GetRangeOffsets(r, sym, &offsets);
 
-    uptr num_offsets = offsets.size() - num_words_for_magic;
+    uptr num_offsets = offsets.size() - kNumWordsForMagic;
     u64 *magic_p = reinterpret_cast<u64*>(offsets.data());
     CHECK_EQ(*magic_p, 0ULL);
     // FIXME: we may want to write 32-bit offsets even in 64-mode
     // if all the offsets are small enough.
-    *magic_p = SANITIZER_WORDSIZE == 64 ? kMagic64 : kMagic32;
+    *magic_p = kMagic;
 
     const char *module_name = StripModuleName(r.copied_module_name);
     if (cov_sandboxed) {
@@ -791,11 +863,14 @@
       if (fd == kInvalidFd) continue;
       WriteToFile(fd, offsets.data(), offsets.size() * sizeof(offsets[0]));
       CloseFile(fd);
+      cov_files.push_back(internal_strdup(path.data()));
       VReport(1, " CovDump: %s: %zd PCs written\n", path.data(), num_offsets);
     }
   }
   if (cov_fd != kInvalidFd)
     CloseFile(cov_fd);
+
+  GenerateHtmlReport(cov_files);
 }
 
 void CoverageData::DumpAll() {
@@ -944,6 +1019,12 @@
 }
 
 SANITIZER_INTERFACE_ATTRIBUTE
+uptr __sanitizer_get_coverage_pc_buffer(uptr **data) {
+  *data = coverage_data.buffer();
+  return __sanitizer_get_total_unique_coverage();
+}
+
+SANITIZER_INTERFACE_ATTRIBUTE
 uptr __sanitizer_get_number_of_counters() {
   return coverage_data.GetNumberOf8bitCounters();
 }
diff --git a/lib/sanitizer_common/sanitizer_coverage_mapping_libcdep.cc b/lib/sanitizer_common/sanitizer_coverage_mapping_libcdep.cc
index c8b5d90..3477b06 100644
--- a/lib/sanitizer_common/sanitizer_coverage_mapping_libcdep.cc
+++ b/lib/sanitizer_common/sanitizer_coverage_mapping_libcdep.cc
@@ -72,26 +72,21 @@
   InternalScopedString text(kMaxTextSize);
 
   {
-    InternalScopedBuffer<LoadedModule> modules(kMaxNumberOfModules);
-    CHECK(modules.data());
-    int n_modules = GetListOfModules(modules.data(), kMaxNumberOfModules,
-                                     /* filter */ nullptr);
-
     text.append("%d\n", sizeof(uptr) * 8);
-    for (int i = 0; i < n_modules; ++i) {
-      const char *module_name = StripModuleName(modules[i].full_name());
-      uptr base = modules[i].base_address();
-      for (auto iter = modules[i].ranges(); iter.hasNext();) {
-        const auto *range = iter.next();
-        if (range->executable) {
-          uptr start = range->beg;
-          uptr end = range->end;
+    ListOfModules modules;
+    modules.init();
+    for (const LoadedModule &module : modules) {
+      const char *module_name = StripModuleName(module.full_name());
+      uptr base = module.base_address();
+      for (const auto &range : module.ranges()) {
+        if (range.executable) {
+          uptr start = range.beg;
+          uptr end = range.end;
           text.append("%zx %zx %zx %s\n", start, end, base, module_name);
           if (caller_pc && caller_pc >= start && caller_pc < end)
             cached_mapping.SetModuleRange(start, end);
         }
       }
-      modules[i].clear();
     }
   }
 
diff --git a/lib/sanitizer_common/sanitizer_deadlock_detector1.cc b/lib/sanitizer_common/sanitizer_deadlock_detector1.cc
index bd57a40..68a99d2 100644
--- a/lib/sanitizer_common/sanitizer_deadlock_detector1.cc
+++ b/lib/sanitizer_common/sanitizer_deadlock_detector1.cc
@@ -119,11 +119,16 @@
 
 void DD::ReportDeadlock(DDCallback *cb, DDMutex *m) {
   DDLogicalThread *lt = cb->lt;
-  uptr path[10];
+  uptr path[20];
   uptr len = dd.findPathToLock(&lt->dd, m->id, path, ARRAY_SIZE(path));
-  CHECK_GT(len, 0U);  // Hm.. cycle of 10 locks? I'd like to see that.
+  if (len == 0U) {
+    // A cycle of 20+ locks? Well, that's a bit odd...
+    Printf("WARNING: too long mutex cycle found\n");
+    return;
+  }
   CHECK_EQ(m->id, path[0]);
   lt->report_pending = true;
+  len = Min<uptr>(len, DDReport::kMaxLoopSize);
   DDReport *rep = &lt->rep;
   rep->n = len;
   for (uptr i = 0; i < len; i++) {
diff --git a/lib/sanitizer_common/sanitizer_deadlock_detector_interface.h b/lib/sanitizer_common/sanitizer_deadlock_detector_interface.h
index b6e91a1..11674df 100644
--- a/lib/sanitizer_common/sanitizer_deadlock_detector_interface.h
+++ b/lib/sanitizer_common/sanitizer_deadlock_detector_interface.h
@@ -51,7 +51,7 @@
 };
 
 struct DDReport {
-  enum { kMaxLoopSize = 8 };
+  enum { kMaxLoopSize = 20 };
   int n;  // number of entries in loop
   struct {
     u64 thr_ctx;   // user thread context
diff --git a/lib/sanitizer_common/sanitizer_flags.cc b/lib/sanitizer_common/sanitizer_flags.cc
index 18b9ea3..c2f19d4 100644
--- a/lib/sanitizer_common/sanitizer_flags.cc
+++ b/lib/sanitizer_common/sanitizer_flags.cc
@@ -45,17 +45,44 @@
   internal_memcpy(this, &other, sizeof(*this));
 }
 
-// Copy the string from "s" to "out", replacing "%b" with the binary basename.
-static void SubstituteBinaryName(const char *s, char *out, uptr out_size) {
+// Copy the string from "s" to "out", making the following substitutions:
+// %b = binary basename
+// %p = pid
+void SubstituteForFlagValue(const char *s, char *out, uptr out_size) {
   char *out_end = out + out_size;
   while (*s && out < out_end - 1) {
-    if (s[0] != '%' || s[1] != 'b') { *out++ = *s++; continue; }
-    const char *base = GetProcessName();
-    CHECK(base);
-    while (*base && out < out_end - 1)
-      *out++ = *base++;
-    s += 2; // skip "%b"
+    if (s[0] != '%') {
+      *out++ = *s++;
+      continue;
+    }
+    switch (s[1]) {
+      case 'b': {
+        const char *base = GetProcessName();
+        CHECK(base);
+        while (*base && out < out_end - 1)
+          *out++ = *base++;
+        s += 2; // skip "%b"
+        break;
+      }
+      case 'p': {
+        int pid = internal_getpid();
+        char buf[32];
+        char *buf_pos = buf + 32;
+        do {
+          *--buf_pos = (pid % 10) + '0';
+          pid /= 10;
+        } while (pid);
+        while (buf_pos < buf + 32 && out < out_end - 1)
+          *out++ = *buf_pos++;
+        s += 2; // skip "%p"
+        break;
+      }
+      default:
+        *out++ = *s++;
+        break;
+    }
   }
+  CHECK(out < out_end - 1);
   *out = '\0';
 }
 
@@ -69,7 +96,7 @@
   bool Parse(const char *value) final {
     if (internal_strchr(value, '%')) {
       char *buf = (char *)MmapOrDie(kMaxPathLength, "FlagHandlerInclude");
-      SubstituteBinaryName(value, buf, kMaxPathLength);
+      SubstituteForFlagValue(value, buf, kMaxPathLength);
       bool res = parser_->ParseFile(buf, ignore_missing_);
       UnmapOrDie(buf, kMaxPathLength);
       return res;
@@ -99,4 +126,10 @@
   RegisterIncludeFlags(parser, cf);
 }
 
+void InitializeCommonFlags(CommonFlags *cf) {
+  // need to record coverage to generate coverage report.
+  cf->coverage |= cf->html_cov_report;
+  SetVerbosity(cf->verbosity);
+}
+
 }  // namespace __sanitizer
diff --git a/lib/sanitizer_common/sanitizer_flags.h b/lib/sanitizer_common/sanitizer_flags.h
index 33c3c45..503126b 100644
--- a/lib/sanitizer_common/sanitizer_flags.h
+++ b/lib/sanitizer_common/sanitizer_flags.h
@@ -46,10 +46,17 @@
   common_flags_dont_use.CopyFrom(cf);
 }
 
+void SubstituteForFlagValue(const char *s, char *out, uptr out_size);
+
 class FlagParser;
 void RegisterCommonFlags(FlagParser *parser,
                          CommonFlags *cf = &common_flags_dont_use);
 void RegisterIncludeFlags(FlagParser *parser, CommonFlags *cf);
+
+// Should be called after parsing all flags. Sets up common flag values
+// and perform initializations common to all sanitizers (e.g. setting
+// verbosity).
+void InitializeCommonFlags(CommonFlags *cf = &common_flags_dont_use);
 }  // namespace __sanitizer
 
 #endif  // SANITIZER_FLAGS_H
diff --git a/lib/sanitizer_common/sanitizer_flags.inc b/lib/sanitizer_common/sanitizer_flags.inc
index c892731..0022c31 100644
--- a/lib/sanitizer_common/sanitizer_flags.inc
+++ b/lib/sanitizer_common/sanitizer_flags.inc
@@ -144,6 +144,9 @@
 COMMON_FLAG(const char *, coverage_dir, ".",
             "Target directory for coverage dumps. Defaults to the current "
             "directory.")
+COMMON_FLAG(bool, coverage_pc_buffer, true,
+            "If set (and if 'coverage' is set too), the pcs would be collected "
+            "in a buffer.")
 COMMON_FLAG(bool, full_address_space, false,
             "Sanitize complete address space; "
             "by default kernel area on 32-bit platforms will not be sanitized")
@@ -162,6 +165,11 @@
 COMMON_FLAG(bool, symbolize_vs_style, false,
             "Print file locations in Visual Studio style (e.g: "
             " file(10,42): ...")
+COMMON_FLAG(int, dedup_token_length, 0,
+            "If positive, after printing a stack trace also print a short "
+            "string token based on this number of frames that will simplify "
+            "deduplication of the reports. "
+            "Example: 'DEDUP_TOKEN: foo-bar-main'. Default is 0.")
 COMMON_FLAG(const char *, stack_trace_format, "DEFAULT",
             "Format string used to render stack frames. "
             "See sanitizer_stacktrace_printer.h for the format description. "
@@ -179,21 +187,40 @@
 COMMON_FLAG(bool, intercept_strpbrk, true,
             "If set, uses custom wrappers for strpbrk function "
             "to find more errors.")
+COMMON_FLAG(bool, intercept_strlen, true,
+            "If set, uses custom wrappers for strlen and strnlen functions "
+            "to find more errors.")
+COMMON_FLAG(bool, intercept_strchr, true,
+            "If set, uses custom wrappers for strchr, strchrnul, and strrchr "
+            "functions to find more errors.")
 COMMON_FLAG(bool, intercept_memcmp, true,
             "If set, uses custom wrappers for memcmp function "
             "to find more errors.")
 COMMON_FLAG(bool, strict_memcmp, true,
           "If true, assume that memcmp(p1, p2, n) always reads n bytes before "
           "comparing p1 and p2.")
+COMMON_FLAG(bool, intercept_intrin, true,
+            "If set, uses custom wrappers for memset/memcpy/memmove "
+            "intrinsics to find more errors.")
+COMMON_FLAG(bool, intercept_stat, true,
+            "If set, uses custom wrappers for *stat functions "
+            "to find more errors.")
+COMMON_FLAG(bool, intercept_send, true,
+            "If set, uses custom wrappers for send* functions "
+            "to find more errors.")
 COMMON_FLAG(bool, decorate_proc_maps, false, "If set, decorate sanitizer "
                                              "mappings in /proc/self/maps with "
                                              "user-readable names")
 COMMON_FLAG(int, exitcode, 1, "Override the program exit status if the tool "
                               "found an error")
 COMMON_FLAG(
-    bool, abort_on_error, SANITIZER_MAC,
+    bool, abort_on_error, SANITIZER_ANDROID || SANITIZER_MAC,
     "If set, the tool calls abort() instead of _exit() after printing the "
     "error report.")
 COMMON_FLAG(bool, suppress_equal_pcs, true,
             "Deduplicate multiple reports for single source location in "
             "halt_on_error=false mode (asan only).")
+COMMON_FLAG(bool, print_cmdline, false, "Print command line on crash "
+            "(asan only).")
+COMMON_FLAG(bool, html_cov_report, false, "Generate html coverage report.")
+COMMON_FLAG(const char *, sancov_path, "sancov", "Sancov tool location.")
diff --git a/lib/sanitizer_common/sanitizer_interface_internal.h b/lib/sanitizer_common/sanitizer_interface_internal.h
index b11ae30..7f43c84 100644
--- a/lib/sanitizer_common/sanitizer_interface_internal.h
+++ b/lib/sanitizer_common/sanitizer_interface_internal.h
@@ -25,6 +25,10 @@
   // The special values are "stdout" and "stderr".
   SANITIZER_INTERFACE_ATTRIBUTE
   void __sanitizer_set_report_path(const char *path);
+  // Tell the tools to write their reports to the provided file descriptor
+  // (casted to void *).
+  SANITIZER_INTERFACE_ATTRIBUTE
+  void __sanitizer_set_report_fd(void *fd);
 
   typedef struct {
       int coverage_sandboxed;
diff --git a/lib/sanitizer_common/sanitizer_internal_defs.h b/lib/sanitizer_common/sanitizer_internal_defs.h
index e83eed0..720672d 100644
--- a/lib/sanitizer_common/sanitizer_internal_defs.h
+++ b/lib/sanitizer_common/sanitizer_internal_defs.h
@@ -89,6 +89,7 @@
 typedef int fd_t;
 typedef int error_t;
 #endif
+typedef int pid_t;
 
 // WARNING: OFF_T may be different from OS type off_t, depending on the value of
 // _FILE_OFFSET_BITS. This definition of OFF_T matches the ABI of system calls
@@ -105,12 +106,15 @@
 #if (SANITIZER_WORDSIZE == 64) || SANITIZER_MAC
 typedef uptr operator_new_size_type;
 #else
+# if defined(__s390__) && !defined(__s390x__)
+// Special case: 31-bit s390 has unsigned long as size_t.
+typedef unsigned long operator_new_size_type;
+# else
 typedef u32 operator_new_size_type;
+# endif
 #endif
-}  // namespace __sanitizer
 
 
-using namespace __sanitizer;  // NOLINT
 // ----------- ATTENTION -------------
 // This header should NOT include any other headers to avoid portability issues.
 
@@ -134,7 +138,7 @@
 # define THREADLOCAL   __declspec(thread)
 # define LIKELY(x) (x)
 # define UNLIKELY(x) (x)
-# define PREFETCH(x) /* _mm_prefetch(x, _MM_HINT_NTA) */
+# define PREFETCH(x) /* _mm_prefetch(x, _MM_HINT_NTA) */ (void)0
 #else  // _MSC_VER
 # define ALWAYS_INLINE inline __attribute__((always_inline))
 # define ALIAS(x) __attribute__((alias(x)))
@@ -178,7 +182,9 @@
 typedef ALIGNED(1) s64 us64;
 
 #if SANITIZER_WINDOWS
+}  // namespace __sanitizer
 typedef unsigned long DWORD;  // NOLINT
+namespace __sanitizer {
 typedef DWORD thread_return_t;
 # define THREAD_CALLING_CONV __stdcall
 #else  // _WIN32
@@ -188,14 +194,12 @@
 typedef thread_return_t (THREAD_CALLING_CONV *thread_callback_t)(void* arg);
 
 // NOTE: Functions below must be defined in each run-time.
-namespace __sanitizer {
 void NORETURN Die();
 
 // FIXME: No, this shouldn't be in the sanitizer interface.
 SANITIZER_INTERFACE_ATTRIBUTE
 void NORETURN CheckFailed(const char *file, int line, const char *cond,
                           u64 v1, u64 v2);
-}  // namespace __sanitizer
 
 // Check macro
 #define RAW_CHECK_MSG(expr, msg) do { \
@@ -287,6 +291,9 @@
 #if !defined(_MSC_VER) || defined(__clang__)
 # define GET_CALLER_PC() (uptr)__builtin_return_address(0)
 # define GET_CURRENT_FRAME() (uptr)__builtin_frame_address(0)
+inline void Trap() {
+  __builtin_trap();
+}
 #else
 extern "C" void* _ReturnAddress(void);
 # pragma intrinsic(_ReturnAddress)
@@ -295,6 +302,12 @@
 // FIXME: This macro is still used when printing error reports though it's not
 // clear if the BP value is needed in the ASan reports on Windows.
 # define GET_CURRENT_FRAME() (uptr)0xDEADBEEF
+
+extern "C" void __ud2(void);
+# pragma intrinsic(__ud2)
+inline void Trap() {
+  __ud2();
+}
 #endif
 
 #define HANDLE_EINTR(res, f)                                       \
@@ -313,4 +326,8 @@
     (void)enable_fp;                                               \
   } while (0)
 
+}  // namespace __sanitizer
+
+using namespace __sanitizer;  // NOLINT
+
 #endif  // SANITIZER_DEFS_H
diff --git a/lib/sanitizer_common/sanitizer_libc.cc b/lib/sanitizer_common/sanitizer_libc.cc
index cf31e68..28f55dd 100644
--- a/lib/sanitizer_common/sanitizer_libc.cc
+++ b/lib/sanitizer_common/sanitizer_libc.cc
@@ -74,7 +74,7 @@
 
 // Semi-fast bzero for 16-aligned data. Still far from peak performance.
 void internal_bzero_aligned16(void *s, uptr n) {
-  struct S16 { u64 a, b; } ALIGNED(16);
+  struct ALIGNED(16) S16 { u64 a, b; };
   CHECK_EQ((reinterpret_cast<uptr>(s) | n) & 15, 0);
   for (S16 *p = reinterpret_cast<S16*>(s), *end = p + n / 16; p < end; p++) {
     p->a = p->b = 0;
diff --git a/lib/sanitizer_common/sanitizer_libc.h b/lib/sanitizer_common/sanitizer_libc.h
index df28677..71b8917 100644
--- a/lib/sanitizer_common/sanitizer_libc.h
+++ b/lib/sanitizer_common/sanitizer_libc.h
@@ -70,6 +70,7 @@
 
 // OS
 void NORETURN internal__exit(int exitcode);
+unsigned int internal_sleep(unsigned int seconds);
 
 uptr internal_getpid();
 uptr internal_getppid();
diff --git a/lib/sanitizer_common/sanitizer_linux.cc b/lib/sanitizer_common/sanitizer_linux.cc
index 8c3ebff..b8890ea 100644
--- a/lib/sanitizer_common/sanitizer_linux.cc
+++ b/lib/sanitizer_common/sanitizer_linux.cc
@@ -60,7 +60,10 @@
 #include <unistd.h>
 
 #if SANITIZER_FREEBSD
+#include <sys/exec.h>
 #include <sys/sysctl.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
 #include <machine/atomic.h>
 extern "C" {
 // <sys/umtx.h> must be included after <errno.h> and <sys/types.h> on
@@ -96,6 +99,12 @@
 # define SANITIZER_LINUX_USES_64BIT_SYSCALLS 0
 #endif
 
+#if defined(__x86_64__)
+extern "C" {
+extern void internal_sigreturn();
+}
+#endif
+
 namespace __sanitizer {
 
 #if SANITIZER_LINUX && defined(__x86_64__)
@@ -107,6 +116,7 @@
 #endif
 
 // --------------- sanitizer_libc.h
+#if !SANITIZER_S390
 uptr internal_mmap(void *addr, uptr length, int prot, int flags, int fd,
                    OFF_T offset) {
 #if SANITIZER_FREEBSD || SANITIZER_LINUX_USES_64BIT_SYSCALLS
@@ -119,6 +129,7 @@
                           offset / 4096);
 #endif
 }
+#endif // !SANITIZER_S390
 
 uptr internal_munmap(void *addr, uptr length) {
   return internal_syscall(SYSCALL(munmap), (uptr)addr, length);
@@ -241,7 +252,15 @@
   return internal_syscall(SYSCALL(newfstatat), AT_FDCWD, (uptr)path,
                          (uptr)buf, AT_SYMLINK_NOFOLLOW);
 #elif SANITIZER_LINUX_USES_64BIT_SYSCALLS
+# if SANITIZER_MIPS64
+  // For mips64, lstat syscall fills buffer in the format of kernel_stat
+  struct kernel_stat kbuf;
+  int res = internal_syscall(SYSCALL(lstat), path, &kbuf);
+  kernel_stat_to_stat(&kbuf, (struct stat *)buf);
+  return res;
+# else
   return internal_syscall(SYSCALL(lstat), (uptr)path, (uptr)buf);
+# endif
 #else
   struct stat64 buf64;
   int res = internal_syscall(SYSCALL(lstat64), path, &buf64);
@@ -252,7 +271,15 @@
 
 uptr internal_fstat(fd_t fd, void *buf) {
 #if SANITIZER_FREEBSD || SANITIZER_LINUX_USES_64BIT_SYSCALLS
+# if SANITIZER_MIPS64
+  // For mips64, fstat syscall fills buffer in the format of kernel_stat
+  struct kernel_stat kbuf;
+  int res = internal_syscall(SYSCALL(fstat), fd, &kbuf);
+  kernel_stat_to_stat(&kbuf, (struct stat *)buf);
+  return res;
+# else
   return internal_syscall(SYSCALL(fstat), fd, (uptr)buf);
+# endif
 #else
   struct stat64 buf64;
   int res = internal_syscall(SYSCALL(fstat64), fd, &buf64);
@@ -315,6 +342,15 @@
   Die();  // Unreachable.
 }
 
+unsigned int internal_sleep(unsigned int seconds) {
+  struct timespec ts;
+  ts.tv_sec = seconds;
+  ts.tv_nsec = 0;
+  int res = internal_syscall(SYSCALL(nanosleep), &ts, &ts);
+  if (res) return ts.tv_sec;
+  return 0;
+}
+
 uptr internal_execve(const char *filename, char *const argv[],
                      char *const envp[]) {
   return internal_syscall(SYSCALL(execve), (uptr)filename, (uptr)argv,
@@ -395,11 +431,13 @@
 #endif
 }
 
+#if !SANITIZER_FREEBSD
 extern "C" {
   SANITIZER_WEAK_ATTRIBUTE extern void *__libc_stack_end;
 }
+#endif
 
-#if !SANITIZER_GO
+#if !SANITIZER_GO && !SANITIZER_FREEBSD
 static void ReadNullSepFileToArray(const char *path, char ***arr,
                                    int arr_size) {
   char *buff;
@@ -424,7 +462,8 @@
 }
 #endif
 
-static void GetArgsAndEnv(char*** argv, char*** envp) {
+static void GetArgsAndEnv(char ***argv, char ***envp) {
+#if !SANITIZER_FREEBSD
 #if !SANITIZER_GO
   if (&__libc_stack_end) {
 #endif
@@ -439,6 +478,25 @@
     ReadNullSepFileToArray("/proc/self/environ", envp, kMaxEnvp);
   }
 #endif
+#else
+  // On FreeBSD, retrieving the argument and environment arrays is done via the
+  // kern.ps_strings sysctl, which returns a pointer to a structure containing
+  // this information. See also <sys/exec.h>.
+  ps_strings *pss;
+  size_t sz = sizeof(pss);
+  if (sysctlbyname("kern.ps_strings", &pss, &sz, NULL, 0) == -1) {
+    Printf("sysctl kern.ps_strings failed\n");
+    Die();
+  }
+  *argv = pss->ps_argvstr;
+  *envp = pss->ps_envstr;
+#endif
+}
+
+char **GetArgv() {
+  char **argv, **envp;
+  GetArgsAndEnv(&argv, &envp);
+  return argv;
 }
 
 void ReExec() {
@@ -564,7 +622,8 @@
 
 #if SANITIZER_LINUX
 #define SA_RESTORER 0x04000000
-// Doesn't set sa_restorer, use with caution (see below).
+// Doesn't set sa_restorer if the caller did not set it, so use with caution
+// (see below).
 int internal_sigaction_norestorer(int signum, const void *act, void *oldact) {
   __sanitizer_kernel_sigaction_t k_act, k_oldact;
   internal_memset(&k_act, 0, sizeof(__sanitizer_kernel_sigaction_t));
@@ -608,6 +667,25 @@
   }
   return result;
 }
+
+// Invokes sigaction via a raw syscall with a restorer, but does not support
+// all platforms yet.
+// We disable for Go simply because we have not yet added to buildgo.sh.
+#if defined(__x86_64__) && !SANITIZER_GO
+int internal_sigaction_syscall(int signum, const void *act, void *oldact) {
+  if (act == nullptr)
+    return internal_sigaction_norestorer(signum, act, oldact);
+  __sanitizer_sigaction u_adjust;
+  internal_memcpy(&u_adjust, act, sizeof(u_adjust));
+#if !SANITIZER_ANDROID || !SANITIZER_MIPS32
+    if (u_adjust.sa_restorer == nullptr) {
+      u_adjust.sa_restorer = internal_sigreturn;
+    }
+#endif
+    return internal_sigaction_norestorer(signum, (const void *)&u_adjust,
+                                         oldact);
+}
+#endif // defined(__x86_64__) && !SANITIZER_GO
 #endif  // SANITIZER_LINUX
 
 uptr internal_sigprocmask(int how, __sanitizer_sigset_t *set,
@@ -627,6 +705,10 @@
   internal_memset(set, 0xff, sizeof(*set));
 }
 
+void internal_sigemptyset(__sanitizer_sigset_t *set) {
+  internal_memset(set, 0, sizeof(*set));
+}
+
 #if SANITIZER_LINUX
 void internal_sigdelset(__sanitizer_sigset_t *set, int signum) {
   signum -= 1;
@@ -637,6 +719,16 @@
   const uptr bit = signum % (sizeof(k_set->sig[0]) * 8);
   k_set->sig[idx] &= ~(1 << bit);
 }
+
+bool internal_sigismember(__sanitizer_sigset_t *set, int signum) {
+  signum -= 1;
+  CHECK_GE(signum, 0);
+  CHECK_LT(signum, sizeof(*set) * 8);
+  __sanitizer_kernel_sigset_t *k_set = (__sanitizer_kernel_sigset_t *)set;
+  const uptr idx = signum / (sizeof(k_set->sig[0]) * 8);
+  const uptr bit = signum % (sizeof(k_set->sig[0]) * 8);
+  return k_set->sig[idx] & (1 << bit);
+}
 #endif  // SANITIZER_LINUX
 
 // ThreadLister implementation.
@@ -708,7 +800,10 @@
 }
 
 uptr GetPageSize() {
-#if SANITIZER_LINUX && (defined(__x86_64__) || defined(__i386__))
+// Android post-M sysconf(_SC_PAGESIZE) crashes if called from .preinit_array.
+#if SANITIZER_ANDROID
+  return 4096;
+#elif SANITIZER_LINUX && (defined(__x86_64__) || defined(__i386__))
   return EXEC_PAGESIZE;
 #else
   return sysconf(_SC_PAGESIZE);  // EXEC_PAGESIZE may not be trustworthy.
@@ -915,8 +1010,18 @@
                        "bnez $2,1f;\n"
 
                        /* Call "fn(arg)". */
+#if SANITIZER_WORDSIZE == 32
+#ifdef __BIG_ENDIAN__
+                       "lw $25,4($29);\n"
+                       "lw $4,12($29);\n"
+#else
+                       "lw $25,0($29);\n"
+                       "lw $4,8($29);\n"
+#endif
+#else
                        "ld $25,0($29);\n"
                        "ld $4,8($29);\n"
+#endif
                        "jal $25;\n"
 
                        /* Call _exit($v0). */
@@ -1115,7 +1220,7 @@
 
 #endif
 
-bool IsDeadlySignal(int signum) {
+bool IsHandledDeadlySignal(int signum) {
   if (common_flags()->handle_abort && signum == SIGABRT)
     return true;
   if (common_flags()->handle_sigill && signum == SIGILL)
@@ -1151,6 +1256,58 @@
 void internal_join_thread(void *th) {}
 #endif
 
+#if defined(__aarch64__)
+// Android headers in the older NDK releases miss this definition.
+struct __sanitizer_esr_context {
+  struct _aarch64_ctx head;
+  uint64_t esr;
+};
+
+static bool Aarch64GetESR(ucontext_t *ucontext, u64 *esr) {
+  static const u32 kEsrMagic = 0x45535201;
+  u8 *aux = ucontext->uc_mcontext.__reserved;
+  while (true) {
+    _aarch64_ctx *ctx = (_aarch64_ctx *)aux;
+    if (ctx->size == 0) break;
+    if (ctx->magic == kEsrMagic) {
+      *esr = ((__sanitizer_esr_context *)ctx)->esr;
+      return true;
+    }
+    aux += ctx->size;
+  }
+  return false;
+}
+#endif
+
+SignalContext::WriteFlag SignalContext::GetWriteFlag(void *context) {
+  ucontext_t *ucontext = (ucontext_t *)context;
+#if defined(__x86_64__) || defined(__i386__)
+  static const uptr PF_WRITE = 1U << 1;
+#if SANITIZER_FREEBSD
+  uptr err = ucontext->uc_mcontext.mc_err;
+#else
+  uptr err = ucontext->uc_mcontext.gregs[REG_ERR];
+#endif
+  return err & PF_WRITE ? WRITE : READ;
+#elif defined(__arm__)
+  static const uptr FSR_WRITE = 1U << 11;
+  uptr fsr = ucontext->uc_mcontext.error_code;
+  // FSR bits 5:0 describe the abort type, and are never 0 (or so it seems).
+  // Zero FSR indicates an older kernel that does not pass this information to
+  // the userspace.
+  if (fsr == 0) return UNKNOWN;
+  return fsr & FSR_WRITE ? WRITE : READ;
+#elif defined(__aarch64__)
+  static const u64 ESR_ELx_WNR = 1U << 6;
+  u64 esr;
+  if (!Aarch64GetESR(ucontext, &esr)) return UNKNOWN;
+  return esr & ESR_ELx_WNR ? WRITE : READ;
+#else
+  (void)ucontext;
+  return UNKNOWN;  // FIXME: Implement.
+#endif
+}
+
 void GetPcSpBp(void *context, uptr *pc, uptr *sp, uptr *bp) {
 #if defined(__arm__)
   ucontext_t *ucontext = (ucontext_t*)context;
@@ -1218,15 +1375,20 @@
   *pc = ucontext->uc_mcontext.pc;
   *bp = ucontext->uc_mcontext.gregs[30];
   *sp = ucontext->uc_mcontext.gregs[29];
+#elif defined(__s390__)
+  ucontext_t *ucontext = (ucontext_t*)context;
+# if defined(__s390x__)
+  *pc = ucontext->uc_mcontext.psw.addr;
+# else
+  *pc = ucontext->uc_mcontext.psw.addr & 0x7fffffff;
+# endif
+  *bp = ucontext->uc_mcontext.gregs[11];
+  *sp = ucontext->uc_mcontext.gregs[15];
 #else
 # error "Unsupported arch"
 #endif
 }
 
-void DisableReexec() {
-  // No need to re-exec on Linux.
-}
-
 void MaybeReexec() {
   // No need to re-exec on Linux.
 }
diff --git a/lib/sanitizer_common/sanitizer_linux.h b/lib/sanitizer_common/sanitizer_linux.h
index 77bfbd1..526fa44 100644
--- a/lib/sanitizer_common/sanitizer_linux.h
+++ b/lib/sanitizer_common/sanitizer_linux.h
@@ -34,7 +34,6 @@
                           struct sigaltstack* oss);
 uptr internal_sigprocmask(int how, __sanitizer_sigset_t *set,
     __sanitizer_sigset_t *oldset);
-void internal_sigfillset(__sanitizer_sigset_t *set);
 
 // Linux-only syscalls.
 #if SANITIZER_LINUX
@@ -43,9 +42,13 @@
 // (like the process-wide error reporting SEGV handler) must use
 // internal_sigaction instead.
 int internal_sigaction_norestorer(int signum, const void *act, void *oldact);
+#if defined(__x86_64__) && !SANITIZER_GO
+// Uses a raw system call to avoid interceptors.
+int internal_sigaction_syscall(int signum, const void *act, void *oldact);
+#endif
 void internal_sigdelset(__sanitizer_sigset_t *set, int signum);
 #if defined(__x86_64__) || defined(__mips__) || defined(__aarch64__) \
-  || defined(__powerpc64__)
+  || defined(__powerpc64__) || defined(__s390__)
 uptr internal_clone(int (*fn)(void *), void *child_stack, int flags, void *arg,
                     int *parent_tidptr, void *newtls, int *child_tidptr);
 #endif
diff --git a/lib/sanitizer_common/sanitizer_linux_libcdep.cc b/lib/sanitizer_common/sanitizer_linux_libcdep.cc
index 0fb67f3..a37bdf1 100644
--- a/lib/sanitizer_common/sanitizer_linux_libcdep.cc
+++ b/lib/sanitizer_common/sanitizer_linux_libcdep.cc
@@ -187,7 +187,7 @@
 #endif  // !SANITIZER_FREEBSD && !SANITIZER_ANDROID && !SANITIZER_GO
 
 #if (defined(__x86_64__) || defined(__i386__) || defined(__mips__) \
-    || defined(__aarch64__) || defined(__powerpc64__)) \
+    || defined(__aarch64__) || defined(__powerpc64__) || defined(__s390__)) \
     && SANITIZER_LINUX && !SANITIZER_ANDROID
 // sizeof(struct pthread) from glibc.
 static atomic_uintptr_t kThreadDescriptorSize;
@@ -204,6 +204,11 @@
     char *end;
     int minor = internal_simple_strtoll(buf + 8, &end, 10);
     if (end != buf + 8 && (*end == '\0' || *end == '.')) {
+      int patch = 0;
+      if (*end == '.')
+        // strtoll will return 0 if no valid conversion could be performed
+        patch = internal_simple_strtoll(end + 1, nullptr, 10);
+
       /* sizeof(struct pthread) values from various glibc versions.  */
       if (SANITIZER_X32)
         val = 1728;  // Assume only one particular version for x32.
@@ -217,9 +222,9 @@
         val = FIRST_32_SECOND_64(1136, 1712);
       else if (minor == 10)
         val = FIRST_32_SECOND_64(1168, 1776);
-      else if (minor <= 12)
+      else if (minor == 11 || (minor == 12 && patch == 1))
         val = FIRST_32_SECOND_64(1168, 2288);
-      else if (minor == 13)
+      else if (minor <= 13)
         val = FIRST_32_SECOND_64(1168, 2304);
       else
         val = FIRST_32_SECOND_64(1216, 2304);
@@ -244,6 +249,9 @@
   val = 1776; // from glibc.ppc64le 2.20-8.fc21
   atomic_store(&kThreadDescriptorSize, val, memory_order_relaxed);
   return val;
+#elif defined(__s390__)
+  val = FIRST_32_SECOND_64(1152, 1776); // valid for glibc 2.22
+  atomic_store(&kThreadDescriptorSize, val, memory_order_relaxed);
 #endif
   return 0;
 }
@@ -291,7 +299,7 @@
                 rdhwr %0,$29;\
                 .set pop" : "=r" (thread_pointer));
   descr_addr = thread_pointer - kTlsTcbOffset - TlsPreTcbSize();
-# elif defined(__aarch64__)
+# elif defined(__aarch64__) || defined(__s390__)
   descr_addr = reinterpret_cast<uptr>(__builtin_thread_pointer());
 # elif defined(__powerpc64__)
   // PPC64LE uses TLS variant I. The thread pointer (in GPR 13)
@@ -332,7 +340,7 @@
 #if !SANITIZER_GO
 static void GetTls(uptr *addr, uptr *size) {
 #if SANITIZER_LINUX && !SANITIZER_ANDROID
-# if defined(__x86_64__) || defined(__i386__)
+# if defined(__x86_64__) || defined(__i386__) || defined(__s390__)
   *addr = ThreadSelf();
   *size = GetTlsSize();
   *addr -= *size;
@@ -412,17 +420,12 @@
 # endif
 
 struct DlIteratePhdrData {
-  LoadedModule *modules;
-  uptr current_n;
+  InternalMmapVector<LoadedModule> *modules;
   bool first;
-  uptr max_n;
-  string_predicate_t filter;
 };
 
 static int dl_iterate_phdr_cb(dl_phdr_info *info, size_t size, void *arg) {
   DlIteratePhdrData *data = (DlIteratePhdrData*)arg;
-  if (data->current_n == data->max_n)
-    return 0;
   InternalScopedString module_name(kMaxPathLength);
   if (data->first) {
     data->first = false;
@@ -433,20 +436,18 @@
   }
   if (module_name[0] == '\0')
     return 0;
-  if (data->filter && !data->filter(module_name.data()))
-    return 0;
-  LoadedModule *cur_module = &data->modules[data->current_n];
-  cur_module->set(module_name.data(), info->dlpi_addr);
-  data->current_n++;
+  LoadedModule cur_module;
+  cur_module.set(module_name.data(), info->dlpi_addr);
   for (int i = 0; i < info->dlpi_phnum; i++) {
     const Elf_Phdr *phdr = &info->dlpi_phdr[i];
     if (phdr->p_type == PT_LOAD) {
       uptr cur_beg = info->dlpi_addr + phdr->p_vaddr;
       uptr cur_end = cur_beg + phdr->p_memsz;
       bool executable = phdr->p_flags & PF_X;
-      cur_module->addAddressRange(cur_beg, cur_end, executable);
+      cur_module.addAddressRange(cur_beg, cur_end, executable);
     }
   }
+  data->modules->push_back(cur_module);
   return 0;
 }
 
@@ -455,8 +456,8 @@
     int (*)(struct dl_phdr_info *, size_t, void *), void *);
 #endif
 
-uptr GetListOfModules(LoadedModule *modules, uptr max_modules,
-                      string_predicate_t filter) {
+void ListOfModules::init() {
+  clear();
 #if SANITIZER_ANDROID && __ANDROID_API__ <= 22
   u32 api_level = AndroidGetApiLevel();
   // Fall back to /proc/maps if dl_iterate_phdr is unavailable or broken.
@@ -464,13 +465,12 @@
   // both K and L (and future) Android releases.
   if (api_level <= ANDROID_LOLLIPOP_MR1) { // L or earlier
     MemoryMappingLayout memory_mapping(false);
-    return memory_mapping.DumpListOfModules(modules, max_modules, filter);
+    memory_mapping.DumpListOfModules(&modules_);
+    return;
   }
 #endif
-  CHECK(modules);
-  DlIteratePhdrData data = {modules, 0, true, max_modules, filter};
+  DlIteratePhdrData data = {&modules_, true};
   dl_iterate_phdr(dl_iterate_phdr_cb, &data);
-  return data.current_n;
 }
 
 // getrusage does not give us the current RSS, only the max RSS.
@@ -524,13 +524,13 @@
   atomic_store(&android_log_initialized, 1, memory_order_release);
 }
 
-bool ShouldLogAfterPrintf() {
+static bool ShouldLogAfterPrintf() {
   return atomic_load(&android_log_initialized, memory_order_acquire);
 }
 #else
 void AndroidLogInit() {}
 
-bool ShouldLogAfterPrintf() { return true; }
+static bool ShouldLogAfterPrintf() { return true; }
 #endif  // SANITIZER_ANDROID
 
 void WriteOneLineToSyslog(const char *s) {
@@ -541,6 +541,11 @@
 #endif
 }
 
+void LogMessageOnPrintf(const char *str) {
+  if (common_flags()->log_to_syslog && ShouldLogAfterPrintf())
+    WriteToSyslog(str);
+}
+
 #endif // SANITIZER_LINUX
 
 } // namespace __sanitizer
diff --git a/lib/sanitizer_common/sanitizer_linux_s390.cc b/lib/sanitizer_common/sanitizer_linux_s390.cc
new file mode 100644
index 0000000..053fd17
--- /dev/null
+++ b/lib/sanitizer_common/sanitizer_linux_s390.cc
@@ -0,0 +1,191 @@
+//===-- sanitizer_linux_s390.cc -------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is shared between AddressSanitizer and ThreadSanitizer
+// run-time libraries and implements s390-linux-specific functions from
+// sanitizer_libc.h.
+//===----------------------------------------------------------------------===//
+
+#include "sanitizer_platform.h"
+
+#if SANITIZER_LINUX && SANITIZER_S390
+
+#include "sanitizer_libc.h"
+#include "sanitizer_linux.h"
+
+#include <errno.h>
+#include <sys/syscall.h>
+#include <sys/utsname.h>
+#include <unistd.h>
+
+namespace __sanitizer {
+
+// --------------- sanitizer_libc.h
+uptr internal_mmap(void *addr, uptr length, int prot, int flags, int fd,
+                   OFF_T offset) {
+  struct s390_mmap_params {
+    unsigned long addr;
+    unsigned long length;
+    unsigned long prot;
+    unsigned long flags;
+    unsigned long fd;
+    unsigned long offset;
+  } params = {
+    (unsigned long)addr,
+    (unsigned long)length,
+    (unsigned long)prot,
+    (unsigned long)flags,
+    (unsigned long)fd,
+# ifdef __s390x__
+    (unsigned long)offset,
+# else
+    (unsigned long)(offset / 4096),
+# endif
+  };
+# ifdef __s390x__
+  return syscall(__NR_mmap, &params);
+# else
+  return syscall(__NR_mmap2, &params);
+# endif
+}
+
+uptr internal_clone(int (*fn)(void *), void *child_stack, int flags, void *arg,
+                    int *parent_tidptr, void *newtls, int *child_tidptr) {
+  if (!fn || !child_stack)
+    return -EINVAL;
+  CHECK_EQ(0, (uptr)child_stack % 16);
+  // Minimum frame size.
+#ifdef __s390x__
+  child_stack = (char *)child_stack - 160;
+#else
+  child_stack = (char *)child_stack - 96;
+#endif
+  // Terminate unwind chain.
+  ((unsigned long *)child_stack)[0] = 0;
+  // And pass parameters.
+  ((unsigned long *)child_stack)[1] = (uptr)fn;
+  ((unsigned long *)child_stack)[2] = (uptr)arg;
+  register long res __asm__("r2");
+  register void *__cstack      __asm__("r2") = child_stack;
+  register int __flags         __asm__("r3") = flags;
+  register int * __ptidptr     __asm__("r4") = parent_tidptr;
+  register int * __ctidptr     __asm__("r5") = child_tidptr;
+  register void * __newtls     __asm__("r6") = newtls;
+
+  __asm__ __volatile__(
+                       /* Clone. */
+                       "svc    %1\n"
+
+                       /* if (%r2 != 0)
+                        *   return;
+                        */
+#ifdef __s390x__
+                       "cghi   %%r2, 0\n"
+#else
+                       "chi    %%r2, 0\n"
+#endif
+                       "jne    1f\n"
+
+                       /* Call "fn(arg)". */
+#ifdef __s390x__
+                       "lmg    %%r1, %%r2, 8(%%r15)\n"
+#else
+                       "lm     %%r1, %%r2, 4(%%r15)\n"
+#endif
+                       "basr   %%r14, %%r1\n"
+
+                       /* Call _exit(%r2). */
+                       "svc %2\n"
+
+                       /* Return to parent. */
+                     "1:\n"
+                       : "=r" (res)
+                       : "i"(__NR_clone), "i"(__NR_exit),
+                         "r"(__cstack),
+                         "r"(__flags),
+                         "r"(__ptidptr),
+                         "r"(__ctidptr),
+                         "r"(__newtls)
+                       : "memory", "cc");
+  return res;
+}
+
+#if SANITIZER_S390_64
+static bool FixedCVE_2016_2143() {
+  // Try to determine if the running kernel has a fix for CVE-2016-2143,
+  // return false if in doubt (better safe than sorry).  Distros may want to
+  // adjust this for their own kernels.
+  struct utsname buf;
+  unsigned int major, minor, patch = 0;
+  // This should never fail, but just in case...
+  if (uname(&buf))
+    return false;
+  char *ptr = buf.release;
+  major = internal_simple_strtoll(ptr, &ptr, 10);
+  // At least first 2 should be matched.
+  if (ptr[0] != '.')
+    return false;
+  minor = internal_simple_strtoll(ptr+1, &ptr, 10);
+  // Third is optional.
+  if (ptr[0] == '.')
+    patch = internal_simple_strtoll(ptr+1, &ptr, 10);
+  if (major < 3) {
+    // <3.0 is bad.
+    return false;
+  } else if (major == 3) {
+    // 3.2.79+ is OK.
+    if (minor == 2 && patch >= 79)
+      return true;
+    // 3.12.58+ is OK.
+    if (minor == 12 && patch >= 58)
+      return true;
+    // Otherwise, bad.
+    return false;
+  } else if (major == 4) {
+    // 4.1.21+ is OK.
+    if (minor == 1 && patch >= 21)
+      return true;
+    // 4.4.6+ is OK.
+    if (minor == 4 && patch >= 6)
+      return true;
+    // Otherwise, OK if 4.5+.
+    return minor >= 5;
+  } else {
+    // Linux 5 and up are fine.
+    return true;
+  }
+}
+
+void AvoidCVE_2016_2143() {
+  // Older kernels are affected by CVE-2016-2143 - they will crash hard
+  // if someone uses 4-level page tables (ie. virtual addresses >= 4TB)
+  // and fork() in the same process.  Unfortunately, sanitizers tend to
+  // require such addresses.  Since this is very likely to crash the whole
+  // machine (sanitizers themselves use fork() for llvm-symbolizer, for one),
+  // abort the process at initialization instead.
+  if (FixedCVE_2016_2143())
+    return;
+  if (GetEnv("SANITIZER_IGNORE_CVE_2016_2143"))
+    return;
+  Report(
+    "ERROR: Your kernel seems to be vulnerable to CVE-2016-2143.  Using ASan,\n"
+    "MSan, TSan, DFSan or LSan with such kernel can and will crash your\n"
+    "machine, or worse.\n"
+    "\n"
+    "If you are certain your kernel is not vulnerable (you have compiled it\n"
+    "yourself, or are using an unrecognized distribution kernel), you can\n"
+    "override this safety check by exporting SANITIZER_IGNORE_CVE_2016_2143\n"
+    "with any value.\n");
+  Die();
+}
+#endif
+
+} // namespace __sanitizer
+
+#endif // SANITIZER_LINUX && SANITIZER_S390
diff --git a/lib/sanitizer_common/sanitizer_linux_x86_64.S b/lib/sanitizer_common/sanitizer_linux_x86_64.S
new file mode 100644
index 0000000..8ff9095
--- /dev/null
+++ b/lib/sanitizer_common/sanitizer_linux_x86_64.S
@@ -0,0 +1,25 @@
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+
+// Avoid being marked as needing an executable stack:
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+// Further contents are x86_64-only:
+#if defined(__linux__) && defined(__x86_64__)
+
+#include "../builtins/assembly.h"
+
+// If the "naked" function attribute were supported for x86 we could
+// do this via inline asm.
+.text
+.balign 4
+DEFINE_COMPILERRT_FUNCTION(internal_sigreturn)
+        mov           $0xf,             %eax    // 0xf == SYS_rt_sigreturn
+        mov           %rcx,             %r10
+        syscall
+        ret                                     // Won't normally reach here.
+END_COMPILERRT_FUNCTION(internal_sigreturn)
+
+#endif // defined(__linux__) && defined(__x86_64__)
diff --git a/lib/sanitizer_common/sanitizer_list.h b/lib/sanitizer_common/sanitizer_list.h
index adbb97d..c78cb4c 100644
--- a/lib/sanitizer_common/sanitizer_list.h
+++ b/lib/sanitizer_common/sanitizer_list.h
@@ -71,7 +71,9 @@
   }
 
   Item *front() { return first_; }
+  const Item *front() const { return first_; }
   Item *back() { return last_; }
+  const Item *back() const { return last_; }
 
   void append_front(IntrusiveList<Item> *l) {
     CHECK_NE(this, l);
@@ -116,24 +118,32 @@
     }
   }
 
-  template<class ListTy, class ItemTy>
+  template<class ItemTy>
   class IteratorBase {
    public:
-    explicit IteratorBase(ListTy *list)
-        : list_(list), current_(list->first_) { }
-    ItemTy *next() {
-      ItemTy *ret = current_;
-      if (current_) current_ = current_->next;
-      return ret;
+    explicit IteratorBase(ItemTy *current) : current_(current) {}
+    IteratorBase &operator++() {
+      current_ = current_->next;
+      return *this;
     }
-    bool hasNext() const { return current_ != nullptr; }
+    bool operator!=(IteratorBase other) const {
+      return current_ != other.current_;
+    }
+    ItemTy &operator*() {
+      return *current_;
+    }
    private:
-    ListTy *list_;
     ItemTy *current_;
   };
 
-  typedef IteratorBase<IntrusiveList<Item>, Item> Iterator;
-  typedef IteratorBase<const IntrusiveList<Item>, const Item> ConstIterator;
+  typedef IteratorBase<Item> Iterator;
+  typedef IteratorBase<const Item> ConstIterator;
+
+  Iterator begin() { return Iterator(first_); }
+  Iterator end() { return Iterator(0); }
+
+  ConstIterator begin() const { return ConstIterator(first_); }
+  ConstIterator end() const { return ConstIterator(0); }
 
 // private, don't use directly.
   uptr size_;
diff --git a/lib/sanitizer_common/sanitizer_mac.cc b/lib/sanitizer_common/sanitizer_mac.cc
index 1c96a6b..69178c6 100644
--- a/lib/sanitizer_common/sanitizer_mac.cc
+++ b/lib/sanitizer_common/sanitizer_mac.cc
@@ -68,22 +68,34 @@
 #include <sys/stat.h>
 #include <sys/sysctl.h>
 #include <sys/types.h>
+#include <sys/wait.h>
 #include <unistd.h>
 #include <util.h>
 
+// from <crt_externs.h>, but we don't have that file on iOS
+extern "C" {
+  extern char ***_NSGetArgv(void);
+  extern char ***_NSGetEnviron(void);
+}
+
 namespace __sanitizer {
 
 #include "sanitizer_syscall_generic.inc"
 
+// Direct syscalls, don't call libmalloc hooks.
+extern "C" void *__mmap(void *addr, size_t len, int prot, int flags, int fildes,
+                        off_t off);
+extern "C" int __munmap(void *, size_t);
+
 // ---------------------- sanitizer_libc.h
 uptr internal_mmap(void *addr, size_t length, int prot, int flags,
                    int fd, u64 offset) {
   if (fd == -1) fd = VM_MAKE_TAG(VM_MEMORY_ANALYSIS_TOOL);
-  return (uptr)mmap(addr, length, prot, flags, fd, offset);
+  return (uptr)__mmap(addr, length, prot, flags, fd, offset);
 }
 
 uptr internal_munmap(void *addr, uptr length) {
-  return munmap(addr, length);
+  return __munmap(addr, length);
 }
 
 int internal_mprotect(void *addr, uptr length, int prot) {
@@ -149,6 +161,10 @@
   _exit(exitcode);
 }
 
+unsigned int internal_sleep(unsigned int seconds) {
+  return sleep(seconds);
+}
+
 uptr internal_getpid() {
   return getpid();
 }
@@ -183,7 +199,11 @@
   }
   if (pid == 0) {
     close(master);
-    CHECK_EQ(login_tty(slave), 0);
+    if (login_tty(slave) != 0) {
+      // We already forked, there's not much we can do.  Let's quit.
+      Report("login_tty failed (errno %d)\n", errno);
+      internal__exit(1);
+    }
   } else {
     *amaster = master;
     close(slave);
@@ -199,6 +219,15 @@
   return ftruncate(fd, size);
 }
 
+uptr internal_execve(const char *filename, char *const argv[],
+                     char *const envp[]) {
+  return execve(filename, argv, envp);
+}
+
+uptr internal_waitpid(int pid, int *status, int options) {
+  return waitpid(pid, status, options);
+}
+
 // ----------------- sanitizer_common.h
 bool FileExists(const char *filename) {
   struct stat st;
@@ -209,7 +238,10 @@
 }
 
 uptr GetTid() {
-  return reinterpret_cast<uptr>(pthread_self());
+  // FIXME: This can potentially get truncated on 32-bit, where uptr is 4 bytes.
+  uint64_t tid;
+  pthread_threadid_np(nullptr, &tid);
+  return tid;
 }
 
 void GetThreadStackTopAndBottom(bool at_initialization, uptr *stack_top,
@@ -345,13 +377,16 @@
 #endif
 }
 
-uptr GetListOfModules(LoadedModule *modules, uptr max_modules,
-                      string_predicate_t filter) {
+void ListOfModules::init() {
+  clear();
   MemoryMappingLayout memory_mapping(false);
-  return memory_mapping.DumpListOfModules(modules, max_modules, filter);
+  memory_mapping.DumpListOfModules(&modules_);
 }
 
-bool IsDeadlySignal(int signum) {
+bool IsHandledDeadlySignal(int signum) {
+  if ((SANITIZER_WATCHOS || SANITIZER_TVOS) && !(SANITIZER_IOSSIM))
+    // Handling fatal signals on watchOS and tvOS devices is disallowed.
+    return false;
   return (signum == SIGSEGV || signum == SIGBUS) && common_flags()->handle_segv;
 }
 
@@ -423,14 +458,25 @@
 
 void internal_join_thread(void *th) { pthread_join((pthread_t)th, 0); }
 
+#ifndef SANITIZER_GO
 static BlockingMutex syslog_lock(LINKER_INITIALIZED);
+#endif
 
 void WriteOneLineToSyslog(const char *s) {
+#ifndef SANITIZER_GO
   syslog_lock.CheckLocked();
   asl_log(nullptr, nullptr, ASL_LEVEL_ERR, "%s", s);
+#endif
+}
+
+void LogMessageOnPrintf(const char *str) {
+  // Log all printf output to CrashLog.
+  if (common_flags()->abort_on_error)
+    CRAppendCrashLogMessage(str);
 }
 
 void LogFullErrorReport(const char *buffer) {
+#ifndef SANITIZER_GO
   // Log with os_trace. This will make it into the crash log.
 #if SANITIZER_OS_TRACE
   if (GetMacosVersion() >= MACOS_VERSION_YOSEMITE) {
@@ -463,9 +509,17 @@
   if (common_flags()->log_to_syslog)
     WriteToSyslog(buffer);
 
-  // Log to CrashLog.
-  if (common_flags()->abort_on_error)
-    CRSetCrashLogMessage(buffer);
+  // The report is added to CrashLog as part of logging all of Printf output.
+#endif
+}
+
+SignalContext::WriteFlag SignalContext::GetWriteFlag(void *context) {
+#if defined(__x86_64__) || defined(__i386__)
+  ucontext_t *ucontext = static_cast<ucontext_t*>(context);
+  return ucontext->uc_mcontext->__es.__err & 2 /*T_PF_WRITE*/ ? WRITE : READ;
+#else
+  return UNKNOWN;
+#endif
 }
 
 void GetPcSpBp(void *context, uptr *pc, uptr *sp, uptr *bp) {
@@ -495,6 +549,7 @@
 # endif
 }
 
+#ifndef SANITIZER_GO
 static const char kDyldInsertLibraries[] = "DYLD_INSERT_LIBRARIES";
 LowLevelAllocator allocator_for_env;
 
@@ -527,16 +582,19 @@
   }
 }
 
-static bool reexec_disabled = false;
-
-void DisableReexec() {
-  reexec_disabled = true;
+SANITIZER_WEAK_CXX_DEFAULT_IMPL
+bool ReexecDisabled() {
+  return false;
 }
 
-extern "C" double dyldVersionNumber;
+extern "C" SANITIZER_WEAK_ATTRIBUTE double dyldVersionNumber;
 static const double kMinDyldVersionWithAutoInterposition = 360.0;
 
 bool DyldNeedsEnvVariable() {
+  // Although sanitizer support was added to LLVM on OS X 10.7+, GCC users
+  // still may want use them on older systems. On older Darwin platforms, dyld
+  // doesn't export dyldVersionNumber symbol and we simply return true.
+  if (!&dyldVersionNumber) return true;
   // If running on OS X 10.11+ or iOS 9.0+, dyld will interpose even if
   // DYLD_INSERT_LIBRARIES is not set. However, checking OS version via
   // GetMacosVersion() doesn't work for the simulator. Let's instead check
@@ -546,13 +604,13 @@
 }
 
 void MaybeReexec() {
-  if (reexec_disabled) return;
+  if (ReexecDisabled()) return;
 
   // Make sure the dynamic runtime library is preloaded so that the
   // wrappers work. If it is not, set DYLD_INSERT_LIBRARIES and re-exec
   // ourselves.
   Dl_info info;
-  CHECK(dladdr((void*)((uptr)&__sanitizer_report_error_summary), &info));
+  RAW_CHECK(dladdr((void*)((uptr)&__sanitizer_report_error_summary), &info));
   char *dyld_insert_libraries =
       const_cast<char*>(GetEnv(kDyldInsertLibraries));
   uptr old_env_len = dyld_insert_libraries ?
@@ -597,7 +655,22 @@
            "environment variable and re-execute itself, but execv() failed, "
            "possibly because of sandbox restrictions. Make sure to launch the "
            "executable with:\n%s=%s\n", kDyldInsertLibraries, new_env);
-    CHECK("execv failed" && 0);
+    RAW_CHECK("execv failed" && 0);
+  }
+
+  // Verify that interceptors really work.  We'll use dlsym to locate
+  // "pthread_create", if interceptors are working, it should really point to
+  // "wrap_pthread_create" within our own dylib.
+  Dl_info info_pthread_create;
+  void *dlopen_addr = dlsym(RTLD_DEFAULT, "pthread_create");
+  RAW_CHECK(dladdr(dlopen_addr, &info_pthread_create));
+  if (internal_strcmp(info.dli_fname, info_pthread_create.dli_fname) != 0) {
+    Report(
+        "ERROR: Interceptors are not working. This may be because %s is "
+        "loaded too late (e.g. via dlopen). Please launch the executable "
+        "with:\n%s=%s\n",
+        SanitizerToolName, kDyldInsertLibraries, info.dli_fname);
+    RAW_CHECK("interceptors not installed" && 0);
   }
 
   if (!lib_is_in_env)
@@ -612,7 +685,7 @@
   // sign and the '\0' char.
   char *new_env = (char*)allocator_for_env.Allocate(
       old_env_len + 2 + env_name_len);
-  CHECK(new_env);
+  RAW_CHECK(new_env);
   internal_memset(new_env, '\0', old_env_len + 2 + env_name_len);
   internal_strncpy(new_env, kDyldInsertLibraries, env_name_len);
   new_env[env_name_len] = '=';
@@ -661,6 +734,11 @@
   if (new_env_pos == new_env + env_name_len + 1) new_env = NULL;
   LeakyResetEnv(kDyldInsertLibraries, new_env);
 }
+#endif  // SANITIZER_GO
+
+char **GetArgv() {
+  return *_NSGetArgv();
+}
 
 }  // namespace __sanitizer
 
diff --git a/lib/sanitizer_common/sanitizer_mac.h b/lib/sanitizer_common/sanitizer_mac.h
index 86a9956..6e2b84f 100644
--- a/lib/sanitizer_common/sanitizer_mac.h
+++ b/lib/sanitizer_common/sanitizer_mac.h
@@ -44,9 +44,11 @@
   &__crashreporter_info_buff__[0];
 asm(".desc ___crashreporter_info__, 0x10");
 } // extern "C"
+static BlockingMutex crashreporter_info_mutex(LINKER_INITIALIZED);
 
-INLINE void CRSetCrashLogMessage(const char *msg) {
-  internal_strlcpy(__crashreporter_info_buff__, msg,
+INLINE void CRAppendCrashLogMessage(const char *msg) {
+  BlockingMutexLock l(&crashreporter_info_mutex);
+  internal_strlcat(__crashreporter_info_buff__, msg,
                    sizeof(__crashreporter_info_buff__)); }
 
 #endif  // SANITIZER_MAC
diff --git a/lib/sanitizer_common/sanitizer_platform.h b/lib/sanitizer_common/sanitizer_platform.h
index c31e631..8824912 100644
--- a/lib/sanitizer_common/sanitizer_platform.h
+++ b/lib/sanitizer_common/sanitizer_platform.h
@@ -49,12 +49,30 @@
 # define SANITIZER_IOSSIM  0
 #endif
 
+#if defined(__APPLE__) && TARGET_OS_IPHONE && TARGET_OS_WATCH
+# define SANITIZER_WATCHOS 1
+#else
+# define SANITIZER_WATCHOS 0
+#endif
+
+#if defined(__APPLE__) && TARGET_OS_IPHONE && TARGET_OS_TV
+# define SANITIZER_TVOS 1
+#else
+# define SANITIZER_TVOS 0
+#endif
+
 #if defined(_WIN32)
 # define SANITIZER_WINDOWS 1
 #else
 # define SANITIZER_WINDOWS 0
 #endif
 
+#if defined(_WIN64)
+# define SANITIZER_WINDOWS64 1
+#else
+# define SANITIZER_WINDOWS64 0
+#endif
+
 #if defined(__ANDROID__)
 # define SANITIZER_ANDROID 1
 #else
@@ -96,6 +114,54 @@
 # define SANITIZER_MIPS64 0
 #endif
 
+#if defined(__s390__)
+# define SANITIZER_S390 1
+# if defined(__s390x__)
+#  define SANITIZER_S390_31 0
+#  define SANITIZER_S390_64 1
+# else
+#  define SANITIZER_S390_31 1
+#  define SANITIZER_S390_64 0
+# endif
+#else
+# define SANITIZER_S390 0
+# define SANITIZER_S390_31 0
+# define SANITIZER_S390_64 0
+#endif
+
+#if defined(__powerpc__)
+# define SANITIZER_PPC 1
+# if defined(__powerpc64__)
+#  define SANITIZER_PPC32 0
+#  define SANITIZER_PPC64 1
+// 64-bit PPC has two ABIs (v1 and v2).  The old powerpc64 target is
+// big-endian, and uses v1 ABI (known for its function descriptors),
+// while the new powerpc64le target is little-endian and uses v2.
+// In theory, you could convince gcc to compile for their evil twins
+// (eg. big-endian v2), but you won't find such combinations in the wild
+// (it'd require bootstrapping a whole system, which would be quite painful
+// - there's no target triple for that).  LLVM doesn't support them either.
+#  if _CALL_ELF == 2
+#   define SANITIZER_PPC64V1 0
+#   define SANITIZER_PPC64V2 1
+#  else
+#   define SANITIZER_PPC64V1 1
+#   define SANITIZER_PPC64V2 0
+#  endif
+# else
+#  define SANITIZER_PPC32 1
+#  define SANITIZER_PPC64 0
+#  define SANITIZER_PPC64V1 0
+#  define SANITIZER_PPC64V2 0
+# endif
+#else
+# define SANITIZER_PPC 0
+# define SANITIZER_PPC32 0
+# define SANITIZER_PPC64 0
+# define SANITIZER_PPC64V1 0
+# define SANITIZER_PPC64V2 0
+#endif
+
 // By default we allow to use SizeClassAllocator64 on 64-bit platform.
 // But in some cases (e.g. AArch64's 39-bit address space) SizeClassAllocator64
 // does not work well and we need to fallback to SizeClassAllocator32.
@@ -165,4 +231,18 @@
 # define MSC_PREREQ(version) 0
 #endif
 
+#if defined(__arm64__) && SANITIZER_IOS
+# define SANITIZER_NON_UNIQUE_TYPEINFO 1
+#else
+# define SANITIZER_NON_UNIQUE_TYPEINFO 0
+#endif
+
+// On linux, some architectures had an ABI transition from 64-bit long double
+// (ie. same as double) to 128-bit long double.  On those, glibc symbols
+// involving long doubles come in two versions, and we need to pass the
+// correct one to dlvsym when intercepting them.
+#if SANITIZER_LINUX && (SANITIZER_S390 || SANITIZER_PPC32 || SANITIZER_PPC64V1)
+#define SANITIZER_NLDBL_VERSION "GLIBC_2.4"
+#endif
+
 #endif // SANITIZER_PLATFORM_H
diff --git a/lib/sanitizer_common/sanitizer_platform_interceptors.h b/lib/sanitizer_common/sanitizer_platform_interceptors.h
index 430ad48..27233ee 100644
--- a/lib/sanitizer_common/sanitizer_platform_interceptors.h
+++ b/lib/sanitizer_common/sanitizer_platform_interceptors.h
@@ -29,6 +29,12 @@
 # define SI_LINUX_NOT_ANDROID 0
 #endif
 
+#if SANITIZER_ANDROID
+# define SI_ANDROID 1
+#else
+# define SI_ANDROID 0
+#endif
+
 #if SANITIZER_FREEBSD
 # define SI_FREEBSD 1
 #else
@@ -43,8 +49,10 @@
 
 #if SANITIZER_MAC
 # define SI_MAC 1
+# define SI_NOT_MAC 0
 #else
 # define SI_MAC 0
+# define SI_NOT_MAC 1
 #endif
 
 #if SANITIZER_IOS
@@ -53,15 +61,37 @@
 # define SI_IOS 0
 #endif
 
+#if !SANITIZER_WINDOWS && !SANITIZER_MAC
+# define SI_UNIX_NOT_MAC 1
+#else
+# define SI_UNIX_NOT_MAC 0
+#endif
+
+#define SANITIZER_INTERCEPT_STRLEN 1
+#define SANITIZER_INTERCEPT_STRNLEN SI_NOT_MAC
 #define SANITIZER_INTERCEPT_STRCMP 1
 #define SANITIZER_INTERCEPT_STRSTR 1
 #define SANITIZER_INTERCEPT_STRCASESTR SI_NOT_WINDOWS
+#define SANITIZER_INTERCEPT_STRCHR 1
+#define SANITIZER_INTERCEPT_STRCHRNUL SI_UNIX_NOT_MAC
+#define SANITIZER_INTERCEPT_STRRCHR 1
 #define SANITIZER_INTERCEPT_STRSPN 1
 #define SANITIZER_INTERCEPT_STRPBRK 1
 #define SANITIZER_INTERCEPT_TEXTDOMAIN SI_LINUX_NOT_ANDROID
 #define SANITIZER_INTERCEPT_STRCASECMP SI_NOT_WINDOWS
+#define SANITIZER_INTERCEPT_MEMSET 1
+#define SANITIZER_INTERCEPT_MEMMOVE 1
+#define SANITIZER_INTERCEPT_MEMCPY 1
 #define SANITIZER_INTERCEPT_MEMCMP 1
+// TODO(wwchrome): Re-enable intercepting memchr() when ready.
+// The function memchr() contains a jump in the first 6 bytes
+// that is problematic to intercept correctly on Win64.
+// Disable memchr() interception for Win64 temporarily.
+#if SANITIZER_WINDOWS64
+#define SANITIZER_INTERCEPT_MEMCHR 0
+#else
 #define SANITIZER_INTERCEPT_MEMCHR 1
+#endif
 #define SANITIZER_INTERCEPT_MEMRCHR SI_FREEBSD || SI_LINUX
 
 #define SANITIZER_INTERCEPT_READ   SI_NOT_WINDOWS
@@ -125,15 +155,21 @@
 #define SANITIZER_INTERCEPT_ACCEPT4 SI_LINUX_NOT_ANDROID
 #define SANITIZER_INTERCEPT_MODF SI_NOT_WINDOWS
 #define SANITIZER_INTERCEPT_RECVMSG SI_NOT_WINDOWS
+#define SANITIZER_INTERCEPT_SENDMSG SI_NOT_WINDOWS
 #define SANITIZER_INTERCEPT_GETPEERNAME SI_NOT_WINDOWS
 #define SANITIZER_INTERCEPT_IOCTL SI_NOT_WINDOWS
 #define SANITIZER_INTERCEPT_INET_ATON SI_NOT_WINDOWS
 #define SANITIZER_INTERCEPT_SYSINFO SI_LINUX
 #define SANITIZER_INTERCEPT_READDIR SI_NOT_WINDOWS
 #define SANITIZER_INTERCEPT_READDIR64 SI_LINUX_NOT_ANDROID
-#define SANITIZER_INTERCEPT_PTRACE SI_LINUX_NOT_ANDROID && \
+#if SI_LINUX_NOT_ANDROID && \
   (defined(__i386) || defined(__x86_64) || defined(__mips64) || \
-    defined(__powerpc64__) || defined(__aarch64__) || defined(__arm__))
+    defined(__powerpc64__) || defined(__aarch64__) || defined(__arm__) || \
+    defined(__s390__))
+#define SANITIZER_INTERCEPT_PTRACE 1
+#else
+#define SANITIZER_INTERCEPT_PTRACE 0
+#endif
 #define SANITIZER_INTERCEPT_SETLOCALE SI_NOT_WINDOWS
 #define SANITIZER_INTERCEPT_GETCWD SI_NOT_WINDOWS
 #define SANITIZER_INTERCEPT_GET_CURRENT_DIR_NAME SI_LINUX_NOT_ANDROID
@@ -237,7 +273,11 @@
 #define SANITIZER_INTERCEPT_IF_INDEXTONAME \
   SI_FREEBSD || SI_LINUX_NOT_ANDROID || SI_MAC
 #define SANITIZER_INTERCEPT_CAPGET SI_LINUX_NOT_ANDROID
-#define SANITIZER_INTERCEPT_AEABI_MEM SI_LINUX && defined(__arm__)
+#if SI_LINUX && defined(__arm__)
+#define SANITIZER_INTERCEPT_AEABI_MEM 1
+#else
+#define SANITIZER_INTERCEPT_AEABI_MEM 0
+#endif
 #define SANITIZER_INTERCEPT___BZERO SI_MAC
 #define SANITIZER_INTERCEPT_FTIME !SI_FREEBSD && SI_NOT_WINDOWS
 #define SANITIZER_INTERCEPT_XDR SI_LINUX_NOT_ANDROID
@@ -249,8 +289,12 @@
 #define SANITIZER_INTERCEPT_OBSTACK SI_LINUX_NOT_ANDROID
 #define SANITIZER_INTERCEPT_FFLUSH SI_NOT_WINDOWS
 #define SANITIZER_INTERCEPT_FCLOSE SI_NOT_WINDOWS
+
+#ifndef SANITIZER_INTERCEPT_DLOPEN_DLCLOSE
 #define SANITIZER_INTERCEPT_DLOPEN_DLCLOSE \
     SI_FREEBSD || SI_LINUX_NOT_ANDROID || SI_MAC
+#endif
+
 #define SANITIZER_INTERCEPT_GETPASS SI_LINUX_NOT_ANDROID || SI_MAC
 #define SANITIZER_INTERCEPT_TIMERFD SI_LINUX_NOT_ANDROID
 
@@ -264,5 +308,13 @@
 #define SANITIZER_INTERCEPT_CTERMID_R SI_MAC || SI_FREEBSD
 
 #define SANITIZER_INTERCEPTOR_HOOKS SI_LINUX
+#define SANITIZER_INTERCEPT_RECV_RECVFROM SI_NOT_WINDOWS
+#define SANITIZER_INTERCEPT_SEND_SENDTO SI_NOT_WINDOWS
+#define SANITIZER_INTERCEPT_EVENTFD_READ_WRITE SI_LINUX
 
+#define SANITIZER_INTERCEPT_STAT (SI_FREEBSD || SI_MAC || SI_ANDROID)
+#define SANITIZER_INTERCEPT___XSTAT !SANITIZER_INTERCEPT_STAT && SI_NOT_WINDOWS
+#define SANITIZER_INTERCEPT___XSTAT64 SI_LINUX_NOT_ANDROID
+#define SANITIZER_INTERCEPT___LXSTAT SANITIZER_INTERCEPT___XSTAT
+#define SANITIZER_INTERCEPT___LXSTAT64 SI_LINUX_NOT_ANDROID
 #endif  // #ifndef SANITIZER_PLATFORM_INTERCEPTORS_H
diff --git a/lib/sanitizer_common/sanitizer_platform_limits_linux.cc b/lib/sanitizer_common/sanitizer_platform_limits_linux.cc
index 92353e4..ed16f63 100644
--- a/lib/sanitizer_common/sanitizer_platform_limits_linux.cc
+++ b/lib/sanitizer_common/sanitizer_platform_limits_linux.cc
@@ -28,7 +28,7 @@
 // With old kernels (and even new kernels on powerpc) asm/stat.h uses types that
 // are not defined anywhere in userspace headers. Fake them. This seems to work
 // fine with newer headers, too.
-#include <asm/posix_types.h>
+#include <linux/posix_types.h>
 #if defined(__x86_64__) ||  defined(__mips__)
 #include <sys/stat.h>
 #else
@@ -62,7 +62,7 @@
 }  // namespace __sanitizer
 
 #if !defined(__powerpc64__) && !defined(__x86_64__) && !defined(__aarch64__)\
-                            && !defined(__mips__)
+                            && !defined(__mips__) && !defined(__s390__)
 COMPILER_CHECK(struct___old_kernel_stat_sz == sizeof(struct __old_kernel_stat));
 #endif
 
diff --git a/lib/sanitizer_common/sanitizer_platform_limits_posix.cc b/lib/sanitizer_common/sanitizer_platform_limits_posix.cc
index 0e07737..137cd9a 100644
--- a/lib/sanitizer_common/sanitizer_platform_limits_posix.cc
+++ b/lib/sanitizer_common/sanitizer_platform_limits_posix.cc
@@ -311,23 +311,28 @@
 
 #if SANITIZER_LINUX && !SANITIZER_ANDROID && \
     (defined(__i386) || defined(__x86_64) || defined(__mips64) || \
-      defined(__powerpc64__) || defined(__aarch64__) || defined(__arm__))
+      defined(__powerpc64__) || defined(__aarch64__) || defined(__arm__) || \
+      defined(__s390__))
 #if defined(__mips64) || defined(__powerpc64__) || defined(__arm__)
   unsigned struct_user_regs_struct_sz = sizeof(struct pt_regs);
   unsigned struct_user_fpregs_struct_sz = sizeof(elf_fpregset_t);
 #elif defined(__aarch64__)
   unsigned struct_user_regs_struct_sz = sizeof(struct user_pt_regs);
   unsigned struct_user_fpregs_struct_sz = sizeof(struct user_fpsimd_state);
+#elif defined(__s390__)
+  unsigned struct_user_regs_struct_sz = sizeof(struct _user_regs_struct);
+  unsigned struct_user_fpregs_struct_sz = sizeof(struct _user_fpregs_struct);
 #else
   unsigned struct_user_regs_struct_sz = sizeof(struct user_regs_struct);
   unsigned struct_user_fpregs_struct_sz = sizeof(struct user_fpregs_struct);
 #endif // __mips64 || __powerpc64__ || __aarch64__
 #if defined(__x86_64) || defined(__mips64) || defined(__powerpc64__) || \
-    defined(__aarch64__) || defined(__arm__)
+    defined(__aarch64__) || defined(__arm__) || defined(__s390__)
   unsigned struct_user_fpxregs_struct_sz = 0;
 #else
   unsigned struct_user_fpxregs_struct_sz = sizeof(struct user_fpxregs_struct);
 #endif // __x86_64 || __mips64 || __powerpc64__ || __aarch64__ || __arm__
+// || __s390__
 #ifdef __arm__
   unsigned struct_user_vfpregs_struct_sz = ARM_VFPREGS_SIZE;
 #else
@@ -1055,7 +1060,14 @@
 // Can't write checks for sa_handler and sa_sigaction due to them being
 // preprocessor macros.
 CHECK_STRUCT_SIZE_AND_OFFSET(sigaction, sa_mask);
+#ifndef __GLIBC_PREREQ
+#define __GLIBC_PREREQ(x, y) 0
+#endif
+#if !defined(__s390x__) || __GLIBC_PREREQ (2, 20)
+// On s390x glibc 2.19 and earlier sa_flags was unsigned long, and sa_resv
+// didn't exist.
 CHECK_STRUCT_SIZE_AND_OFFSET(sigaction, sa_flags);
+#endif
 #if SANITIZER_LINUX && (!SANITIZER_ANDROID || !SANITIZER_MIPS32)
 CHECK_STRUCT_SIZE_AND_OFFSET(sigaction, sa_restorer);
 #endif
@@ -1127,9 +1139,6 @@
 CHECK_SIZE_AND_OFFSET(ipc_perm, gid);
 CHECK_SIZE_AND_OFFSET(ipc_perm, cuid);
 CHECK_SIZE_AND_OFFSET(ipc_perm, cgid);
-#ifndef __GLIBC_PREREQ
-#define __GLIBC_PREREQ(x, y) 0
-#endif
 #if !defined(__aarch64__) || !SANITIZER_LINUX || __GLIBC_PREREQ (2, 21)
 /* On aarch64 glibc 2.20 and earlier provided incorrect mode field.  */
 CHECK_SIZE_AND_OFFSET(ipc_perm, mode);
diff --git a/lib/sanitizer_common/sanitizer_platform_limits_posix.h b/lib/sanitizer_common/sanitizer_platform_limits_posix.h
index af33a45..14bc750 100644
--- a/lib/sanitizer_common/sanitizer_platform_limits_posix.h
+++ b/lib/sanitizer_common/sanitizer_platform_limits_posix.h
@@ -81,6 +81,12 @@
                  SANITIZER_ANDROID ? FIRST_32_SECOND_64(104, 128) :
                                      FIRST_32_SECOND_64(144, 216);
   const unsigned struct_kernel_stat64_sz = 104;
+#elif defined(__s390__) && !defined(__s390x__)
+  const unsigned struct_kernel_stat_sz = 64;
+  const unsigned struct_kernel_stat64_sz = 104;
+#elif defined(__s390x__)
+  const unsigned struct_kernel_stat_sz = 144;
+  const unsigned struct_kernel_stat64_sz = 0;
 #endif
   struct __sanitizer_perf_event_attr {
     unsigned type;
@@ -101,7 +107,7 @@
 
 #if SANITIZER_LINUX || SANITIZER_FREEBSD
 
-#if defined(__powerpc64__)
+#if defined(__powerpc64__) || defined(__s390__)
   const unsigned struct___old_kernel_stat_sz = 0;
 #else
   const unsigned struct___old_kernel_stat_sz = 32;
@@ -188,7 +194,7 @@
     unsigned __seq;
     u64 __unused1;
     u64 __unused2;
-#elif defined(__mips__) || defined(__aarch64__)
+#elif defined(__mips__) || defined(__aarch64__) || defined(__s390x__)
     unsigned int mode;
     unsigned short __seq;
     unsigned short __pad1;
@@ -576,7 +582,11 @@
     int sa_flags;
     __sanitizer_sigset_t sa_mask;
 #else
+#if defined(__s390x__)
+    int sa_resv;
+#else
     __sanitizer_sigset_t sa_mask;
+#endif
 #ifndef __mips__
     int sa_flags;
 #endif
@@ -587,6 +597,9 @@
 #if defined(__mips__) && (SANITIZER_WORDSIZE == 32)
     int sa_resv[1];
 #endif
+#if defined(__s390x__)
+    __sanitizer_sigset_t sa_mask;
+#endif
   };
 #endif // !SANITIZER_ANDROID
 
@@ -747,7 +760,8 @@
 
 #if SANITIZER_LINUX && !SANITIZER_ANDROID && \
   (defined(__i386) || defined(__x86_64) || defined(__mips64) || \
-    defined(__powerpc64__) || defined(__aarch64__) || defined(__arm__))
+    defined(__powerpc64__) || defined(__aarch64__) || defined(__arm__) || \
+    defined(__s390__))
   extern unsigned struct_user_regs_struct_sz;
   extern unsigned struct_user_fpregs_struct_sz;
   extern unsigned struct_user_fpxregs_struct_sz;
diff --git a/lib/sanitizer_common/sanitizer_posix.cc b/lib/sanitizer_common/sanitizer_posix.cc
index 5ae6866..c70d5a4 100644
--- a/lib/sanitizer_common/sanitizer_posix.cc
+++ b/lib/sanitizer_common/sanitizer_posix.cc
@@ -89,7 +89,11 @@
 
 uptr GetMaxVirtualAddress() {
 #if SANITIZER_WORDSIZE == 64
-# if defined(__powerpc64__) || defined(__aarch64__)
+# if defined(__aarch64__) && SANITIZER_IOS && !SANITIZER_IOSSIM
+  // Ideally, we would derive the upper bound from MACH_VM_MAX_ADDRESS. The
+  // upper bound can change depending on the device.
+  return 0x200000000 - 1;
+# elif defined(__powerpc64__) || defined(__aarch64__)
   // On PowerPC64 we have two different address space layouts: 44- and 46-bit.
   // We somehow need to figure out which one we are using now and choose
   // one of 0x00000fffffffffffUL and 0x00003fffffffffffUL.
@@ -100,15 +104,21 @@
   return (1ULL << (MostSignificantSetBitIndex(GET_CURRENT_FRAME()) + 1)) - 1;
 # elif defined(__mips64)
   return (1ULL << 40) - 1;  // 0x000000ffffffffffUL;
+# elif defined(__s390x__)
+  return (1ULL << 53) - 1;  // 0x001fffffffffffffUL;
 # else
   return (1ULL << 47) - 1;  // 0x00007fffffffffffUL;
 # endif
 #else  // SANITIZER_WORDSIZE == 32
+# if defined(__s390__)
+  return (1ULL << 31) - 1;  // 0x7fffffff;
+# else
   uptr res = (1ULL << 32) - 1;  // 0xffffffff;
   if (!common_flags()->full_address_space)
     res -= GetKernelAreaSize();
   CHECK_LT(reinterpret_cast<uptr>(&res), res);
   return res;
+# endif
 #endif  // SANITIZER_WORDSIZE
 }
 
@@ -135,6 +145,26 @@
   DecreaseTotalMmap(size);
 }
 
+// We want to map a chunk of address space aligned to 'alignment'.
+// We do it by mapping a bit more and then unmapping redundant pieces.
+// We probably can do it with fewer syscalls in some OS-dependent way.
+void *MmapAlignedOrDie(uptr size, uptr alignment, const char *mem_type) {
+  CHECK(IsPowerOfTwo(size));
+  CHECK(IsPowerOfTwo(alignment));
+  uptr map_size = size + alignment;
+  uptr map_res = (uptr)MmapOrDie(map_size, mem_type);
+  uptr map_end = map_res + map_size;
+  uptr res = map_res;
+  if (res & (alignment - 1))  // Not aligned.
+    res = (map_res + alignment) & ~(alignment - 1);
+  uptr end = res + size;
+  if (res != map_res)
+    UnmapOrDie((void*)map_res, res - map_res);
+  if (end != map_end)
+    UnmapOrDie((void*)end, map_end - end);
+  return (void*)res;
+}
+
 void *MmapNoReserveOrDie(uptr size, const char *mem_type) {
   uptr PageSize = GetPageSizeCached();
   uptr p = internal_mmap(nullptr,
@@ -171,6 +201,10 @@
   return 0 == internal_mprotect((void*)addr, size, PROT_NONE);
 }
 
+bool MprotectReadOnly(uptr addr, uptr size) {
+  return 0 == internal_mprotect((void *)addr, size, PROT_READ);
+}
+
 fd_t OpenFile(const char *filename, FileAccessMode mode, error_t *errno_p) {
   int flags;
   switch (mode) {
@@ -315,10 +349,13 @@
 }
 
 SignalContext SignalContext::Create(void *siginfo, void *context) {
-  uptr addr = (uptr)((siginfo_t*)siginfo)->si_addr;
+  auto si = (siginfo_t *)siginfo;
+  uptr addr = (uptr)si->si_addr;
   uptr pc, sp, bp;
   GetPcSpBp(context, &pc, &sp, &bp);
-  return SignalContext(context, addr, pc, sp, bp);
+  WriteFlag write_flag = GetWriteFlag(context);
+  bool is_memory_access = si->si_signo == SIGSEGV;
+  return SignalContext(context, addr, pc, sp, bp, is_memory_access, write_flag);
 }
 
 } // namespace __sanitizer
diff --git a/lib/sanitizer_common/sanitizer_posix.h b/lib/sanitizer_common/sanitizer_posix.h
index c0426a0..7f862cd 100644
--- a/lib/sanitizer_common/sanitizer_posix.h
+++ b/lib/sanitizer_common/sanitizer_posix.h
@@ -16,6 +16,7 @@
 // ----------- ATTENTION -------------
 // This header should NOT include any other headers from sanitizer runtime.
 #include "sanitizer_internal_defs.h"
+#include "sanitizer_platform_limits_posix.h"
 
 #if !SANITIZER_POSIX
 // Make it hard to accidentally use any of functions declared in this file:
@@ -77,8 +78,15 @@
 
 int my_pthread_attr_getstack(void *attr, void **addr, uptr *size);
 
+// A routine named real_sigaction() must be implemented by each sanitizer in
+// order for internal_sigaction() to bypass interceptors.
 int internal_sigaction(int signum, const void *act, void *oldact);
+void internal_sigfillset(__sanitizer_sigset_t *set);
+void internal_sigemptyset(__sanitizer_sigset_t *set);
+bool internal_sigismember(__sanitizer_sigset_t *set, int signum);
 
+uptr internal_execve(const char *filename, char *const argv[],
+                     char *const envp[]);
 }  // namespace __sanitizer
 
 #endif  // SANITIZER_POSIX_H
diff --git a/lib/sanitizer_common/sanitizer_posix_libcdep.cc b/lib/sanitizer_common/sanitizer_posix_libcdep.cc
index c158eed..f1e8b50 100644
--- a/lib/sanitizer_common/sanitizer_posix_libcdep.cc
+++ b/lib/sanitizer_common/sanitizer_posix_libcdep.cc
@@ -34,6 +34,7 @@
 #include <sys/stat.h>
 #include <sys/time.h>
 #include <sys/types.h>
+#include <sys/wait.h>
 #include <unistd.h>
 
 #if SANITIZER_FREEBSD
@@ -97,6 +98,10 @@
   return (stack_size == RLIM_INFINITY);
 }
 
+uptr GetStackSizeLimitInBytes() {
+  return (uptr)getlim(RLIMIT_STACK);
+}
+
 void SetStackSizeLimitInBytes(uptr limit) {
   setlim(RLIMIT_STACK, (rlim_t)limit);
   CHECK(!StackSizeIsUnlimited());
@@ -168,7 +173,7 @@
 typedef void (*sa_sigaction_t)(int, siginfo_t *, void *);
 static void MaybeInstallSigaction(int signum,
                                   SignalHandlerType handler) {
-  if (!IsDeadlySignal(signum))
+  if (!IsHandledDeadlySignal(signum))
     return;
   struct sigaction sigact;
   internal_memset(&sigact, 0, sizeof(sigact));
@@ -269,7 +274,7 @@
   return (void *)p;
 }
 
-void *MmapNoAccess(uptr fixed_addr, uptr size, const char *name) {
+void *MmapFixedNoAccess(uptr fixed_addr, uptr size, const char *name) {
   int fd = name ? GetNamedMappingFd(name, size) : -1;
   unsigned flags = MAP_PRIVATE | MAP_FIXED | MAP_NORESERVE;
   if (fd == -1) flags |= MAP_ANON;
@@ -278,6 +283,11 @@
                                0);
 }
 
+void *MmapNoAccess(uptr size) {
+  unsigned flags = MAP_PRIVATE | MAP_ANON | MAP_NORESERVE;
+  return (void *)internal_mmap(nullptr, size, PROT_NONE, flags, -1, 0);
+}
+
 // This function is defined elsewhere if we intercepted pthread_attr_getstack.
 extern "C" {
 SANITIZER_WEAK_ATTRIBUTE int
@@ -320,6 +330,79 @@
 }
 #endif // !SANITIZER_GO
 
+pid_t StartSubprocess(const char *program, const char *const argv[],
+                      fd_t stdin_fd, fd_t stdout_fd, fd_t stderr_fd) {
+  auto file_closer = at_scope_exit([&] {
+    if (stdin_fd != kInvalidFd) {
+      internal_close(stdin_fd);
+    }
+    if (stdout_fd != kInvalidFd) {
+      internal_close(stdout_fd);
+    }
+    if (stderr_fd != kInvalidFd) {
+      internal_close(stderr_fd);
+    }
+  });
+
+  int pid = internal_fork();
+
+  if (pid < 0) {
+    int rverrno;
+    if (internal_iserror(pid, &rverrno)) {
+      Report("WARNING: failed to fork (errno %d)\n", rverrno);
+    }
+    return pid;
+  }
+
+  if (pid == 0) {
+    // Child subprocess
+    if (stdin_fd != kInvalidFd) {
+      internal_close(STDIN_FILENO);
+      internal_dup2(stdin_fd, STDIN_FILENO);
+      internal_close(stdin_fd);
+    }
+    if (stdout_fd != kInvalidFd) {
+      internal_close(STDOUT_FILENO);
+      internal_dup2(stdout_fd, STDOUT_FILENO);
+      internal_close(stdout_fd);
+    }
+    if (stderr_fd != kInvalidFd) {
+      internal_close(STDERR_FILENO);
+      internal_dup2(stderr_fd, STDERR_FILENO);
+      internal_close(stderr_fd);
+    }
+
+    for (int fd = sysconf(_SC_OPEN_MAX); fd > 2; fd--) internal_close(fd);
+
+    execv(program, const_cast<char **>(&argv[0]));
+    internal__exit(1);
+  }
+
+  return pid;
+}
+
+bool IsProcessRunning(pid_t pid) {
+  int process_status;
+  uptr waitpid_status = internal_waitpid(pid, &process_status, WNOHANG);
+  int local_errno;
+  if (internal_iserror(waitpid_status, &local_errno)) {
+    VReport(1, "Waiting on the process failed (errno %d).\n", local_errno);
+    return false;
+  }
+  return waitpid_status == 0;
+}
+
+int WaitForProcess(pid_t pid) {
+  int process_status;
+  uptr waitpid_status = internal_waitpid(pid, &process_status, 0);
+  int local_errno;
+  if (internal_iserror(waitpid_status, &local_errno)) {
+    VReport(1, "Waiting on the process failed (errno %d).\n", local_errno);
+    return -1;
+  }
+  return process_status;
+}
+
 } // namespace __sanitizer
 
 #endif // SANITIZER_POSIX
diff --git a/lib/sanitizer_common/sanitizer_printf.cc b/lib/sanitizer_common/sanitizer_printf.cc
index 2794e66..434ebb9 100644
--- a/lib/sanitizer_common/sanitizer_printf.cc
+++ b/lib/sanitizer_common/sanitizer_printf.cc
@@ -278,9 +278,12 @@
 #   undef CHECK_NEEDED_LENGTH
   }
   RawWrite(buffer);
-  if (common_flags()->log_to_syslog && ShouldLogAfterPrintf())
-    WriteToSyslog(buffer);
+
+  // Remove color sequences from the message.
+  RemoveANSIEscapeSequencesFromString(buffer);
   CallPrintfAndReportCallback(buffer);
+  LogMessageOnPrintf(buffer);
+
   // If we had mapped any memory, clean up.
   if (buffer != local_buffer)
     UnmapOrDie((void *)buffer, buffer_size);
diff --git a/lib/sanitizer_common/sanitizer_procmaps.h b/lib/sanitizer_common/sanitizer_procmaps.h
index 94e3871..1fe59ab 100644
--- a/lib/sanitizer_common/sanitizer_procmaps.h
+++ b/lib/sanitizer_common/sanitizer_procmaps.h
@@ -43,9 +43,8 @@
   // instead of aborting.
   static void CacheMemoryMappings();
 
-  // Stores the list of mapped objects into an array.
-  uptr DumpListOfModules(LoadedModule *modules, uptr max_modules,
-                         string_predicate_t filter);
+  // Adds all mapped objects into a vector.
+  void DumpListOfModules(InternalMmapVector<LoadedModule> *modules);
 
   // Memory protection masks.
   static const uptr kProtectionRead = 1;
diff --git a/lib/sanitizer_common/sanitizer_procmaps_common.cc b/lib/sanitizer_common/sanitizer_procmaps_common.cc
index d43432c..fac3fbd 100644
--- a/lib/sanitizer_common/sanitizer_procmaps_common.cc
+++ b/lib/sanitizer_common/sanitizer_procmaps_common.cc
@@ -116,22 +116,17 @@
   }
 }
 
-uptr MemoryMappingLayout::DumpListOfModules(LoadedModule *modules,
-                                            uptr max_modules,
-                                            string_predicate_t filter) {
+void MemoryMappingLayout::DumpListOfModules(
+    InternalMmapVector<LoadedModule> *modules) {
   Reset();
   uptr cur_beg, cur_end, cur_offset, prot;
   InternalScopedString module_name(kMaxPathLength);
-  uptr n_modules = 0;
-  for (uptr i = 0; n_modules < max_modules &&
-                       Next(&cur_beg, &cur_end, &cur_offset, module_name.data(),
-                            module_name.size(), &prot);
+  for (uptr i = 0; Next(&cur_beg, &cur_end, &cur_offset, module_name.data(),
+                        module_name.size(), &prot);
        i++) {
     const char *cur_name = module_name.data();
     if (cur_name[0] == '\0')
       continue;
-    if (filter && !filter(cur_name))
-      continue;
     // Don't subtract 'cur_beg' from the first entry:
     // * If a binary is compiled w/o -pie, then the first entry in
     //   process maps is likely the binary itself (all dynamic libs
@@ -144,12 +139,11 @@
     //   shadow memory of the tool), so the module can't be the
     //   first entry.
     uptr base_address = (i ? cur_beg : 0) - cur_offset;
-    LoadedModule *cur_module = &modules[n_modules];
-    cur_module->set(cur_name, base_address);
-    cur_module->addAddressRange(cur_beg, cur_end, prot & kProtectionExecute);
-    n_modules++;
+    LoadedModule cur_module;
+    cur_module.set(cur_name, base_address);
+    cur_module.addAddressRange(cur_beg, cur_end, prot & kProtectionExecute);
+    modules->push_back(cur_module);
   }
-  return n_modules;
 }
 
 void GetMemoryProfile(fill_profile_f cb, uptr *stats, uptr stats_size) {
diff --git a/lib/sanitizer_common/sanitizer_procmaps_mac.cc b/lib/sanitizer_common/sanitizer_procmaps_mac.cc
index d10881e..417cc90 100644
--- a/lib/sanitizer_common/sanitizer_procmaps_mac.cc
+++ b/lib/sanitizer_common/sanitizer_procmaps_mac.cc
@@ -155,34 +155,28 @@
   return false;
 }
 
-uptr MemoryMappingLayout::DumpListOfModules(LoadedModule *modules,
-                                            uptr max_modules,
-                                            string_predicate_t filter) {
+void MemoryMappingLayout::DumpListOfModules(
+    InternalMmapVector<LoadedModule> *modules) {
   Reset();
   uptr cur_beg, cur_end, prot;
   InternalScopedString module_name(kMaxPathLength);
-  uptr n_modules = 0;
-  for (uptr i = 0; n_modules < max_modules &&
-                       Next(&cur_beg, &cur_end, 0, module_name.data(),
-                            module_name.size(), &prot);
+  for (uptr i = 0; Next(&cur_beg, &cur_end, 0, module_name.data(),
+                        module_name.size(), &prot);
        i++) {
     const char *cur_name = module_name.data();
     if (cur_name[0] == '\0')
       continue;
-    if (filter && !filter(cur_name))
-      continue;
     LoadedModule *cur_module = nullptr;
-    if (n_modules > 0 &&
-        0 == internal_strcmp(cur_name, modules[n_modules - 1].full_name())) {
-      cur_module = &modules[n_modules - 1];
+    if (!modules->empty() &&
+        0 == internal_strcmp(cur_name, modules->back().full_name())) {
+      cur_module = &modules->back();
     } else {
-      cur_module = &modules[n_modules];
+      modules->push_back(LoadedModule());
+      cur_module = &modules->back();
       cur_module->set(cur_name, cur_beg);
-      n_modules++;
     }
     cur_module->addAddressRange(cur_beg, cur_end, prot & kProtectionExecute);
   }
-  return n_modules;
 }
 
 }  // namespace __sanitizer
diff --git a/lib/sanitizer_common/sanitizer_quarantine.h b/lib/sanitizer_common/sanitizer_quarantine.h
index 9e0bf2d..ccc22bf 100644
--- a/lib/sanitizer_common/sanitizer_quarantine.h
+++ b/lib/sanitizer_common/sanitizer_quarantine.h
@@ -101,10 +101,12 @@
   void NOINLINE DoRecycle(Cache *c, Callback cb) {
     while (QuarantineBatch *b = c->DequeueBatch()) {
       const uptr kPrefetch = 16;
+      CHECK(kPrefetch <= ARRAY_SIZE(b->batch));
       for (uptr i = 0; i < kPrefetch; i++)
         PREFETCH(b->batch[i]);
-      for (uptr i = 0; i < b->count; i++) {
-        PREFETCH(b->batch[i + kPrefetch]);
+      for (uptr i = 0, count = b->count; i < count; i++) {
+        if (i + kPrefetch < count)
+          PREFETCH(b->batch[i + kPrefetch]);
         cb.Recycle((Node*)b->batch[i]);
       }
       cb.Deallocate(b);
diff --git a/lib/sanitizer_common/sanitizer_stacktrace.cc b/lib/sanitizer_common/sanitizer_stacktrace.cc
index 7862575..7ad1f1f 100644
--- a/lib/sanitizer_common/sanitizer_stacktrace.cc
+++ b/lib/sanitizer_common/sanitizer_stacktrace.cc
@@ -40,11 +40,6 @@
   top_frame_bp = 0;
 }
 
-// Check if given pointer points into allocated stack area.
-static inline bool IsValidFrame(uptr frame, uptr stack_top, uptr stack_bottom) {
-  return frame > stack_bottom && frame < stack_top - 2 * sizeof (uhwptr);
-}
-
 // In GCC on ARM bp points to saved lr, not fp, so we should check the next
 // cell in stack to be a saved frame pointer. GetCanonicFrame returns the
 // pointer to saved frame pointer in any case.
@@ -71,6 +66,7 @@
 
 void BufferedStackTrace::FastUnwindStack(uptr pc, uptr bp, uptr stack_top,
                                          uptr stack_bottom, u32 max_depth) {
+  const uptr kPageSize = GetPageSizeCached();
   CHECK_GE(max_depth, 2);
   trace_buffer[0] = pc;
   size = 1;
@@ -92,9 +88,16 @@
         !IsAligned((uptr)caller_frame, sizeof(uhwptr)))
       break;
     uhwptr pc1 = caller_frame[2];
+#elif defined(__s390__)
+    uhwptr pc1 = frame[14];
 #else
     uhwptr pc1 = frame[1];
 #endif
+    // Let's assume that any pointer in the 0th page (i.e. <0x1000 on i386 and
+    // x86_64) is invalid and stop unwinding here.  If we're adding support for
+    // a platform where this isn't true, we need to reconsider this check.
+    if (pc1 < kPageSize)
+      break;
     if (pc1 != pc) {
       trace_buffer[size++] = (uptr) pc1;
     }
@@ -118,7 +121,7 @@
 uptr BufferedStackTrace::LocatePcInTrace(uptr pc) {
   // Use threshold to find PC in stack trace, as PC we want to unwind from may
   // slightly differ from return address in the actual unwinded stack trace.
-  const int kPcThreshold = 320;
+  const int kPcThreshold = 350;
   for (uptr i = 0; i < size; ++i) {
     if (MatchPc(pc, trace[i], kPcThreshold))
       return i;
diff --git a/lib/sanitizer_common/sanitizer_stacktrace.h b/lib/sanitizer_common/sanitizer_stacktrace.h
index 969cedb..90142df 100644
--- a/lib/sanitizer_common/sanitizer_stacktrace.h
+++ b/lib/sanitizer_common/sanitizer_stacktrace.h
@@ -110,6 +110,11 @@
   void operator=(const BufferedStackTrace &);
 };
 
+// Check if given pointer points into allocated stack area.
+static inline bool IsValidFrame(uptr frame, uptr stack_top, uptr stack_bottom) {
+  return frame > stack_bottom && frame < stack_top - 2 * sizeof (uhwptr);
+}
+
 }  // namespace __sanitizer
 
 // Use this macro if you want to print stack trace with the caller
diff --git a/lib/sanitizer_common/sanitizer_stacktrace_libcdep.cc b/lib/sanitizer_common/sanitizer_stacktrace_libcdep.cc
index f66fa79..59ca927 100644
--- a/lib/sanitizer_common/sanitizer_stacktrace_libcdep.cc
+++ b/lib/sanitizer_common/sanitizer_stacktrace_libcdep.cc
@@ -25,6 +25,8 @@
     return;
   }
   InternalScopedString frame_desc(GetPageSizeCached() * 2);
+  InternalScopedString dedup_token(GetPageSizeCached());
+  int dedup_frames = common_flags()->dedup_token_length;
   uptr frame_num = 0;
   for (uptr i = 0; i < size && trace[i]; i++) {
     // PCs in stack traces are actually the return addresses, that is,
@@ -38,11 +40,18 @@
                   cur->info, common_flags()->symbolize_vs_style,
                   common_flags()->strip_path_prefix);
       Printf("%s\n", frame_desc.data());
+      if (dedup_frames-- > 0) {
+        if (dedup_token.length())
+          dedup_token.append("--");
+        dedup_token.append(cur->info.function);
+      }
     }
     frames->ClearAll();
   }
   // Always print a trailing empty line after stack trace.
   Printf("\n");
+  if (dedup_token.length())
+    Printf("DEDUP_TOKEN: %s\n", dedup_token.data());
 }
 
 void BufferedStackTrace::Unwind(u32 max_depth, uptr pc, uptr bp, void *context,
diff --git a/lib/sanitizer_common/sanitizer_stoptheworld_linux_libcdep.cc b/lib/sanitizer_common/sanitizer_stoptheworld_linux_libcdep.cc
index d1e2f6a..1f8861f 100644
--- a/lib/sanitizer_common/sanitizer_stoptheworld_linux_libcdep.cc
+++ b/lib/sanitizer_common/sanitizer_stoptheworld_linux_libcdep.cc
@@ -15,7 +15,8 @@
 #include "sanitizer_platform.h"
 
 #if SANITIZER_LINUX && (defined(__x86_64__) || defined(__mips__) || \
-                        defined(__aarch64__) || defined(__powerpc64__))
+                        defined(__aarch64__) || defined(__powerpc64__) || \
+                        defined(__s390__))
 
 #include "sanitizer_stoptheworld.h"
 
@@ -232,8 +233,8 @@
 // Signal handler to wake up suspended threads when the tracer thread dies.
 static void TracerThreadSignalHandler(int signum, void *siginfo, void *uctx) {
   SignalContext ctx = SignalContext::Create(siginfo, uctx);
-  VPrintf(1, "Tracer caught signal %d: addr=0x%zx pc=0x%zx sp=0x%zx\n",
-      signum, ctx.addr, ctx.pc, ctx.sp);
+  Printf("Tracer caught signal %d: addr=0x%zx pc=0x%zx sp=0x%zx\n", signum,
+         ctx.addr, ctx.pc, ctx.sp);
   ThreadSuspender *inst = thread_suspender_instance;
   if (inst) {
     if (signum == SIGABRT)
@@ -481,6 +482,11 @@
 #define REG_SP sp
 #define ARCH_IOVEC_FOR_GETREGSET
 
+#elif defined(__s390__)
+typedef _user_regs_struct regs_struct;
+#define REG_SP gprs[15]
+#define ARCH_IOVEC_FOR_GETREGSET
+
 #else
 #error "Unsupported architecture"
 #endif // SANITIZER_ANDROID && defined(__arm__)
@@ -520,3 +526,4 @@
 
 #endif  // SANITIZER_LINUX && (defined(__x86_64__) || defined(__mips__)
         // || defined(__aarch64__) || defined(__powerpc64__)
+        // || defined(__s390__)
diff --git a/lib/sanitizer_common/sanitizer_symbolizer.cc b/lib/sanitizer_common/sanitizer_symbolizer.cc
index 8b2496a..534e55f 100644
--- a/lib/sanitizer_common/sanitizer_symbolizer.cc
+++ b/lib/sanitizer_common/sanitizer_symbolizer.cc
@@ -60,6 +60,7 @@
 
 void DataInfo::Clear() {
   InternalFree(module);
+  InternalFree(file);
   InternalFree(name);
   internal_memset(this, 0, sizeof(DataInfo));
 }
@@ -96,7 +97,7 @@
 }
 
 Symbolizer::Symbolizer(IntrusiveList<SymbolizerTool> tools)
-    : module_names_(&mu_), n_modules_(0), modules_fresh_(false), tools_(tools),
+    : module_names_(&mu_), modules_(), modules_fresh_(false), tools_(tools),
       start_hook_(0), end_hook_(0) {}
 
 Symbolizer::SymbolizerScope::SymbolizerScope(const Symbolizer *sym)
diff --git a/lib/sanitizer_common/sanitizer_symbolizer.h b/lib/sanitizer_common/sanitizer_symbolizer.h
index 9233223..bfe8509 100644
--- a/lib/sanitizer_common/sanitizer_symbolizer.h
+++ b/lib/sanitizer_common/sanitizer_symbolizer.h
@@ -65,6 +65,8 @@
   // (de)allocated using sanitizer internal allocator.
   char *module;
   uptr module_offset;
+  char *file;
+  uptr line;
   char *name;
   uptr start;
   uptr size;
@@ -80,6 +82,7 @@
   /// Initialize and return platform-specific implementation of symbolizer
   /// (if it wasn't already initialized).
   static Symbolizer *GetOrInit();
+  static void LateInitialize();
   // Returns a list of symbolized frames for a given address (containing
   // all inlined functions, if necessary).
   SymbolizedStack *SymbolizePC(uptr address);
@@ -113,6 +116,8 @@
   void AddHooks(StartSymbolizationHook start_hook,
                 EndSymbolizationHook end_hook);
 
+  const LoadedModule *FindModuleForAddress(uptr address);
+
  private:
   // GetModuleNameAndOffsetForPC has to return a string to the caller.
   // Since the corresponding module might get unloaded later, we should create
@@ -139,9 +144,7 @@
 
   bool FindModuleNameAndOffsetForAddress(uptr address, const char **module_name,
                                          uptr *module_offset);
-  LoadedModule *FindModuleForAddress(uptr address);
-  LoadedModule modules_[kMaxNumberOfModules];
-  uptr n_modules_;
+  ListOfModules modules_;
   // If stale, need to reload the modules before looking up addresses.
   bool modules_fresh_;
 
@@ -157,7 +160,6 @@
   // always synchronized.
   BlockingMutex mu_;
 
-  typedef IntrusiveList<SymbolizerTool>::Iterator Iterator;
   IntrusiveList<SymbolizerTool> tools_;
 
   explicit Symbolizer(IntrusiveList<SymbolizerTool> tools);
diff --git a/lib/sanitizer_common/sanitizer_symbolizer_internal.h b/lib/sanitizer_common/sanitizer_symbolizer_internal.h
index 12c70b6..ada059c 100644
--- a/lib/sanitizer_common/sanitizer_symbolizer_internal.h
+++ b/lib/sanitizer_common/sanitizer_symbolizer_internal.h
@@ -28,7 +28,7 @@
 const char *ExtractTokenUpToDelimiter(const char *str, const char *delimiter,
                                       char **result);
 
-const char *DemangleCXXABI(const char *name);
+const char *DemangleSwiftAndCXX(const char *name);
 
 // SymbolizerTool is an interface that is implemented by individual "tools"
 // that can perform symbolication (external llvm-symbolizer, libbacktrace,
diff --git a/lib/sanitizer_common/sanitizer_symbolizer_libcdep.cc b/lib/sanitizer_common/sanitizer_symbolizer_libcdep.cc
index 8c3ad81..36b4fa9 100644
--- a/lib/sanitizer_common/sanitizer_symbolizer_libcdep.cc
+++ b/lib/sanitizer_common/sanitizer_symbolizer_libcdep.cc
@@ -69,10 +69,9 @@
     return res;
   // Always fill data about module name and offset.
   res->info.FillModuleInfo(module_name, module_offset);
-  for (auto iter = Iterator(&tools_); iter.hasNext();) {
-    auto *tool = iter.next();
+  for (auto &tool : tools_) {
     SymbolizerScope sym_scope(this);
-    if (tool->SymbolizePC(addr, res)) {
+    if (tool.SymbolizePC(addr, res)) {
       return res;
     }
   }
@@ -88,10 +87,9 @@
   info->Clear();
   info->module = internal_strdup(module_name);
   info->module_offset = module_offset;
-  for (auto iter = Iterator(&tools_); iter.hasNext();) {
-    auto *tool = iter.next();
+  for (auto &tool : tools_) {
     SymbolizerScope sym_scope(this);
-    if (tool->SymbolizeData(addr, info)) {
+    if (tool.SymbolizeData(addr, info)) {
       return true;
     }
   }
@@ -113,19 +111,17 @@
 
 void Symbolizer::Flush() {
   BlockingMutexLock l(&mu_);
-  for (auto iter = Iterator(&tools_); iter.hasNext();) {
-    auto *tool = iter.next();
+  for (auto &tool : tools_) {
     SymbolizerScope sym_scope(this);
-    tool->Flush();
+    tool.Flush();
   }
 }
 
 const char *Symbolizer::Demangle(const char *name) {
   BlockingMutexLock l(&mu_);
-  for (auto iter = Iterator(&tools_); iter.hasNext();) {
-    auto *tool = iter.next();
+  for (auto &tool : tools_) {
     SymbolizerScope sym_scope(this);
-    if (const char *demangled = tool->Demangle(name))
+    if (const char *demangled = tool.Demangle(name))
       return demangled;
   }
   return PlatformDemangle(name);
@@ -139,27 +135,23 @@
 bool Symbolizer::FindModuleNameAndOffsetForAddress(uptr address,
                                                    const char **module_name,
                                                    uptr *module_offset) {
-  LoadedModule *module = FindModuleForAddress(address);
-  if (module == 0)
+  const LoadedModule *module = FindModuleForAddress(address);
+  if (module == nullptr)
     return false;
   *module_name = module->full_name();
   *module_offset = address - module->base_address();
   return true;
 }
 
-LoadedModule *Symbolizer::FindModuleForAddress(uptr address) {
+const LoadedModule *Symbolizer::FindModuleForAddress(uptr address) {
   bool modules_were_reloaded = false;
   if (!modules_fresh_) {
-    for (uptr i = 0; i < n_modules_; i++)
-      modules_[i].clear();
-    n_modules_ =
-        GetListOfModules(modules_, kMaxNumberOfModules, /* filter */ nullptr);
-    CHECK_GT(n_modules_, 0);
-    CHECK_LT(n_modules_, kMaxNumberOfModules);
+    modules_.init();
+    RAW_CHECK(modules_.size() > 0);
     modules_fresh_ = true;
     modules_were_reloaded = true;
   }
-  for (uptr i = 0; i < n_modules_; i++) {
+  for (uptr i = 0; i < modules_.size(); i++) {
     if (modules_[i].containsAddress(address)) {
       return &modules_[i];
     }
@@ -213,10 +205,18 @@
     const char* const kSymbolizerArch = "--default-arch=x86_64";
 #elif defined(__i386__)
     const char* const kSymbolizerArch = "--default-arch=i386";
-#elif defined(__powerpc64__) && defined(__BIG_ENDIAN__)
+#elif defined(__aarch64__)
+    const char* const kSymbolizerArch = "--default-arch=arm64";
+#elif defined(__arm__)
+    const char* const kSymbolizerArch = "--default-arch=arm";
+#elif defined(__powerpc64__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
     const char* const kSymbolizerArch = "--default-arch=powerpc64";
-#elif defined(__powerpc64__) && defined(__LITTLE_ENDIAN__)
+#elif defined(__powerpc64__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
     const char* const kSymbolizerArch = "--default-arch=powerpc64le";
+#elif defined(__s390x__)
+    const char* const kSymbolizerArch = "--default-arch=s390x";
+#elif defined(__s390__)
+    const char* const kSymbolizerArch = "--default-arch=s390";
 #else
     const char* const kSymbolizerArch = "--default-arch=unknown";
 #endif
diff --git a/lib/sanitizer_common/sanitizer_symbolizer_mac.cc b/lib/sanitizer_common/sanitizer_symbolizer_mac.cc
index 64048fa..d591abc 100644
--- a/lib/sanitizer_common/sanitizer_symbolizer_mac.cc
+++ b/lib/sanitizer_common/sanitizer_symbolizer_mac.cc
@@ -32,7 +32,7 @@
   Dl_info info;
   int result = dladdr((const void *)addr, &info);
   if (!result) return false;
-  const char *demangled = DemangleCXXABI(info.dli_sname);
+  const char *demangled = DemangleSwiftAndCXX(info.dli_sname);
   stack->info.function = demangled ? internal_strdup(demangled) : nullptr;
   return true;
 }
@@ -41,7 +41,7 @@
   Dl_info info;
   int result = dladdr((const void *)addr, &info);
   if (!result) return false;
-  const char *demangled = DemangleCXXABI(info.dli_sname);
+  const char *demangled = DemangleSwiftAndCXX(info.dli_sname);
   datainfo->name = internal_strdup(demangled);
   datainfo->start = (uptr)info.dli_saddr;
   return true;
@@ -79,23 +79,6 @@
   char pid_str_[16];
 };
 
-static const char *kAtosErrorMessages[] = {
-  "atos cannot examine process",
-  "unable to get permission to examine process",
-  "An admin user name and password is required",
-  "could not load inserted library",
-  "architecture mismatch between analysis process",
-};
-
-static bool IsAtosErrorMessage(const char *str) {
-  for (uptr i = 0; i < ARRAY_SIZE(kAtosErrorMessages); i++) {
-    if (internal_strstr(str, kAtosErrorMessages[i])) {
-      return true;
-    }
-  }
-  return false;
-}
-
 static bool ParseCommandOutput(const char *str, uptr addr, char **out_name,
                                char **out_module, char **out_file, uptr *line,
                                uptr *start_address) {
@@ -112,12 +95,6 @@
   //   0xdeadbeef (in library.dylib)
   //   0xdeadbeef
 
-  if (IsAtosErrorMessage(trim)) {
-    Report("atos returned an error: %s\n", trim);
-    InternalFree(trim);
-    return false;
-  }
-
   const char *rest = trim;
   char *symbol_name;
   rest = ExtractTokenUpToDelimiter(rest, " (in ", &symbol_name);
@@ -157,6 +134,7 @@
 
 bool AtosSymbolizer::SymbolizePC(uptr addr, SymbolizedStack *stack) {
   if (!process_) return false;
+  if (addr == 0) return false;
   char command[32];
   internal_snprintf(command, sizeof(command), "0x%zx\n", addr);
   const char *buf = process_->SendCommand(command);
diff --git a/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cc b/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cc
index fc8a7d9..7028da6 100644
--- a/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cc
+++ b/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cc
@@ -26,7 +26,9 @@
 #include "sanitizer_symbolizer_libbacktrace.h"
 #include "sanitizer_symbolizer_mac.h"
 
+#include <dlfcn.h>   // for dlsym()
 #include <errno.h>
+#include <stdint.h>
 #include <stdlib.h>
 #include <sys/wait.h>
 #include <unistd.h>
@@ -61,6 +63,44 @@
   return name;
 }
 
+// As of now, there are no headers for the Swift runtime. Once they are
+// present, we will weakly link since we do not require Swift runtime to be
+// linked.
+typedef char *(*swift_demangle_ft)(const char *mangledName,
+                                   size_t mangledNameLength, char *outputBuffer,
+                                   size_t *outputBufferSize, uint32_t flags);
+static swift_demangle_ft swift_demangle_f;
+
+// This must not happen lazily at symbolication time, because dlsym uses
+// malloc and thread-local storage, which is not a good thing to do during
+// symbolication.
+static void InitializeSwiftDemangler() {
+  swift_demangle_f = (swift_demangle_ft)dlsym(RTLD_DEFAULT, "swift_demangle");
+}
+
+// Attempts to demangle a Swift name. The demangler will return nullptr if a
+// non-Swift name is passed in.
+const char *DemangleSwift(const char *name) {
+  if (!name) return nullptr;
+
+  // Check if we are dealing with a Swift mangled name first.
+  if (name[0] != '_' || name[1] != 'T') {
+    return nullptr;
+  }
+
+  if (swift_demangle_f)
+    return swift_demangle_f(name, internal_strlen(name), 0, 0, 0);
+
+  return nullptr;
+}
+
+const char *DemangleSwiftAndCXX(const char *name) {
+  if (!name) return nullptr;
+  if (const char *swift_demangled_name = DemangleSwift(name))
+    return swift_demangled_name;
+  return DemangleCXXABI(name);
+}
+
 bool SymbolizerProcess::StartSymbolizerSubprocess() {
   if (!FileExists(path_)) {
     if (!reported_invalid_path_) {
@@ -74,6 +114,13 @@
   if (use_forkpty_) {
 #if SANITIZER_MAC
     fd_t fd = kInvalidFd;
+
+    // forkpty redirects stdout and stderr into a single stream, so we would
+    // receive error messages as standard replies. To avoid that, let's dup
+    // stderr and restore it in the child.
+    int saved_stderr = dup(STDERR_FILENO);
+    CHECK_GE(saved_stderr, 0);
+
     // Use forkpty to disable buffering in the new terminal.
     pid = internal_forkpty(&fd);
     if (pid == -1) {
@@ -83,6 +130,11 @@
       return false;
     } else if (pid == 0) {
       // Child subprocess.
+
+      // Restore stderr.
+      CHECK_GE(dup2(saved_stderr, STDERR_FILENO), 0);
+      close(saved_stderr);
+
       const char *argv[kArgVMax];
       GetArgV(path_, argv);
       execv(path_, const_cast<char **>(&argv[0]));
@@ -92,6 +144,8 @@
     // Continue execution in parent process.
     input_fd_ = output_fd_ = fd;
 
+    close(saved_stderr);
+
     // Disable echo in the new terminal, disable CR.
     struct termios termflags;
     tcgetattr(fd, &termflags);
@@ -137,47 +191,23 @@
     CHECK(infd);
     CHECK(outfd);
 
-    // Real fork() may call user callbacks registered with pthread_atfork().
-    pid = internal_fork();
-    if (pid == -1) {
-      // Fork() failed.
+    const char *argv[kArgVMax];
+    GetArgV(path_, argv);
+    pid = StartSubprocess(path_, argv, /* stdin */ outfd[0],
+                          /* stdout */ infd[1]);
+    if (pid < 0) {
       internal_close(infd[0]);
-      internal_close(infd[1]);
-      internal_close(outfd[0]);
       internal_close(outfd[1]);
-      Report("WARNING: failed to fork external symbolizer "
-             " (errno: %d)\n", errno);
       return false;
-    } else if (pid == 0) {
-      // Child subprocess.
-      internal_close(STDOUT_FILENO);
-      internal_close(STDIN_FILENO);
-      internal_dup2(outfd[0], STDIN_FILENO);
-      internal_dup2(infd[1], STDOUT_FILENO);
-      internal_close(outfd[0]);
-      internal_close(outfd[1]);
-      internal_close(infd[0]);
-      internal_close(infd[1]);
-      for (int fd = sysconf(_SC_OPEN_MAX); fd > 2; fd--)
-        internal_close(fd);
-      const char *argv[kArgVMax];
-      GetArgV(path_, argv);
-      execv(path_, const_cast<char **>(&argv[0]));
-      internal__exit(1);
     }
 
-    // Continue execution in parent process.
-    internal_close(outfd[0]);
-    internal_close(infd[1]);
     input_fd_ = infd[0];
     output_fd_ = outfd[1];
   }
 
   // Check that symbolizer subprocess started successfully.
-  int pid_status;
   SleepForMillis(kSymbolizerStartupTimeMillis);
-  int exited_pid = waitpid(pid, &pid_status, WNOHANG);
-  if (exited_pid != 0) {
+  if (!IsProcessRunning(pid)) {
     // Either waitpid failed, or child has already exited.
     Report("WARNING: external symbolizer didn't start up correctly!\n");
     return false;
@@ -374,7 +404,7 @@
 #endif  // SANITIZER_SUPPORTS_WEAK_HOOKS
 
 const char *Symbolizer::PlatformDemangle(const char *name) {
-  return DemangleCXXABI(name);
+  return DemangleSwiftAndCXX(name);
 }
 
 void Symbolizer::PlatformPrepareForSandboxing() {}
@@ -461,6 +491,11 @@
   return new(symbolizer_allocator_) Symbolizer(list);
 }
 
+void Symbolizer::LateInitialize() {
+  Symbolizer::GetOrInit();
+  InitializeSwiftDemangler();
+}
+
 }  // namespace __sanitizer
 
 #endif  // SANITIZER_POSIX
diff --git a/lib/sanitizer_common/sanitizer_symbolizer_win.cc b/lib/sanitizer_common/sanitizer_symbolizer_win.cc
index b1dceeb..96bb3b4 100644
--- a/lib/sanitizer_common/sanitizer_symbolizer_win.cc
+++ b/lib/sanitizer_common/sanitizer_symbolizer_win.cc
@@ -279,6 +279,10 @@
   return new(symbolizer_allocator_) Symbolizer(list);
 }
 
+void Symbolizer::LateInitialize() {
+  Symbolizer::GetOrInit();
+}
+
 }  // namespace __sanitizer
 
 #endif  // _WIN32
diff --git a/lib/sanitizer_common/sanitizer_termination.cc b/lib/sanitizer_common/sanitizer_termination.cc
new file mode 100644
index 0000000..8243fc0
--- /dev/null
+++ b/lib/sanitizer_common/sanitizer_termination.cc
@@ -0,0 +1,86 @@
+//===-- sanitizer_termination.cc --------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// This file contains the Sanitizer termination functions CheckFailed and Die,
+/// and the callback functionalities associated with them.
+///
+//===----------------------------------------------------------------------===//
+
+#include "sanitizer_common.h"
+#include "sanitizer_libc.h"
+
+namespace __sanitizer {
+
+static const int kMaxNumOfInternalDieCallbacks = 5;
+static DieCallbackType InternalDieCallbacks[kMaxNumOfInternalDieCallbacks];
+
+bool AddDieCallback(DieCallbackType callback) {
+  for (int i = 0; i < kMaxNumOfInternalDieCallbacks; i++) {
+    if (InternalDieCallbacks[i] == nullptr) {
+      InternalDieCallbacks[i] = callback;
+      return true;
+    }
+  }
+  return false;
+}
+
+bool RemoveDieCallback(DieCallbackType callback) {
+  for (int i = 0; i < kMaxNumOfInternalDieCallbacks; i++) {
+    if (InternalDieCallbacks[i] == callback) {
+      internal_memmove(&InternalDieCallbacks[i], &InternalDieCallbacks[i + 1],
+                       sizeof(InternalDieCallbacks[0]) *
+                           (kMaxNumOfInternalDieCallbacks - i - 1));
+      InternalDieCallbacks[kMaxNumOfInternalDieCallbacks - 1] = nullptr;
+      return true;
+    }
+  }
+  return false;
+}
+
+static DieCallbackType UserDieCallback;
+void SetUserDieCallback(DieCallbackType callback) {
+  UserDieCallback = callback;
+}
+
+void NORETURN Die() {
+  if (UserDieCallback)
+    UserDieCallback();
+  for (int i = kMaxNumOfInternalDieCallbacks - 1; i >= 0; i--) {
+    if (InternalDieCallbacks[i])
+      InternalDieCallbacks[i]();
+  }
+  if (common_flags()->abort_on_error)
+    Abort();
+  internal__exit(common_flags()->exitcode);
+}
+
+static CheckFailedCallbackType CheckFailedCallback;
+void SetCheckFailedCallback(CheckFailedCallbackType callback) {
+  CheckFailedCallback = callback;
+}
+
+const int kSecondsToSleepWhenRecursiveCheckFailed = 2;
+
+void NORETURN CheckFailed(const char *file, int line, const char *cond,
+                          u64 v1, u64 v2) {
+  static atomic_uint32_t num_calls;
+  if (atomic_fetch_add(&num_calls, 1, memory_order_relaxed) > 10) {
+    SleepForSeconds(kSecondsToSleepWhenRecursiveCheckFailed);
+    Trap();
+  }
+
+  if (CheckFailedCallback) {
+    CheckFailedCallback(file, line, cond, v1, v2);
+  }
+  Report("Sanitizer CHECK failed: %s:%d %s (%lld, %lld)\n", file, line, cond,
+                                                            v1, v2);
+  Die();
+}
+
+} // namespace __sanitizer
diff --git a/lib/sanitizer_common/sanitizer_thread_registry.cc b/lib/sanitizer_common/sanitizer_thread_registry.cc
index 2ec92ff..6e7ddfa 100644
--- a/lib/sanitizer_common/sanitizer_thread_registry.cc
+++ b/lib/sanitizer_common/sanitizer_thread_registry.cc
@@ -277,6 +277,8 @@
 }
 
 void ThreadRegistry::QuarantinePush(ThreadContextBase *tctx) {
+  if (tctx->tid == 0)
+    return;  // Don't reuse the main thread.  It's a special snowflake.
   dead_threads_.push_back(tctx);
   if (dead_threads_.size() <= thread_quarantine_size_)
     return;
diff --git a/lib/sanitizer_common/sanitizer_tls_get_addr.cc b/lib/sanitizer_common/sanitizer_tls_get_addr.cc
index 213aced..77c1947 100644
--- a/lib/sanitizer_common/sanitizer_tls_get_addr.cc
+++ b/lib/sanitizer_common/sanitizer_tls_get_addr.cc
@@ -78,7 +78,7 @@
   DTLS_Deallocate(dtls.dtv, s);
 }
 
-#if defined(__powerpc64__)
+#if defined(__powerpc64__) || defined(__mips__)
 // This is glibc's TLS_DTV_OFFSET:
 // "Dynamic thread vector pointers point 0x8000 past the start of each
 //  TLS block."
diff --git a/lib/sanitizer_common/sanitizer_unwind_linux_libcdep.cc b/lib/sanitizer_common/sanitizer_unwind_linux_libcdep.cc
index 1082ccf..5943125 100644
--- a/lib/sanitizer_common/sanitizer_unwind_linux_libcdep.cc
+++ b/lib/sanitizer_common/sanitizer_unwind_linux_libcdep.cc
@@ -48,6 +48,11 @@
 
 #if SANITIZER_ANDROID
 void SanitizerInitializeUnwinder() {
+  if (AndroidGetApiLevel() >= ANDROID_LOLLIPOP_MR1) return;
+
+  // Pre-lollipop Android can not unwind through signal handler frames with
+  // libgcc unwinder, but it has a libcorkscrew.so library with the necessary
+  // workarounds.
   void *p = dlopen("libcorkscrew.so", RTLD_LAZY);
   if (!p) {
     VReport(1,
@@ -103,6 +108,11 @@
   UnwindTraceArg *arg = (UnwindTraceArg*)param;
   CHECK_LT(arg->stack->size, arg->max_depth);
   uptr pc = Unwind_GetIP(ctx);
+  const uptr kPageSize = GetPageSizeCached();
+  // Let's assume that any pointer in the 0th page (i.e. <0x1000 on i386 and
+  // x86_64) is invalid and stop unwinding here.  If we're adding support for
+  // a platform where this isn't true, we need to reconsider this check.
+  if (pc < kPageSize) return UNWIND_STOP;
   arg->stack->trace_buffer[arg->stack->size++] = pc;
   if (arg->stack->size == arg->max_depth) return UNWIND_STOP;
   return UNWIND_CONTINUE;
diff --git a/lib/sanitizer_common/sanitizer_win.cc b/lib/sanitizer_common/sanitizer_win.cc
index 861261d..a6998af 100644
--- a/lib/sanitizer_common/sanitizer_win.cc
+++ b/lib/sanitizer_common/sanitizer_win.cc
@@ -35,13 +35,15 @@
 
 // --------------------- sanitizer_common.h
 uptr GetPageSize() {
-  // FIXME: there is an API for getting the system page size (GetSystemInfo or
-  // GetNativeSystemInfo), but if we use it here we get test failures elsewhere.
-  return 1U << 14;
+  SYSTEM_INFO si;
+  GetSystemInfo(&si);
+  return si.dwPageSize;
 }
 
 uptr GetMmapGranularity() {
-  return 1U << 16;  // FIXME: is this configurable?
+  SYSTEM_INFO si;
+  GetSystemInfo(&si);
+  return si.dwAllocationGranularity;
 }
 
 uptr GetMaxVirtualAddress() {
@@ -95,20 +97,90 @@
   if (!size || !addr)
     return;
 
-  if (VirtualFree(addr, size, MEM_DECOMMIT) == 0) {
-    Report("ERROR: %s failed to "
-           "deallocate 0x%zx (%zd) bytes at address %p (error code: %d)\n",
-           SanitizerToolName, size, size, addr, GetLastError());
-    CHECK("unable to unmap" && 0);
+  MEMORY_BASIC_INFORMATION mbi;
+  CHECK(VirtualQuery(addr, &mbi, sizeof(mbi)));
+
+  // MEM_RELEASE can only be used to unmap whole regions previously mapped with
+  // VirtualAlloc. So we first try MEM_RELEASE since it is better, and if that
+  // fails try MEM_DECOMMIT.
+  if (VirtualFree(addr, 0, MEM_RELEASE) == 0) {
+    if (VirtualFree(addr, size, MEM_DECOMMIT) == 0) {
+      Report("ERROR: %s failed to "
+             "deallocate 0x%zx (%zd) bytes at address %p (error code: %d)\n",
+             SanitizerToolName, size, size, addr, GetLastError());
+      CHECK("unable to unmap" && 0);
+    }
   }
 }
 
+// We want to map a chunk of address space aligned to 'alignment'.
+void *MmapAlignedOrDie(uptr size, uptr alignment, const char *mem_type) {
+  CHECK(IsPowerOfTwo(size));
+  CHECK(IsPowerOfTwo(alignment));
+
+  // Windows will align our allocations to at least 64K.
+  alignment = Max(alignment, GetMmapGranularity());
+
+  uptr mapped_addr =
+      (uptr)VirtualAlloc(0, size, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
+  if (!mapped_addr)
+    ReportMmapFailureAndDie(size, mem_type, "allocate aligned", GetLastError());
+
+  // If we got it right on the first try, return. Otherwise, unmap it and go to
+  // the slow path.
+  if (IsAligned(mapped_addr, alignment))
+    return (void*)mapped_addr;
+  if (VirtualFree((void *)mapped_addr, 0, MEM_RELEASE) == 0)
+    ReportMmapFailureAndDie(size, mem_type, "deallocate", GetLastError());
+
+  // If we didn't get an aligned address, overallocate, find an aligned address,
+  // unmap, and try to allocate at that aligned address.
+  int retries = 0;
+  const int kMaxRetries = 10;
+  for (; retries < kMaxRetries &&
+         (mapped_addr == 0 || !IsAligned(mapped_addr, alignment));
+       retries++) {
+    // Overallocate size + alignment bytes.
+    mapped_addr =
+        (uptr)VirtualAlloc(0, size + alignment, MEM_RESERVE, PAGE_NOACCESS);
+    if (!mapped_addr)
+      ReportMmapFailureAndDie(size, mem_type, "allocate aligned",
+                              GetLastError());
+
+    // Find the aligned address.
+    uptr aligned_addr = RoundUpTo(mapped_addr, alignment);
+
+    // Free the overallocation.
+    if (VirtualFree((void *)mapped_addr, 0, MEM_RELEASE) == 0)
+      ReportMmapFailureAndDie(size, mem_type, "deallocate", GetLastError());
+
+    // Attempt to allocate exactly the number of bytes we need at the aligned
+    // address. This may fail for a number of reasons, in which case we continue
+    // the loop.
+    mapped_addr = (uptr)VirtualAlloc((void *)aligned_addr, size,
+                                     MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
+  }
+
+  // Fail if we can't make this work quickly.
+  if (retries == kMaxRetries && mapped_addr == 0)
+    ReportMmapFailureAndDie(size, mem_type, "allocate aligned", GetLastError());
+
+  return (void *)mapped_addr;
+}
+
 void *MmapFixedNoReserve(uptr fixed_addr, uptr size, const char *name) {
   // FIXME: is this really "NoReserve"? On Win32 this does not matter much,
   // but on Win64 it does.
-  (void)name; // unsupported
-  void *p = VirtualAlloc((LPVOID)fixed_addr, size,
-      MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
+  (void)name;  // unsupported
+#if SANITIZER_WINDOWS64
+  // On Windows64, use MEM_COMMIT would result in error
+  // 1455:ERROR_COMMITMENT_LIMIT.
+  // We use exception handler to commit page on demand.
+  void *p = VirtualAlloc((LPVOID)fixed_addr, size, MEM_RESERVE, PAGE_READWRITE);
+#else
+  void *p = VirtualAlloc((LPVOID)fixed_addr, size, MEM_RESERVE | MEM_COMMIT,
+                         PAGE_READWRITE);
+#endif
   if (p == 0)
     Report("ERROR: %s failed to "
            "allocate %p (%zd) bytes at %p (error code: %d)\n",
@@ -116,8 +188,18 @@
   return p;
 }
 
+// Memory space mapped by 'MmapFixedOrDie' must have been reserved by
+// 'MmapFixedNoAccess'.
 void *MmapFixedOrDie(uptr fixed_addr, uptr size) {
-  return MmapFixedNoReserve(fixed_addr, size);
+  void *p = VirtualAlloc((LPVOID)fixed_addr, size,
+      MEM_COMMIT, PAGE_READWRITE);
+  if (p == 0) {
+    char mem_type[30];
+    internal_snprintf(mem_type, sizeof(mem_type), "memory at address 0x%zx",
+                      fixed_addr);
+    ReportMmapFailureAndDie(size, mem_type, "allocate", GetLastError());
+  }
+  return p;
 }
 
 void *MmapNoReserveOrDie(uptr size, const char *mem_type) {
@@ -125,10 +207,10 @@
   return MmapOrDie(size, mem_type);
 }
 
-void *MmapNoAccess(uptr fixed_addr, uptr size, const char *name) {
+void *MmapFixedNoAccess(uptr fixed_addr, uptr size, const char *name) {
   (void)name; // unsupported
   void *res = VirtualAlloc((LPVOID)fixed_addr, size,
-                           MEM_RESERVE | MEM_COMMIT, PAGE_NOACCESS);
+                           MEM_RESERVE, PAGE_NOACCESS);
   if (res == 0)
     Report("WARNING: %s failed to "
            "mprotect %p (%zd) bytes at %p (error code: %d)\n",
@@ -136,6 +218,11 @@
   return res;
 }
 
+void *MmapNoAccess(uptr size) {
+  // FIXME: unsupported.
+  return nullptr;
+}
+
 bool MprotectNoAccess(uptr addr, uptr size) {
   DWORD old_protection;
   return VirtualProtect((LPVOID)addr, size, PAGE_NOACCESS, &old_protection);
@@ -234,15 +321,15 @@
 #ifndef SANITIZER_GO
 void DumpProcessMap() {
   Report("Dumping process modules:\n");
-  InternalScopedBuffer<LoadedModule> modules(kMaxNumberOfModules);
-  uptr num_modules =
-      GetListOfModules(modules.data(), kMaxNumberOfModules, nullptr);
+  ListOfModules modules;
+  modules.init();
+  uptr num_modules = modules.size();
 
   InternalScopedBuffer<ModuleInfo> module_infos(num_modules);
   for (size_t i = 0; i < num_modules; ++i) {
     module_infos[i].filepath = modules[i].full_name();
     module_infos[i].base_address = modules[i].base_address();
-    module_infos[i].end_address = modules[i].ranges().next()->end;
+    module_infos[i].end_address = modules[i].ranges().front()->end;
   }
   qsort(module_infos.data(), num_modules, sizeof(ModuleInfo),
         CompareModulesBase);
@@ -317,6 +404,7 @@
   internal__exit(3);
 }
 
+#ifndef SANITIZER_GO
 // Read the file to extract the ImageBase field from the PE header. If ASLR is
 // disabled and this virtual address is available, the loader will typically
 // load the image at this address. Therefore, we call it the preferred base. Any
@@ -369,9 +457,8 @@
   return (uptr)pe_header->ImageBase;
 }
 
-#ifndef SANITIZER_GO
-uptr GetListOfModules(LoadedModule *modules, uptr max_modules,
-                      string_predicate_t filter) {
+void ListOfModules::init() {
+  clear();
   HANDLE cur_process = GetCurrentProcess();
 
   // Query the list of modules.  Start by assuming there are no more than 256
@@ -393,10 +480,8 @@
   }
 
   // |num_modules| is the number of modules actually present,
-  // |count| is the number of modules we return.
-  size_t nun_modules = bytes_required / sizeof(HMODULE),
-         count = 0;
-  for (size_t i = 0; i < nun_modules && count < max_modules; ++i) {
+  size_t num_modules = bytes_required / sizeof(HMODULE);
+  for (size_t i = 0; i < num_modules; ++i) {
     HMODULE handle = hmodules[i];
     MODULEINFO mi;
     if (!GetModuleInformation(cur_process, handle, &mi, sizeof(mi)))
@@ -414,9 +499,6 @@
                               &module_name[0], kMaxPathLength, NULL, NULL);
     module_name[module_name_len] = '\0';
 
-    if (filter && !filter(module_name))
-      continue;
-
     uptr base_address = (uptr)mi.lpBaseOfDll;
     uptr end_address = (uptr)mi.lpBaseOfDll + mi.SizeOfImage;
 
@@ -427,15 +509,13 @@
     uptr preferred_base = GetPreferredBase(&module_name[0]);
     uptr adjusted_base = base_address - preferred_base;
 
-    LoadedModule *cur_module = &modules[count];
-    cur_module->set(module_name, adjusted_base);
+    LoadedModule cur_module;
+    cur_module.set(module_name, adjusted_base);
     // We add the whole module as one single address range.
-    cur_module->addAddressRange(base_address, end_address, /*executable*/ true);
-    count++;
+    cur_module.addAddressRange(base_address, end_address, /*executable*/ true);
+    modules_.push_back(cur_module);
   }
   UnmapOrDie(hmodules, modules_buffer_size);
-
-  return count;
 };
 
 // We can't use atexit() directly at __asan_init time as the CRT is not fully
@@ -462,14 +542,15 @@
 
 // ------------------ sanitizer_libc.h
 fd_t OpenFile(const char *filename, FileAccessMode mode, error_t *last_error) {
+  // FIXME: Use the wide variants to handle Unicode filenames.
   fd_t res;
   if (mode == RdOnly) {
-    res = CreateFile(filename, GENERIC_READ,
-                     FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
-                     nullptr, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, nullptr);
+    res = CreateFileA(filename, GENERIC_READ,
+                      FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
+                      nullptr, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, nullptr);
   } else if (mode == WrOnly) {
-    res = CreateFile(filename, GENERIC_WRITE, 0, nullptr, CREATE_ALWAYS,
-                     FILE_ATTRIBUTE_NORMAL, nullptr);
+    res = CreateFileA(filename, GENERIC_WRITE, 0, nullptr, CREATE_ALWAYS,
+                      FILE_ATTRIBUTE_NORMAL, nullptr);
   } else {
     UNIMPLEMENTED();
   }
@@ -700,7 +781,7 @@
   // FIXME: Decide what to do on Windows.
 }
 
-bool IsDeadlySignal(int signum) {
+bool IsHandledDeadlySignal(int signum) {
   // FIXME: Decide what to do on Windows.
   return false;
 }
@@ -731,8 +812,8 @@
 }
 
 SignalContext SignalContext::Create(void *siginfo, void *context) {
-  EXCEPTION_RECORD *exception_record = (EXCEPTION_RECORD*)siginfo;
-  CONTEXT *context_record = (CONTEXT*)context;
+  EXCEPTION_RECORD *exception_record = (EXCEPTION_RECORD *)siginfo;
+  CONTEXT *context_record = (CONTEXT *)context;
 
   uptr pc = (uptr)exception_record->ExceptionAddress;
 #ifdef _WIN64
@@ -744,7 +825,19 @@
 #endif
   uptr access_addr = exception_record->ExceptionInformation[1];
 
-  return SignalContext(context, access_addr, pc, sp, bp);
+  // The contents of this array are documented at
+  // https://msdn.microsoft.com/en-us/library/windows/desktop/aa363082(v=vs.85).aspx
+  // The first element indicates read as 0, write as 1, or execute as 8.  The
+  // second element is the faulting address.
+  WriteFlag write_flag = SignalContext::UNKNOWN;
+  switch (exception_record->ExceptionInformation[0]) {
+  case 0: write_flag = SignalContext::READ; break;
+  case 1: write_flag = SignalContext::WRITE; break;
+  case 8: write_flag = SignalContext::UNKNOWN; break;
+  }
+  bool is_memory_access = write_flag != SignalContext::UNKNOWN;
+  return SignalContext(context, access_addr, pc, sp, bp, is_memory_access,
+                       write_flag);
 }
 
 uptr ReadBinaryName(/*out*/char *buf, uptr buf_len) {
@@ -762,14 +855,31 @@
   // Do nothing.
 }
 
-void DisableReexec() {
-  // No need to re-exec on Windows.
-}
-
 void MaybeReexec() {
   // No need to re-exec on Windows.
 }
 
+char **GetArgv() {
+  // FIXME: Actually implement this function.
+  return 0;
+}
+
+pid_t StartSubprocess(const char *program, const char *const argv[],
+                      fd_t stdin_fd, fd_t stdout_fd, fd_t stderr_fd) {
+  // FIXME: implement on this platform
+  // Should be implemented based on
+  // SymbolizerProcess::StarAtSymbolizerSubprocess
+  // from lib/sanitizer_common/sanitizer_symbolizer_win.cc.
+  return -1;
+}
+
+bool IsProcessRunning(pid_t pid) {
+  // FIXME: implement on this platform.
+  return false;
+}
+
+int WaitForProcess(pid_t pid) { return -1; }
+
 }  // namespace __sanitizer
 
 #endif  // _WIN32
diff --git a/lib/sanitizer_common/scripts/sancov.py b/lib/sanitizer_common/scripts/sancov.py
index a5ae957..e19afdb 100755
--- a/lib/sanitizer_common/scripts/sancov.py
+++ b/lib/sanitizer_common/scripts/sancov.py
@@ -30,6 +30,10 @@
   CheckBits(bits)
   return 'L' if bits == 64 else 'I'
 
+def TypeCodeForStruct(bits):
+  CheckBits(bits)
+  return 'Q' if bits == 64 else 'I'
+
 kMagic32SecondHalf = 0xFFFFFF32;
 kMagic64SecondHalf = 0xFFFFFF64;
 kMagicFirstHalf    = 0xC0BFFFFF;
@@ -64,7 +68,7 @@
       raise Exception('File %s is short (< 8 bytes)' % path)
     bits = ReadMagicAndReturnBitness(f, path)
     size -= 8
-    s = array.array(TypeCodeForBits(bits), f.read(size))
+    s = struct.unpack_from(TypeCodeForStruct(bits) * (size * 8 / bits), f.read(size))
   print >>sys.stderr, "%s: read %d %d-bit PCs from %s" % (prog_name, size * 8 / bits, bits, path)
   return s
 
@@ -94,8 +98,8 @@
   if max(s) > 0xFFFFFFFF:
     bits = 64
   array.array('I', MagicForBits(bits)).tofile(sys.stdout)
-  a = array.array(TypeCodeForBits(bits), s)
-  a.tofile(sys.stdout)
+  a = struct.pack(TypeCodeForStruct(bits) * len(s), *s)
+  sys.stdout.write(a)
 
 
 def UnpackOneFile(path):
@@ -148,7 +152,7 @@
     f.seek(0, 2)
     size = f.tell()
     f.seek(0, 0)
-    pcs = array.array(TypeCodeForBits(bits), f.read(size))
+    pcs = struct.unpack_from(TypeCodeForStruct(bits) * (size * 8 / bits), f.read(size))
     mem_map_pcs = [[] for i in range(0, len(mem_map))]
 
     for pc in pcs:
@@ -166,11 +170,12 @@
       assert path.endswith('.sancov.raw')
       dst_path = module_path + '.' + os.path.basename(path)[:-4]
       print >> sys.stderr, "%s: writing %d PCs to %s" % (prog_name, len(pc_list), dst_path)
-      arr = array.array(TypeCodeForBits(bits))
-      arr.fromlist(sorted(pc_list))
-      with open(dst_path, 'ab') as f2:
+      sorted_pc_list = sorted(pc_list)
+      pc_buffer = struct.pack(TypeCodeForStruct(bits) * len(pc_list), *sorted_pc_list)
+      with open(dst_path, 'ab+') as f2:
         array.array('I', MagicForBits(bits)).tofile(f2)
-        arr.tofile(f2)
+        f2.seek(0, 2)
+        f2.write(pc_buffer)
 
 def RawUnpack(files):
   for f in files:
diff --git a/lib/sanitizer_common/tests/CMakeLists.txt b/lib/sanitizer_common/tests/CMakeLists.txt
index 18b7636..0a828dc 100644
--- a/lib/sanitizer_common/tests/CMakeLists.txt
+++ b/lib/sanitizer_common/tests/CMakeLists.txt
@@ -44,7 +44,7 @@
 endforeach()
 
 set(SANITIZER_TEST_CFLAGS_COMMON
-  ${COMPILER_RT_TEST_CFLAGS}
+  ${COMPILER_RT_UNITTEST_CFLAGS}
   ${COMPILER_RT_GTEST_CFLAGS}
   -I${COMPILER_RT_SOURCE_DIR}/include
   -I${COMPILER_RT_SOURCE_DIR}/lib
@@ -65,6 +65,10 @@
 else()
   list(APPEND SANITIZER_TEST_CFLAGS_COMMON -g)
 endif()
+if(MSVC)
+  list(APPEND SANITIZER_TEST_CFLAGS_COMMON -gcodeview)
+endif()
+list(APPEND SANITIZER_TEST_LINK_FLAGS_COMMON -g)
 
 if(NOT MSVC)
   list(APPEND SANITIZER_TEST_LINK_FLAGS_COMMON --driver-mode=g++)
@@ -74,6 +78,13 @@
   list(APPEND SANITIZER_TEST_LINK_FLAGS_COMMON -pie)
 endif()
 
+# MSVC linker is allocating 1M for the stack by default, which is not
+# enough for the unittests. Some unittests require more than 2M.
+# The default stack size for clang is 8M.
+if(MSVC)
+  list(APPEND SANITIZER_TEST_LINK_FLAGS_COMMON -Wl,/STACK:0xC00000)
+endif()
+
 set(SANITIZER_TEST_LINK_LIBS)
 append_list_if(COMPILER_RT_HAS_LIBLOG log SANITIZER_TEST_LINK_LIBS)
 # NDK r10 requires -latomic almost always.
@@ -96,7 +107,8 @@
 macro(add_sanitizer_common_lib library)
   add_library(${library} STATIC ${ARGN})
   set_target_properties(${library} PROPERTIES
-    ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+    ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+    FOLDER "Compiler-RT Runtime tests")
 endmacro()
 
 function(get_sanitizer_common_lib_for_arch arch lib lib_name)
@@ -106,17 +118,21 @@
     set(tgt_name "RTSanitizerCommon.test.${arch}")
   endif()
   set(${lib} "${tgt_name}" PARENT_SCOPE)
-  if(NOT MSVC)
-    set(${lib_name} "lib${tgt_name}.a" PARENT_SCOPE)
+  if(CMAKE_CONFIGURATION_TYPES)
+   set(configuration_path "${CMAKE_CFG_INTDIR}/")
   else()
-    set(${lib_name} "${tgt_name}.lib" PARENT_SCOPE)
+   set(configuration_path "")
+  endif()
+  if(NOT MSVC)
+    set(${lib_name} "${configuration_path}lib${tgt_name}.a" PARENT_SCOPE)
+  else()
+    set(${lib_name} "${configuration_path}${tgt_name}.lib" PARENT_SCOPE)
   endif()
 endfunction()
 
 # Sanitizer_common unit tests testsuite.
 add_custom_target(SanitizerUnitTests)
-set_target_properties(SanitizerUnitTests PROPERTIES
-  FOLDER "Sanitizer unittests")
+set_target_properties(SanitizerUnitTests PROPERTIES FOLDER "Compiler-RT Tests")
 
 # Adds sanitizer tests for architecture.
 macro(add_sanitizer_tests_for_arch arch)
@@ -130,7 +146,11 @@
   set(SANITIZER_TEST_OBJECTS)
   foreach(source ${SANITIZER_TEST_SOURCES})
     get_filename_component(basename ${source} NAME)
-    set(output_obj "${basename}.${arch}.o")
+    if(CMAKE_CONFIGURATION_TYPES)
+      set(output_obj "${CMAKE_CFG_INTDIR}/${basename}.${arch}.o")
+    else()
+      set(output_obj "${basename}.${arch}.o")
+    endif()
     clang_compile(${output_obj} ${source}
                   CFLAGS ${SANITIZER_TEST_CFLAGS_COMMON} ${TARGET_FLAGS}
                   DEPS ${SANITIZER_TEST_COMPILE_DEPS})
diff --git a/lib/sanitizer_common/tests/sanitizer_allocator_test.cc b/lib/sanitizer_common/tests/sanitizer_allocator_test.cc
index 7ba3345..31eec19 100644
--- a/lib/sanitizer_common/tests/sanitizer_allocator_test.cc
+++ b/lib/sanitizer_common/tests/sanitizer_allocator_test.cc
@@ -29,9 +29,15 @@
 #if !SANITIZER_DEBUG
 
 #if SANITIZER_CAN_USE_ALLOCATOR64
+#if SANITIZER_WINDOWS
+static const uptr kAllocatorSpace = 0x10000000000ULL;
+static const uptr kAllocatorSize  =  0x10000000000ULL;  // 1T.
+static const u64 kAddressSpaceSize = 1ULL << 40;
+#else
 static const uptr kAllocatorSpace = 0x700000000000ULL;
 static const uptr kAllocatorSize  = 0x010000000000ULL;  // 1T.
 static const u64 kAddressSpaceSize = 1ULL << 47;
+#endif
 
 typedef SizeClassAllocator64<
   kAllocatorSpace, kAllocatorSize, 16, DefaultSizeClassMap> Allocator64;
@@ -42,6 +48,10 @@
 static const u64 kAddressSpaceSize = 1ULL << 40;
 #elif defined(__aarch64__)
 static const u64 kAddressSpaceSize = 1ULL << 39;
+#elif defined(__s390x__)
+static const u64 kAddressSpaceSize = 1ULL << 53;
+#elif defined(__s390__)
+static const u64 kAddressSpaceSize = 1ULL << 31;
 #else
 static const u64 kAddressSpaceSize = 1ULL << 32;
 #endif
@@ -331,7 +341,6 @@
 }
 #endif
 
-#if !defined(_WIN32)  // FIXME: This currently fails on Windows.
 TEST(SanitizerCommon, LargeMmapAllocator) {
   LargeMmapAllocator<> a;
   a.Init(/* may_return_null */ false);
@@ -407,7 +416,6 @@
   CHECK_NE(p, (char *)a.GetBlockBegin(p + page_size));
   a.Deallocate(&stats, p);
 }
-#endif
 
 template
 <class PrimaryAllocator, class SecondaryAllocator, class AllocatorCache>
@@ -479,13 +487,11 @@
 }
 #endif
 
-#if !defined(_WIN32)  // FIXME: This currently fails on Windows.
 TEST(SanitizerCommon, CombinedAllocator32Compact) {
   TestCombinedAllocator<Allocator32Compact,
       LargeMmapAllocator<>,
       SizeClassAllocatorLocalCache<Allocator32Compact> > ();
 }
-#endif
 
 template <class AllocatorCache>
 void TestSizeClassAllocatorLocalCache() {
@@ -601,6 +607,8 @@
   pthread_t t;
   PTHREAD_CREATE(&t, 0, DeallocNewThreadWorker, params);
   PTHREAD_JOIN(t, 0);
+
+  allocator.TestOnlyUnmap();
 }
 #endif
 
diff --git a/lib/sanitizer_common/tests/sanitizer_allocator_testlib.cc b/lib/sanitizer_common/tests/sanitizer_allocator_testlib.cc
index 0cc3b9b..038d9c5 100644
--- a/lib/sanitizer_common/tests/sanitizer_allocator_testlib.cc
+++ b/lib/sanitizer_common/tests/sanitizer_allocator_testlib.cc
@@ -11,9 +11,10 @@
 // for CombinedAllocator.
 //===----------------------------------------------------------------------===//
 /* Usage:
-clang++ -fno-exceptions  -g -fPIC -I. -I../include -Isanitizer \
+clang++ -std=c++11 -fno-exceptions  -g -fPIC -I. -I../include -Isanitizer \
  sanitizer_common/tests/sanitizer_allocator_testlib.cc \
- sanitizer_common/sanitizer_*.cc -shared -lpthread -o testmalloc.so
+ $(\ls sanitizer_common/sanitizer_*.cc | grep -v sanitizer_common_nolibc.cc) \
+ -shared -lpthread -o testmalloc.so
 LD_PRELOAD=`pwd`/testmalloc.so /your/app
 */
 #include "sanitizer_common/sanitizer_allocator.h"
@@ -36,7 +37,8 @@
 static const uptr kAllocatorSpace = 0x600000000000ULL;
 static const uptr kAllocatorSize  =  0x10000000000ULL;  // 1T.
 
-typedef SizeClassAllocator64<kAllocatorSpace, kAllocatorSize, 0,
+// typedef SizeClassAllocator64<kAllocatorSpace, kAllocatorSize, 0,
+typedef SizeClassAllocator64<~(uptr)0, kAllocatorSize, 0,
   CompactSizeClassMap> PrimaryAllocator;
 typedef SizeClassAllocatorLocalCache<PrimaryAllocator> AllocatorCache;
 typedef LargeMmapAllocator<> SecondaryAllocator;
@@ -60,12 +62,12 @@
 static void NOINLINE thread_init() {
   if (!global_inited) {
     global_inited = true;
-    allocator.Init();
+    allocator.Init(false /*may_return_null*/);
     pthread_key_create(&pkey, thread_dtor);
   }
   thread_inited = true;
   pthread_setspecific(pkey, (void*)1);
-  cache.Init();
+  cache.Init(nullptr);
 }
 }  // namespace
 
diff --git a/lib/sanitizer_common/tests/sanitizer_flags_test.cc b/lib/sanitizer_common/tests/sanitizer_flags_test.cc
index 3e5d838..24a3f3d 100644
--- a/lib/sanitizer_common/tests/sanitizer_flags_test.cc
+++ b/lib/sanitizer_common/tests/sanitizer_flags_test.cc
@@ -47,6 +47,9 @@
   parser.ParseString(env);
 
   EXPECT_EQ(0, internal_strcmp(final_value, flag));
+
+  // Reporting unrecognized flags is needed to reset them.
+  ReportUnrecognizedFlags();
 }
 
 TEST(SanitizerCommon, BooleanFlags) {
@@ -97,6 +100,9 @@
 
   EXPECT_EQ(expected_flag1, flag1);
   EXPECT_EQ(0, internal_strcmp(flag2, expected_flag2));
+
+  // Reporting unrecognized flags is needed to reset them.
+  ReportUnrecognizedFlags();
 }
 
 TEST(SanitizerCommon, MultipleFlags) {
diff --git a/lib/sanitizer_common/tests/sanitizer_ioctl_test.cc b/lib/sanitizer_common/tests/sanitizer_ioctl_test.cc
index 22fa522..6e2a20b 100644
--- a/lib/sanitizer_common/tests/sanitizer_ioctl_test.cc
+++ b/lib/sanitizer_common/tests/sanitizer_ioctl_test.cc
@@ -78,7 +78,8 @@
 // Test decoding KVM ioctl numbers.
 TEST(SanitizerIoctl, KVM_GET_MP_STATE) {
   ioctl_desc desc;
-  bool res = ioctl_decode(0x8004ae98U, &desc);
+  unsigned int desc_value = SANITIZER_MIPS ? 0x4004ae98U : 0x8004ae98U;
+  bool res = ioctl_decode(desc_value, &desc);
   EXPECT_TRUE(res);
   EXPECT_EQ(ioctl_desc::WRITE, desc.type);
   EXPECT_EQ(4U, desc.size);
@@ -86,7 +87,8 @@
 
 TEST(SanitizerIoctl, KVM_GET_LAPIC) {
   ioctl_desc desc;
-  bool res = ioctl_decode(0x8400ae8eU, &desc);
+  unsigned int desc_value = SANITIZER_MIPS ? 0x4400ae8eU : 0x8400ae8eU;
+  bool res = ioctl_decode(desc_value, &desc);
   EXPECT_TRUE(res);
   EXPECT_EQ(ioctl_desc::WRITE, desc.type);
   EXPECT_EQ(1024U, desc.size);
diff --git a/lib/sanitizer_common/tests/sanitizer_linux_test.cc b/lib/sanitizer_common/tests/sanitizer_linux_test.cc
index eef7101..fb6b109 100644
--- a/lib/sanitizer_common/tests/sanitizer_linux_test.cc
+++ b/lib/sanitizer_common/tests/sanitizer_linux_test.cc
@@ -263,6 +263,41 @@
 }
 #endif
 
+TEST(SanitizerCommon, StartSubprocessTest) {
+  int pipe_fds[2];
+  ASSERT_EQ(0, pipe(pipe_fds));
+#if SANITIZER_ANDROID
+  const char *shell = "/system/bin/sh";
+#else
+  const char *shell = "/bin/sh";
+#endif
+  const char *argv[] = {shell, "-c", "echo -n 'hello'", (char *)NULL};
+  int pid = StartSubprocess(shell, argv,
+                            /* stdin */ kInvalidFd, /* stdout */ pipe_fds[1]);
+  ASSERT_GT(pid, 0);
+
+  // wait for process to finish.
+  while (IsProcessRunning(pid)) {
+  }
+  ASSERT_FALSE(IsProcessRunning(pid));
+
+  char buffer[256];
+  {
+    char *ptr = buffer;
+    uptr bytes_read;
+    while (ReadFromFile(pipe_fds[0], ptr, 256, &bytes_read)) {
+      if (!bytes_read) {
+        break;
+      }
+      ptr += bytes_read;
+    }
+    ASSERT_EQ(5, ptr - buffer);
+    *ptr = 0;
+  }
+  ASSERT_EQ(0, strcmp(buffer, "hello")) << "Buffer: " << buffer;
+  internal_close(pipe_fds[0]);
+}
+
 }  // namespace __sanitizer
 
 #endif  // SANITIZER_LINUX
diff --git a/lib/sanitizer_common/tests/sanitizer_posix_test.cc b/lib/sanitizer_common/tests/sanitizer_posix_test.cc
index 03ca449..b7cca83 100644
--- a/lib/sanitizer_common/tests/sanitizer_posix_test.cc
+++ b/lib/sanitizer_common/tests/sanitizer_posix_test.cc
@@ -56,6 +56,7 @@
   EXPECT_TRUE(destructor_executed);
   SpawnThread(GetPthreadDestructorIterations() + 1);
   EXPECT_FALSE(destructor_executed);
+  ASSERT_EQ(0, pthread_key_delete(key));
 }
 
 TEST(SanitizerCommon, IsAccessibleMemoryRange) {
diff --git a/lib/sanitizer_common/tests/sanitizer_printf_test.cc b/lib/sanitizer_common/tests/sanitizer_printf_test.cc
index 5e39e0a..5a77b47 100644
--- a/lib/sanitizer_common/tests/sanitizer_printf_test.cc
+++ b/lib/sanitizer_common/tests/sanitizer_printf_test.cc
@@ -23,9 +23,9 @@
   char buf[1024];
   uptr len = internal_snprintf(buf, sizeof(buf),
       "a%db%zdc%ue%zuf%xh%zxq%pe%sr",
-      (int)-1, (long)-2, // NOLINT
-      (unsigned)-4, (unsigned long)5, // NOLINT
-      (unsigned)10, (unsigned long)11, // NOLINT
+      (int)-1, (uptr)-2, // NOLINT
+      (unsigned)-4, (uptr)5, // NOLINT
+      (unsigned)10, (uptr)11, // NOLINT
       (void*)0x123, "_string_");
   EXPECT_EQ(len, strlen(buf));
 
diff --git a/lib/sanitizer_common/tests/sanitizer_procmaps_test.cc b/lib/sanitizer_common/tests/sanitizer_procmaps_test.cc
index 12bc9e1..ae7c5d5 100644
--- a/lib/sanitizer_common/tests/sanitizer_procmaps_test.cc
+++ b/lib/sanitizer_common/tests/sanitizer_procmaps_test.cc
@@ -37,11 +37,11 @@
   const char *binary_name = last_slash ? last_slash + 1 : argv0;
   MemoryMappingLayout memory_mapping(false);
   const uptr kMaxModules = 100;
-  LoadedModule modules[kMaxModules];
-  uptr n_modules = memory_mapping.DumpListOfModules(modules, kMaxModules, 0);
-  EXPECT_GT(n_modules, 0U);
+  InternalMmapVector<LoadedModule> modules(kMaxModules);
+  memory_mapping.DumpListOfModules(&modules);
+  EXPECT_GT(modules.size(), 0U);
   bool found = false;
-  for (uptr i = 0; i < n_modules; ++i) {
+  for (uptr i = 0; i < modules.size(); ++i) {
     if (modules[i].containsAddress((uptr)&noop)) {
       // Verify that the module name is sane.
       if (strstr(modules[i].full_name(), binary_name) != 0)
diff --git a/lib/sanitizer_common/tests/sanitizer_pthread_wrappers.h b/lib/sanitizer_common/tests/sanitizer_pthread_wrappers.h
index 47b0f97..b7d784c 100644
--- a/lib/sanitizer_common/tests/sanitizer_pthread_wrappers.h
+++ b/lib/sanitizer_common/tests/sanitizer_pthread_wrappers.h
@@ -48,7 +48,9 @@
   data->start_routine = start_routine;
   data->arg = arg;
   *thread = CreateThread(0, 0, PthreadHelperThreadProc, data, 0, 0);
-  ASSERT_NE(nullptr, *thread) << "Failed to create a thread.";
+  DWORD err = GetLastError();
+  ASSERT_NE(nullptr, *thread) << "Failed to create a thread, got error 0x"
+                              << std::hex << err;
 }
 
 inline void PTHREAD_JOIN(pthread_t thread, void **value_ptr) {
diff --git a/lib/sanitizer_common/tests/sanitizer_stacktrace_test.cc b/lib/sanitizer_common/tests/sanitizer_stacktrace_test.cc
index 3d57ede..ba9f4fd 100644
--- a/lib/sanitizer_common/tests/sanitizer_stacktrace_test.cc
+++ b/lib/sanitizer_common/tests/sanitizer_stacktrace_test.cc
@@ -136,6 +136,19 @@
   EXPECT_EQ(PC(1), trace.trace[1]);
 }
 
+TEST_F(FastUnwindTest, CloseToZeroFrame) {
+  // Make one pc a NULL pointer.
+  fake_stack[5] = 0x0;
+  if (!TryFastUnwind(kStackTraceMax))
+    return;
+  // The stack should be truncated at the NULL pointer (and not include it).
+  EXPECT_EQ(3U, trace.size);
+  EXPECT_EQ(start_pc, trace.trace[0]);
+  for (uptr i = 1; i < 3U; i++) {
+    EXPECT_EQ(PC(i*2 - 1), trace.trace[i]);
+  }
+}
+
 TEST(SlowUnwindTest, ShortStackTrace) {
   if (StackTrace::WillUseFastUnwind(false))
     return;
diff --git a/lib/sanitizer_common/tests/sanitizer_symbolizer_test.cc b/lib/sanitizer_common/tests/sanitizer_symbolizer_test.cc
index 429ac59..3d5678a 100644
--- a/lib/sanitizer_common/tests/sanitizer_symbolizer_test.cc
+++ b/lib/sanitizer_common/tests/sanitizer_symbolizer_test.cc
@@ -55,4 +55,16 @@
   InternalFree(token);
 }
 
+#if !SANITIZER_WINDOWS
+TEST(Symbolizer, DemangleSwiftAndCXX) {
+  // Swift names are not demangled in default llvm build because Swift
+  // runtime is not linked in.
+  EXPECT_STREQ("_TtSd", DemangleSwiftAndCXX("_TtSd"));
+  // Check that the rest demangles properly.
+  EXPECT_STREQ("f1(char*, int)", DemangleSwiftAndCXX("_Z2f1Pci"));
+  EXPECT_STREQ("foo", DemangleSwiftAndCXX("foo"));
+  EXPECT_STREQ("", DemangleSwiftAndCXX(""));
+}
+#endif
+
 }  // namespace __sanitizer
diff --git a/lib/sanitizer_common/tests/sanitizer_thread_registry_test.cc b/lib/sanitizer_common/tests/sanitizer_thread_registry_test.cc
index 58c627a..1132bfd 100644
--- a/lib/sanitizer_common/tests/sanitizer_thread_registry_test.cc
+++ b/lib/sanitizer_common/tests/sanitizer_thread_registry_test.cc
@@ -224,6 +224,10 @@
 }
 
 TEST(SanitizerCommon, ThreadRegistryThreadedTest) {
+  memset(&num_created, 0, sizeof(num_created));
+  memset(&num_started, 0, sizeof(num_created));
+  memset(&num_joined, 0, sizeof(num_created));
+
   ThreadRegistry registry(GetThreadContext<TestThreadContext>,
                           kThreadsPerShard * kNumShards + 1, 10);
   ThreadedTestRegistry(&registry);
diff --git a/lib/scudo/CMakeLists.txt b/lib/scudo/CMakeLists.txt
new file mode 100644
index 0000000..6cbb85f
--- /dev/null
+++ b/lib/scudo/CMakeLists.txt
@@ -0,0 +1,33 @@
+add_custom_target(scudo)
+set_target_properties(scudo PROPERTIES FOLDER "Compiler-RT Misc")
+
+include_directories(..)
+
+set(SCUDO_CFLAGS ${SANITIZER_COMMON_CFLAGS})
+append_rtti_flag(OFF SCUDO_CFLAGS)
+list(APPEND SCUDO_CFLAGS -msse4.2 -mcx16)
+
+set(SCUDO_SOURCES
+  scudo_allocator.cpp
+  scudo_flags.cpp
+  scudo_interceptors.cpp
+  scudo_new_delete.cpp
+  scudo_termination.cpp
+  scudo_utils.cpp)
+
+if(COMPILER_RT_HAS_SCUDO)
+  foreach(arch ${SCUDO_SUPPORTED_ARCH})
+    add_compiler_rt_runtime(clang_rt.scudo
+      STATIC
+      ARCHS ${arch}
+      SOURCES ${SCUDO_SOURCES}
+              $<TARGET_OBJECTS:RTInterception.${arch}>
+              $<TARGET_OBJECTS:RTSanitizerCommonNoTermination.${arch}>
+              $<TARGET_OBJECTS:RTSanitizerCommonLibc.${arch}>
+      CFLAGS ${SCUDO_CFLAGS}
+      PARENT_TARGET scudo)
+  endforeach()
+endif()
+
+add_dependencies(compiler-rt scudo)
+
diff --git a/lib/scudo/scudo_allocator.cpp b/lib/scudo/scudo_allocator.cpp
new file mode 100644
index 0000000..3ad499a
--- /dev/null
+++ b/lib/scudo/scudo_allocator.cpp
@@ -0,0 +1,635 @@
+//===-- scudo_allocator.cpp -------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// Scudo Hardened Allocator implementation.
+/// It uses the sanitizer_common allocator as a base and aims at mitigating
+/// heap corruption vulnerabilities. It provides a checksum-guarded chunk
+/// header, a delayed free list, and additional sanity checks.
+///
+//===----------------------------------------------------------------------===//
+
+#include "scudo_allocator.h"
+#include "scudo_utils.h"
+
+#include "sanitizer_common/sanitizer_allocator_interface.h"
+#include "sanitizer_common/sanitizer_quarantine.h"
+
+#include <limits.h>
+#include <pthread.h>
+#include <smmintrin.h>
+
+#include <atomic>
+#include <cstring>
+
+namespace __scudo {
+
+const uptr AllocatorSpace = ~0ULL;
+const uptr AllocatorSize  =  0x10000000000ULL;
+const uptr MinAlignmentLog = 4; // 16 bytes for x64
+const uptr MaxAlignmentLog = 24;
+
+typedef DefaultSizeClassMap SizeClassMap;
+typedef SizeClassAllocator64<AllocatorSpace, AllocatorSize, 0, SizeClassMap>
+  PrimaryAllocator;
+typedef SizeClassAllocatorLocalCache<PrimaryAllocator> AllocatorCache;
+typedef LargeMmapAllocator<> SecondaryAllocator;
+typedef CombinedAllocator<PrimaryAllocator, AllocatorCache, SecondaryAllocator>
+  ScudoAllocator;
+
+static ScudoAllocator &getAllocator();
+
+static thread_local Xorshift128Plus Prng;
+// Global static cookie, initialized at start-up.
+static u64 Cookie;
+
+enum ChunkState : u8 {
+  ChunkAvailable  = 0,
+  ChunkAllocated  = 1,
+  ChunkQuarantine = 2
+};
+
+typedef unsigned __int128 PackedHeader;
+typedef std::atomic<PackedHeader> AtomicPackedHeader;
+
+// Our header requires 128-bit of storage on x64 (the only platform supported
+// as of now), which fits nicely with the alignment requirements.
+// Having the offset saves us from using functions such as GetBlockBegin, that
+// is fairly costly. Our first implementation used the MetaData as well, which
+// offers the advantage of being stored away from the chunk itself, but
+// accessing it was costly as well.
+// The header will be atomically loaded and stored using the 16-byte primitives
+// offered by the platform (likely requires cmpxchg16b support).
+struct UnpackedHeader {
+  // 1st 8 bytes
+  u16 Checksum      : 16;
+  u64 RequestedSize : 40; // Needed for reallocation purposes.
+  u8  State         : 2;  // available, allocated, or quarantined
+  u8  AllocType     : 2;  // malloc, new, new[], or memalign
+  u8  Unused_0_     : 4;
+  // 2nd 8 bytes
+  u64 Offset        : 20; // Offset from the beginning of the backend
+                          // allocation to the beginning chunk itself, in
+                          // multiples of MinAlignment. See comment about its
+                          // maximum value and test in Initialize.
+  u64 Unused_1_     : 28;
+  u16 Salt          : 16;
+};
+
+COMPILER_CHECK(sizeof(UnpackedHeader) == sizeof(PackedHeader));
+
+const uptr ChunkHeaderSize = sizeof(PackedHeader);
+
+struct ScudoChunk : UnpackedHeader {
+  // We can't use the offset member of the chunk itself, as we would double
+  // fetch it without any warranty that it wouldn't have been tampered. To
+  // prevent this, we work with a local copy of the header.
+  void *AllocBeg(UnpackedHeader *Header) {
+    return reinterpret_cast<void *>(
+        reinterpret_cast<uptr>(this) - (Header->Offset << MinAlignmentLog));
+  }
+
+  // CRC32 checksum of the Chunk pointer and its ChunkHeader.
+  // It currently uses the Intel Nehalem SSE4.2 crc32 64-bit instruction.
+  u16 Checksum(UnpackedHeader *Header) const {
+    u64 HeaderHolder[2];
+    memcpy(HeaderHolder, Header, sizeof(HeaderHolder));
+    u64 Crc = _mm_crc32_u64(Cookie, reinterpret_cast<uptr>(this));
+    // This is somewhat of a shortcut. The checksum is stored in the 16 least
+    // significant bits of the first 8 bytes of the header, hence zero-ing
+    // those bits out. It would be more valid to zero the checksum field of the
+    // UnpackedHeader, but would require holding an additional copy of it.
+    Crc = _mm_crc32_u64(Crc, HeaderHolder[0] & 0xffffffffffff0000ULL);
+    Crc = _mm_crc32_u64(Crc, HeaderHolder[1]);
+    return static_cast<u16>(Crc);
+  }
+
+  // Loads and unpacks the header, verifying the checksum in the process.
+  void loadHeader(UnpackedHeader *NewUnpackedHeader) const {
+    const AtomicPackedHeader *AtomicHeader =
+        reinterpret_cast<const AtomicPackedHeader *>(this);
+    PackedHeader NewPackedHeader =
+        AtomicHeader->load(std::memory_order_relaxed);
+    *NewUnpackedHeader = bit_cast<UnpackedHeader>(NewPackedHeader);
+    if ((NewUnpackedHeader->Unused_0_ != 0) ||
+        (NewUnpackedHeader->Unused_1_ != 0) ||
+        (NewUnpackedHeader->Checksum != Checksum(NewUnpackedHeader))) {
+      dieWithMessage("ERROR: corrupted chunk header at address %p\n", this);
+    }
+  }
+
+  // Packs and stores the header, computing the checksum in the process.
+  void storeHeader(UnpackedHeader *NewUnpackedHeader) {
+    NewUnpackedHeader->Checksum = Checksum(NewUnpackedHeader);
+    PackedHeader NewPackedHeader = bit_cast<PackedHeader>(*NewUnpackedHeader);
+    AtomicPackedHeader *AtomicHeader =
+        reinterpret_cast<AtomicPackedHeader *>(this);
+    AtomicHeader->store(NewPackedHeader, std::memory_order_relaxed);
+  }
+
+  // Packs and stores the header, computing the checksum in the process. We
+  // compare the current header with the expected provided one to ensure that
+  // we are not being raced by a corruption occurring in another thread.
+  void compareExchangeHeader(UnpackedHeader *NewUnpackedHeader,
+                             UnpackedHeader *OldUnpackedHeader) {
+    NewUnpackedHeader->Checksum = Checksum(NewUnpackedHeader);
+    PackedHeader NewPackedHeader = bit_cast<PackedHeader>(*NewUnpackedHeader);
+    PackedHeader OldPackedHeader = bit_cast<PackedHeader>(*OldUnpackedHeader);
+    AtomicPackedHeader *AtomicHeader =
+        reinterpret_cast<AtomicPackedHeader *>(this);
+    if (!AtomicHeader->compare_exchange_strong(OldPackedHeader,
+                                               NewPackedHeader,
+                                               std::memory_order_relaxed,
+                                               std::memory_order_relaxed)) {
+      dieWithMessage("ERROR: race on chunk header at address %p\n", this);
+    }
+  }
+};
+
+static bool ScudoInitIsRunning = false;
+
+static pthread_once_t GlobalInited = PTHREAD_ONCE_INIT;
+static pthread_key_t pkey;
+
+static thread_local bool ThreadInited = false;
+static thread_local bool ThreadTornDown = false;
+static thread_local AllocatorCache Cache;
+
+static void teardownThread(void *p) {
+  uptr v = reinterpret_cast<uptr>(p);
+  // The glibc POSIX thread-local-storage deallocation routine calls user
+  // provided destructors in a loop of PTHREAD_DESTRUCTOR_ITERATIONS.
+  // We want to be called last since other destructors might call free and the
+  // like, so we wait until PTHREAD_DESTRUCTOR_ITERATIONS before draining the
+  // quarantine and swallowing the cache.
+  if (v < PTHREAD_DESTRUCTOR_ITERATIONS) {
+    pthread_setspecific(pkey, reinterpret_cast<void *>(v + 1));
+    return;
+  }
+  drainQuarantine();
+  getAllocator().DestroyCache(&Cache);
+  ThreadTornDown = true;
+}
+
+static void initInternal() {
+  SanitizerToolName = "Scudo";
+  CHECK(!ScudoInitIsRunning && "Scudo init calls itself!");
+  ScudoInitIsRunning = true;
+
+  initFlags();
+
+  AllocatorOptions Options;
+  Options.setFrom(getFlags(), common_flags());
+  initAllocator(Options);
+
+  ScudoInitIsRunning = false;
+}
+
+static void initGlobal() {
+  pthread_key_create(&pkey, teardownThread);
+  initInternal();
+}
+
+static void NOINLINE initThread() {
+  pthread_once(&GlobalInited, initGlobal);
+  pthread_setspecific(pkey, reinterpret_cast<void *>(1));
+  getAllocator().InitCache(&Cache);
+  ThreadInited = true;
+}
+
+struct QuarantineCallback {
+  explicit QuarantineCallback(AllocatorCache *Cache)
+    : Cache_(Cache) {}
+
+  // Chunk recycling function, returns a quarantined chunk to the backend.
+  void Recycle(ScudoChunk *Chunk) {
+    UnpackedHeader Header;
+    Chunk->loadHeader(&Header);
+    if (Header.State != ChunkQuarantine) {
+      dieWithMessage("ERROR: invalid chunk state when recycling address %p\n",
+                     Chunk);
+    }
+    void *Ptr = Chunk->AllocBeg(&Header);
+    getAllocator().Deallocate(Cache_, Ptr);
+  }
+
+  /// Internal quarantine allocation and deallocation functions.
+  void *Allocate(uptr Size) {
+    // The internal quarantine memory cannot be protected by us. But the only
+    // structures allocated are QuarantineBatch, that are 8KB for x64. So we
+    // will use mmap for those, and given that Deallocate doesn't pass a size
+    // in, we enforce the size of the allocation to be sizeof(QuarantineBatch).
+    // TODO(kostyak): switching to mmap impacts greatly performances, we have
+    //                to find another solution
+    // CHECK_EQ(Size, sizeof(QuarantineBatch));
+    // return MmapOrDie(Size, "QuarantineBatch");
+    return getAllocator().Allocate(Cache_, Size, 1, false);
+  }
+
+  void Deallocate(void *Ptr) {
+    // UnmapOrDie(Ptr, sizeof(QuarantineBatch));
+    getAllocator().Deallocate(Cache_, Ptr);
+  }
+
+  AllocatorCache *Cache_;
+};
+
+typedef Quarantine<QuarantineCallback, ScudoChunk> ScudoQuarantine;
+typedef ScudoQuarantine::Cache QuarantineCache;
+static thread_local QuarantineCache ThreadQuarantineCache;
+
+void AllocatorOptions::setFrom(const Flags *f, const CommonFlags *cf) {
+  MayReturnNull = cf->allocator_may_return_null;
+  QuarantineSizeMb = f->QuarantineSizeMb;
+  ThreadLocalQuarantineSizeKb = f->ThreadLocalQuarantineSizeKb;
+  DeallocationTypeMismatch = f->DeallocationTypeMismatch;
+  DeleteSizeMismatch = f->DeleteSizeMismatch;
+  ZeroContents = f->ZeroContents;
+}
+
+void AllocatorOptions::copyTo(Flags *f, CommonFlags *cf) const {
+  cf->allocator_may_return_null = MayReturnNull;
+  f->QuarantineSizeMb = QuarantineSizeMb;
+  f->ThreadLocalQuarantineSizeKb = ThreadLocalQuarantineSizeKb;
+  f->DeallocationTypeMismatch = DeallocationTypeMismatch;
+  f->DeleteSizeMismatch = DeleteSizeMismatch;
+  f->ZeroContents = ZeroContents;
+}
+
+struct Allocator {
+  static const uptr MaxAllowedMallocSize = 1ULL << 40;
+  static const uptr MinAlignment = 1 << MinAlignmentLog;
+  static const uptr MaxAlignment = 1 << MaxAlignmentLog; // 16 MB
+
+  ScudoAllocator BackendAllocator;
+  ScudoQuarantine AllocatorQuarantine;
+
+  // The fallback caches are used when the thread local caches have been
+  // 'destroyed' on thread tear-down. They are protected by a Mutex as they can
+  // be accessed by different threads.
+  StaticSpinMutex FallbackMutex;
+  AllocatorCache FallbackAllocatorCache;
+  QuarantineCache FallbackQuarantineCache;
+
+  bool DeallocationTypeMismatch;
+  bool ZeroContents;
+  bool DeleteSizeMismatch;
+
+  explicit Allocator(LinkerInitialized)
+    : AllocatorQuarantine(LINKER_INITIALIZED),
+      FallbackQuarantineCache(LINKER_INITIALIZED) {}
+
+  void init(const AllocatorOptions &Options) {
+    // Currently SSE 4.2 support is required. This might change later.
+    CHECK(testCPUFeature(SSE4_2)); // for crc32
+
+    // Verify that the header offset field can hold the maximum offset. In the
+    // worst case scenario, the backend allocation is already aligned on
+    // MaxAlignment, so in order to store the header and still be aligned, we
+    // add an extra MaxAlignment. As a result, the offset from the beginning of
+    // the backend allocation to the chunk will be MaxAlignment -
+    // ChunkHeaderSize.
+    UnpackedHeader Header = {};
+    uptr MaximumOffset = (MaxAlignment - ChunkHeaderSize) >> MinAlignmentLog;
+    Header.Offset = MaximumOffset;
+    if (Header.Offset != MaximumOffset) {
+      dieWithMessage("ERROR: the maximum possible offset doesn't fit in the "
+                     "header\n");
+    }
+
+    DeallocationTypeMismatch = Options.DeallocationTypeMismatch;
+    DeleteSizeMismatch = Options.DeleteSizeMismatch;
+    ZeroContents = Options.ZeroContents;
+    BackendAllocator.Init(Options.MayReturnNull);
+    AllocatorQuarantine.Init(static_cast<uptr>(Options.QuarantineSizeMb) << 20,
+                             static_cast<uptr>(
+                                 Options.ThreadLocalQuarantineSizeKb) << 10);
+    BackendAllocator.InitCache(&FallbackAllocatorCache);
+    Cookie = Prng.Next();
+  }
+
+  // Allocates a chunk.
+  void *allocate(uptr Size, uptr Alignment, AllocType Type) {
+    if (UNLIKELY(!ThreadInited))
+      initThread();
+    if (!IsPowerOfTwo(Alignment)) {
+      dieWithMessage("ERROR: malloc alignment is not a power of 2\n");
+    }
+    if (Alignment > MaxAlignment)
+      return BackendAllocator.ReturnNullOrDie();
+    if (Alignment < MinAlignment)
+      Alignment = MinAlignment;
+    if (Size == 0)
+      Size = 1;
+    if (Size >= MaxAllowedMallocSize)
+      return BackendAllocator.ReturnNullOrDie();
+    uptr RoundedSize = RoundUpTo(Size, MinAlignment);
+    uptr ExtraBytes = ChunkHeaderSize;
+    if (Alignment > MinAlignment)
+      ExtraBytes += Alignment;
+    uptr NeededSize = RoundedSize + ExtraBytes;
+    if (NeededSize >= MaxAllowedMallocSize)
+      return BackendAllocator.ReturnNullOrDie();
+
+    void *Ptr;
+    if (LIKELY(!ThreadTornDown)) {
+      Ptr = BackendAllocator.Allocate(&Cache, NeededSize, MinAlignment);
+    } else {
+      SpinMutexLock l(&FallbackMutex);
+      Ptr = BackendAllocator.Allocate(&FallbackAllocatorCache, NeededSize,
+                               MinAlignment);
+    }
+    if (!Ptr)
+      return BackendAllocator.ReturnNullOrDie();
+
+    // If requested, we will zero out the entire contents of the returned chunk.
+    if (ZeroContents && BackendAllocator.FromPrimary(Ptr))
+       memset(Ptr, 0, BackendAllocator.GetActuallyAllocatedSize(Ptr));
+
+    uptr AllocBeg = reinterpret_cast<uptr>(Ptr);
+    uptr ChunkBeg = AllocBeg + ChunkHeaderSize;
+    if (!IsAligned(ChunkBeg, Alignment))
+      ChunkBeg = RoundUpTo(ChunkBeg, Alignment);
+    CHECK_LE(ChunkBeg + Size, AllocBeg + NeededSize);
+    ScudoChunk *Chunk =
+        reinterpret_cast<ScudoChunk *>(ChunkBeg - ChunkHeaderSize);
+    UnpackedHeader Header = {};
+    Header.State = ChunkAllocated;
+    Header.Offset = (ChunkBeg - ChunkHeaderSize - AllocBeg) >> MinAlignmentLog;
+    Header.AllocType = Type;
+    Header.RequestedSize = Size;
+    Header.Salt = static_cast<u16>(Prng.Next());
+    Chunk->storeHeader(&Header);
+    void *UserPtr = reinterpret_cast<void *>(ChunkBeg);
+    // TODO(kostyak): hooks sound like a terrible idea security wise but might
+    //                be needed for things to work properly?
+    // if (&__sanitizer_malloc_hook) __sanitizer_malloc_hook(UserPtr, Size);
+    return UserPtr;
+  }
+
+  // Deallocates a Chunk, which means adding it to the delayed free list (or
+  // Quarantine).
+  void deallocate(void *UserPtr, uptr DeleteSize, AllocType Type) {
+    if (UNLIKELY(!ThreadInited))
+      initThread();
+    // TODO(kostyak): see hook comment above
+    // if (&__sanitizer_free_hook) __sanitizer_free_hook(UserPtr);
+    if (!UserPtr)
+      return;
+    uptr ChunkBeg = reinterpret_cast<uptr>(UserPtr);
+    if (!IsAligned(ChunkBeg, MinAlignment)) {
+      dieWithMessage("ERROR: attempted to deallocate a chunk not properly "
+                     "aligned at address %p\n", UserPtr);
+    }
+    ScudoChunk *Chunk =
+        reinterpret_cast<ScudoChunk *>(ChunkBeg - ChunkHeaderSize);
+    UnpackedHeader OldHeader;
+    Chunk->loadHeader(&OldHeader);
+    if (OldHeader.State != ChunkAllocated) {
+      dieWithMessage("ERROR: invalid chunk state when deallocating address "
+                     "%p\n", Chunk);
+    }
+    UnpackedHeader NewHeader = OldHeader;
+    NewHeader.State = ChunkQuarantine;
+    Chunk->compareExchangeHeader(&NewHeader, &OldHeader);
+    if (DeallocationTypeMismatch) {
+      // The deallocation type has to match the allocation one.
+      if (NewHeader.AllocType != Type) {
+        // With the exception of memalign'd Chunks, that can still be free'd.
+        if (NewHeader.AllocType != FromMemalign || Type != FromMalloc) {
+          dieWithMessage("ERROR: allocation type mismatch on address %p\n",
+                         Chunk);
+        }
+      }
+    }
+    uptr Size = NewHeader.RequestedSize;
+    if (DeleteSizeMismatch) {
+      if (DeleteSize && DeleteSize != Size) {
+        dieWithMessage("ERROR: invalid sized delete on chunk at address %p\n",
+                       Chunk);
+      }
+    }
+    if (LIKELY(!ThreadTornDown)) {
+      AllocatorQuarantine.Put(&ThreadQuarantineCache,
+                              QuarantineCallback(&Cache), Chunk, Size);
+    } else {
+      SpinMutexLock l(&FallbackMutex);
+      AllocatorQuarantine.Put(&FallbackQuarantineCache,
+                              QuarantineCallback(&FallbackAllocatorCache),
+                              Chunk, Size);
+    }
+  }
+
+  // Returns the actual usable size of a chunk. Since this requires loading the
+  // header, we will return it in the second parameter, as it can be required
+  // by the caller to perform additional processing.
+  uptr getUsableSize(const void *Ptr, UnpackedHeader *Header) {
+    if (UNLIKELY(!ThreadInited))
+      initThread();
+    if (!Ptr)
+      return 0;
+    uptr ChunkBeg = reinterpret_cast<uptr>(Ptr);
+    ScudoChunk *Chunk =
+        reinterpret_cast<ScudoChunk *>(ChunkBeg - ChunkHeaderSize);
+    Chunk->loadHeader(Header);
+    // Getting the usable size of a chunk only makes sense if it's allocated.
+    if (Header->State != ChunkAllocated) {
+      dieWithMessage("ERROR: attempted to size a non-allocated chunk at "
+                     "address %p\n", Chunk);
+    }
+    uptr Size =
+        BackendAllocator.GetActuallyAllocatedSize(Chunk->AllocBeg(Header));
+    // UsableSize works as malloc_usable_size, which is also what (AFAIU)
+    // tcmalloc's MallocExtension::GetAllocatedSize aims at providing. This
+    // means we will return the size of the chunk from the user beginning to
+    // the end of the 'user' allocation, hence us subtracting the header size
+    // and the offset from the size.
+    if (Size == 0)
+      return Size;
+    return Size - ChunkHeaderSize - (Header->Offset << MinAlignmentLog);
+  }
+
+  // Helper function that doesn't care about the header.
+  uptr getUsableSize(const void *Ptr) {
+    UnpackedHeader Header;
+    return getUsableSize(Ptr, &Header);
+  }
+
+  // Reallocates a chunk. We can save on a new allocation if the new requested
+  // size still fits in the chunk.
+  void *reallocate(void *OldPtr, uptr NewSize) {
+    if (UNLIKELY(!ThreadInited))
+      initThread();
+    UnpackedHeader OldHeader;
+    uptr Size = getUsableSize(OldPtr, &OldHeader);
+    uptr ChunkBeg = reinterpret_cast<uptr>(OldPtr);
+    ScudoChunk *Chunk =
+        reinterpret_cast<ScudoChunk *>(ChunkBeg - ChunkHeaderSize);
+    if (OldHeader.AllocType != FromMalloc) {
+      dieWithMessage("ERROR: invalid chunk type when reallocating address %p\n",
+                     Chunk);
+    }
+    UnpackedHeader NewHeader = OldHeader;
+    // The new size still fits in the current chunk.
+    if (NewSize <= Size) {
+      NewHeader.RequestedSize = NewSize;
+      Chunk->compareExchangeHeader(&NewHeader, &OldHeader);
+      return OldPtr;
+    }
+    // Otherwise, we have to allocate a new chunk and copy the contents of the
+    // old one.
+    void *NewPtr = allocate(NewSize, MinAlignment, FromMalloc);
+    if (NewPtr) {
+      uptr OldSize = OldHeader.RequestedSize;
+      memcpy(NewPtr, OldPtr, Min(NewSize, OldSize));
+      NewHeader.State = ChunkQuarantine;
+      Chunk->compareExchangeHeader(&NewHeader, &OldHeader);
+      if (LIKELY(!ThreadTornDown)) {
+        AllocatorQuarantine.Put(&ThreadQuarantineCache,
+                                QuarantineCallback(&Cache), Chunk, OldSize);
+      } else {
+        SpinMutexLock l(&FallbackMutex);
+        AllocatorQuarantine.Put(&FallbackQuarantineCache,
+                                QuarantineCallback(&FallbackAllocatorCache),
+                                Chunk, OldSize);
+      }
+    }
+    return NewPtr;
+  }
+
+  void *calloc(uptr NMemB, uptr Size) {
+    if (UNLIKELY(!ThreadInited))
+      initThread();
+    uptr Total = NMemB * Size;
+    if (Size != 0 && Total / Size != NMemB) // Overflow check
+      return BackendAllocator.ReturnNullOrDie();
+    void *Ptr = allocate(Total, MinAlignment, FromMalloc);
+    // If ZeroContents, the content of the chunk has already been zero'd out.
+    if (!ZeroContents && Ptr && BackendAllocator.FromPrimary(Ptr))
+      memset(Ptr, 0, getUsableSize(Ptr));
+    return Ptr;
+  }
+
+  void drainQuarantine() {
+    AllocatorQuarantine.Drain(&ThreadQuarantineCache,
+                              QuarantineCallback(&Cache));
+  }
+};
+
+static Allocator Instance(LINKER_INITIALIZED);
+
+static ScudoAllocator &getAllocator() {
+  return Instance.BackendAllocator;
+}
+
+void initAllocator(const AllocatorOptions &Options) {
+  Instance.init(Options);
+}
+
+void drainQuarantine() {
+  Instance.drainQuarantine();
+}
+
+void *scudoMalloc(uptr Size, AllocType Type) {
+  return Instance.allocate(Size, Allocator::MinAlignment, Type);
+}
+
+void scudoFree(void *Ptr, AllocType Type) {
+  Instance.deallocate(Ptr, 0, Type);
+}
+
+void scudoSizedFree(void *Ptr, uptr Size, AllocType Type) {
+  Instance.deallocate(Ptr, Size, Type);
+}
+
+void *scudoRealloc(void *Ptr, uptr Size) {
+  if (!Ptr)
+    return Instance.allocate(Size, Allocator::MinAlignment, FromMalloc);
+  if (Size == 0) {
+    Instance.deallocate(Ptr, 0, FromMalloc);
+    return nullptr;
+  }
+  return Instance.reallocate(Ptr, Size);
+}
+
+void *scudoCalloc(uptr NMemB, uptr Size) {
+  return Instance.calloc(NMemB, Size);
+}
+
+void *scudoValloc(uptr Size) {
+  return Instance.allocate(Size, GetPageSizeCached(), FromMemalign);
+}
+
+void *scudoMemalign(uptr Alignment, uptr Size) {
+  return Instance.allocate(Size, Alignment, FromMemalign);
+}
+
+void *scudoPvalloc(uptr Size) {
+  uptr PageSize = GetPageSizeCached();
+  Size = RoundUpTo(Size, PageSize);
+  if (Size == 0) {
+    // pvalloc(0) should allocate one page.
+    Size = PageSize;
+  }
+  return Instance.allocate(Size, PageSize, FromMemalign);
+}
+
+int scudoPosixMemalign(void **MemPtr, uptr Alignment, uptr Size) {
+  *MemPtr = Instance.allocate(Size, Alignment, FromMemalign);
+  return 0;
+}
+
+void *scudoAlignedAlloc(uptr Alignment, uptr Size) {
+  // size must be a multiple of the alignment. To avoid a division, we first
+  // make sure that alignment is a power of 2.
+  CHECK(IsPowerOfTwo(Alignment));
+  CHECK_EQ((Size & (Alignment - 1)), 0);
+  return Instance.allocate(Size, Alignment, FromMalloc);
+}
+
+uptr scudoMallocUsableSize(void *Ptr) {
+  return Instance.getUsableSize(Ptr);
+}
+
+} // namespace __scudo
+
+using namespace __scudo;
+
+// MallocExtension helper functions
+
+uptr __sanitizer_get_current_allocated_bytes() {
+  uptr stats[AllocatorStatCount];
+  getAllocator().GetStats(stats);
+  return stats[AllocatorStatAllocated];
+}
+
+uptr __sanitizer_get_heap_size() {
+  uptr stats[AllocatorStatCount];
+  getAllocator().GetStats(stats);
+  return stats[AllocatorStatMapped];
+}
+
+uptr __sanitizer_get_free_bytes() {
+  return 1;
+}
+
+uptr __sanitizer_get_unmapped_bytes() {
+  return 1;
+}
+
+uptr __sanitizer_get_estimated_allocated_size(uptr size) {
+  return size;
+}
+
+int __sanitizer_get_ownership(const void *p) {
+  return Instance.getUsableSize(p) != 0;
+}
+
+uptr __sanitizer_get_allocated_size(const void *p) {
+  return Instance.getUsableSize(p);
+}
diff --git a/lib/scudo/scudo_allocator.h b/lib/scudo/scudo_allocator.h
new file mode 100644
index 0000000..7e9c788
--- /dev/null
+++ b/lib/scudo/scudo_allocator.h
@@ -0,0 +1,63 @@
+//===-- scudo_allocator.h ---------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// Header for scudo_allocator.cpp.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef SCUDO_ALLOCATOR_H_
+#define SCUDO_ALLOCATOR_H_
+
+#ifndef __x86_64__
+# error "The Scudo hardened allocator currently only supports x86_64."
+#endif
+
+#include "scudo_flags.h"
+
+#include "sanitizer_common/sanitizer_allocator.h"
+
+namespace __scudo {
+
+enum AllocType : u8 {
+  FromMalloc    = 0, // Memory block came from malloc, realloc, calloc, etc.
+  FromNew       = 1, // Memory block came from operator new.
+  FromNewArray  = 2, // Memory block came from operator new [].
+  FromMemalign  = 3, // Memory block came from memalign, posix_memalign, etc.
+};
+
+struct AllocatorOptions {
+  u32 QuarantineSizeMb;
+  u32 ThreadLocalQuarantineSizeKb;
+  bool MayReturnNull;
+  bool DeallocationTypeMismatch;
+  bool DeleteSizeMismatch;
+  bool ZeroContents;
+
+  void setFrom(const Flags *f, const CommonFlags *cf);
+  void copyTo(Flags *f, CommonFlags *cf) const;
+};
+
+void initAllocator(const AllocatorOptions &options);
+void drainQuarantine();
+
+void *scudoMalloc(uptr Size, AllocType Type);
+void scudoFree(void *Ptr, AllocType Type);
+void scudoSizedFree(void *Ptr, uptr Size, AllocType Type);
+void *scudoRealloc(void *Ptr, uptr Size);
+void *scudoCalloc(uptr NMemB, uptr Size);
+void *scudoMemalign(uptr Alignment, uptr Size);
+void *scudoValloc(uptr Size);
+void *scudoPvalloc(uptr Size);
+int scudoPosixMemalign(void **MemPtr, uptr Alignment, uptr Size);
+void *scudoAlignedAlloc(uptr Alignment, uptr Size);
+uptr scudoMallocUsableSize(void *Ptr);
+
+} // namespace __scudo
+
+#endif  // SCUDO_ALLOCATOR_H_
diff --git a/lib/scudo/scudo_flags.cpp b/lib/scudo/scudo_flags.cpp
new file mode 100644
index 0000000..430dcd2
--- /dev/null
+++ b/lib/scudo/scudo_flags.cpp
@@ -0,0 +1,81 @@
+//===-- scudo_flags.cpp -----------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// Hardened Allocator flag parsing logic.
+///
+//===----------------------------------------------------------------------===//
+
+#include "scudo_flags.h"
+#include "scudo_utils.h"
+
+#include "sanitizer_common/sanitizer_flags.h"
+#include "sanitizer_common/sanitizer_flag_parser.h"
+
+namespace __scudo {
+
+Flags scudo_flags_dont_use_directly;  // use via flags().
+
+void Flags::setDefaults() {
+#define SCUDO_FLAG(Type, Name, DefaultValue, Description) Name = DefaultValue;
+#include "scudo_flags.inc"
+#undef SCUDO_FLAG
+}
+
+static void RegisterScudoFlags(FlagParser *parser, Flags *f) {
+#define SCUDO_FLAG(Type, Name, DefaultValue, Description) \
+  RegisterFlag(parser, #Name, Description, &f->Name);
+#include "scudo_flags.inc"
+#undef SCUDO_FLAG
+}
+
+void initFlags() {
+  SetCommonFlagsDefaults();
+  {
+    CommonFlags cf;
+    cf.CopyFrom(*common_flags());
+    cf.exitcode = 1;
+    OverrideCommonFlags(cf);
+  }
+  Flags *f = getFlags();
+  f->setDefaults();
+
+  FlagParser scudo_parser;
+  RegisterScudoFlags(&scudo_parser, f);
+  RegisterCommonFlags(&scudo_parser);
+
+  scudo_parser.ParseString(GetEnv("SCUDO_OPTIONS"));
+
+  InitializeCommonFlags();
+
+  // Sanity checks and default settings for the Quarantine parameters.
+
+  if (f->QuarantineSizeMb < 0) {
+    const int DefaultQuarantineSizeMb = 64;
+    f->QuarantineSizeMb = DefaultQuarantineSizeMb;
+  }
+  // We enforce an upper limit for the quarantine size of 4Gb.
+  if (f->QuarantineSizeMb > (4 * 1024)) {
+    dieWithMessage("ERROR: the quarantine size is too large\n");
+  }
+  if (f->ThreadLocalQuarantineSizeKb < 0) {
+    const int DefaultThreadLocalQuarantineSizeKb = 1024;
+    f->ThreadLocalQuarantineSizeKb = DefaultThreadLocalQuarantineSizeKb;
+  }
+  // And an upper limit of 128Mb for the thread quarantine cache.
+  if (f->ThreadLocalQuarantineSizeKb > (128 * 1024)) {
+    dieWithMessage("ERROR: the per thread quarantine cache size is too "
+                   "large\n");
+  }
+}
+
+Flags *getFlags() {
+  return &scudo_flags_dont_use_directly;
+}
+
+}
diff --git a/lib/scudo/scudo_flags.h b/lib/scudo/scudo_flags.h
new file mode 100644
index 0000000..c16f635
--- /dev/null
+++ b/lib/scudo/scudo_flags.h
@@ -0,0 +1,33 @@
+//===-- scudo_flags.h -------------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// Header for scudo_flags.cpp.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef SCUDO_FLAGS_H_
+#define SCUDO_FLAGS_H_
+
+namespace __scudo {
+
+struct Flags {
+#define SCUDO_FLAG(Type, Name, DefaultValue, Description) Type Name;
+#include "scudo_flags.inc"
+#undef SCUDO_FLAG
+
+  void setDefaults();
+};
+
+Flags *getFlags();
+
+void initFlags();
+
+} // namespace __scudo
+
+#endif  // SCUDO_FLAGS_H_
diff --git a/lib/scudo/scudo_flags.inc b/lib/scudo/scudo_flags.inc
new file mode 100644
index 0000000..c7a2acf
--- /dev/null
+++ b/lib/scudo/scudo_flags.inc
@@ -0,0 +1,35 @@
+//===-- scudo_flags.inc -----------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// Hardened Allocator runtime flags.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef SCUDO_FLAG
+# error "Define SCUDO_FLAG prior to including this file!"
+#endif
+
+SCUDO_FLAG(int, QuarantineSizeMb, 64,
+           "Size (in Mb) of quarantine used to delay the actual deallocation "
+           "of chunks. Lower value may reduce memory usage but decrease the "
+           "effectiveness of the mitigation.")
+
+SCUDO_FLAG(int, ThreadLocalQuarantineSizeKb, 1024,
+          "Size (in Kb) of per-thread cache used to offload the global "
+          "quarantine. Lower value may reduce memory usage but might increase "
+          "the contention on the global quarantine.")
+
+SCUDO_FLAG(bool, DeallocationTypeMismatch, true,
+          "Report errors on malloc/delete, new/free, new/delete[], etc.")
+
+SCUDO_FLAG(bool, DeleteSizeMismatch, true,
+           "Report errors on mismatch between size of new and delete.")
+
+SCUDO_FLAG(bool, ZeroContents, false,
+          "Zero chunk contents on allocation and deallocation.")
diff --git a/lib/scudo/scudo_interceptors.cpp b/lib/scudo/scudo_interceptors.cpp
new file mode 100644
index 0000000..9204652
--- /dev/null
+++ b/lib/scudo/scudo_interceptors.cpp
@@ -0,0 +1,75 @@
+//===-- scudo_interceptors.cpp ----------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// Linux specific malloc interception functions.
+///
+//===----------------------------------------------------------------------===//
+
+#include "sanitizer_common/sanitizer_platform.h"
+#if SANITIZER_LINUX
+
+#include "scudo_allocator.h"
+
+#include "interception/interception.h"
+
+using namespace __scudo;
+
+INTERCEPTOR(void, free, void *ptr) {
+  scudoFree(ptr, FromMalloc);
+}
+
+INTERCEPTOR(void, cfree, void *ptr) {
+  scudoFree(ptr, FromMalloc);
+}
+
+INTERCEPTOR(void*, malloc, uptr size) {
+  return scudoMalloc(size, FromMalloc);
+}
+
+INTERCEPTOR(void*, realloc, void *ptr, uptr size) {
+  return scudoRealloc(ptr, size);
+}
+
+INTERCEPTOR(void*, calloc, uptr nmemb, uptr size) {
+  return scudoCalloc(nmemb, size);
+}
+
+INTERCEPTOR(void*, valloc, uptr size) {
+  return scudoValloc(size);
+}
+
+INTERCEPTOR(void*, memalign, uptr alignment, uptr size) {
+  return scudoMemalign(alignment, size);
+}
+
+INTERCEPTOR(void*, __libc_memalign, uptr alignment, uptr size) {
+  return scudoMemalign(alignment, size);
+}
+
+INTERCEPTOR(void*, pvalloc, uptr size) {
+  return scudoPvalloc(size);
+}
+
+INTERCEPTOR(void*, aligned_alloc, uptr alignment, uptr size) {
+  return scudoAlignedAlloc(alignment, size);
+}
+
+INTERCEPTOR(int, posix_memalign, void **memptr, uptr alignment, uptr size) {
+  return scudoPosixMemalign(memptr, alignment, size);
+}
+
+INTERCEPTOR(uptr, malloc_usable_size, void *ptr) {
+  return scudoMallocUsableSize(ptr);
+}
+
+INTERCEPTOR(int, mallopt, int cmd, int value) {
+  return -1;
+}
+
+#endif // SANITIZER_LINUX
diff --git a/lib/scudo/scudo_new_delete.cpp b/lib/scudo/scudo_new_delete.cpp
new file mode 100644
index 0000000..172f565
--- /dev/null
+++ b/lib/scudo/scudo_new_delete.cpp
@@ -0,0 +1,69 @@
+//===-- scudo_new_delete.cpp ------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// Interceptors for operators new and delete.
+///
+//===----------------------------------------------------------------------===//
+
+#include "scudo_allocator.h"
+
+#include "interception/interception.h"
+
+#include <cstddef>
+
+using namespace __scudo;
+
+#define CXX_OPERATOR_ATTRIBUTE INTERCEPTOR_ATTRIBUTE
+
+// Fake std::nothrow_t to avoid including <new>.
+namespace std {
+struct nothrow_t {};
+} // namespace std
+
+CXX_OPERATOR_ATTRIBUTE
+void *operator new(size_t size) {
+  return scudoMalloc(size, FromNew);
+}
+CXX_OPERATOR_ATTRIBUTE
+void *operator new[](size_t size) {
+  return scudoMalloc(size, FromNewArray);
+}
+CXX_OPERATOR_ATTRIBUTE
+void *operator new(size_t size, std::nothrow_t const&) {
+  return scudoMalloc(size, FromNew);
+}
+CXX_OPERATOR_ATTRIBUTE
+void *operator new[](size_t size, std::nothrow_t const&) {
+  return scudoMalloc(size, FromNewArray);
+}
+
+CXX_OPERATOR_ATTRIBUTE
+void operator delete(void *ptr) NOEXCEPT {
+  return scudoFree(ptr, FromNew);
+}
+CXX_OPERATOR_ATTRIBUTE
+void operator delete[](void *ptr) NOEXCEPT {
+  return scudoFree(ptr, FromNewArray);
+}
+CXX_OPERATOR_ATTRIBUTE
+void operator delete(void *ptr, std::nothrow_t const&) NOEXCEPT {
+  return scudoFree(ptr, FromNew);
+}
+CXX_OPERATOR_ATTRIBUTE
+void operator delete[](void *ptr, std::nothrow_t const&) NOEXCEPT {
+  return scudoFree(ptr, FromNewArray);
+}
+CXX_OPERATOR_ATTRIBUTE
+void operator delete(void *ptr, size_t size) NOEXCEPT {
+  scudoSizedFree(ptr, size, FromNew);
+}
+CXX_OPERATOR_ATTRIBUTE
+void operator delete[](void *ptr, size_t size) NOEXCEPT {
+  scudoSizedFree(ptr, size, FromNewArray);
+}
diff --git a/lib/scudo/scudo_termination.cpp b/lib/scudo/scudo_termination.cpp
new file mode 100644
index 0000000..32421d3
--- /dev/null
+++ b/lib/scudo/scudo_termination.cpp
@@ -0,0 +1,41 @@
+//===-- scudo_termination.cpp -----------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// This file contains bare-bones termination functions to replace the
+/// __sanitizer ones, in order to avoid any potential abuse of the callbacks
+/// functionality.
+///
+//===----------------------------------------------------------------------===//
+
+#include "sanitizer_common/sanitizer_common.h"
+
+namespace __sanitizer {
+
+bool AddDieCallback(DieCallbackType callback) { return true; }
+
+bool RemoveDieCallback(DieCallbackType callback) { return true; }
+
+void SetUserDieCallback(DieCallbackType callback) {}
+
+void NORETURN Die() {
+  if (common_flags()->abort_on_error)
+    Abort();
+  internal__exit(common_flags()->exitcode);
+}
+
+void SetCheckFailedCallback(CheckFailedCallbackType callback) {}
+
+void NORETURN CheckFailed(const char *file, int line, const char *cond,
+                          u64 v1, u64 v2) {
+  Report("Sanitizer CHECK failed: %s:%d %s (%lld, %lld)\n", file, line, cond,
+                                                            v1, v2);
+  Die();
+}
+
+} // namespace __sanitizer
diff --git a/lib/scudo/scudo_utils.cpp b/lib/scudo/scudo_utils.cpp
new file mode 100644
index 0000000..6b96e84
--- /dev/null
+++ b/lib/scudo/scudo_utils.cpp
@@ -0,0 +1,133 @@
+//===-- scudo_utils.cpp -----------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// Platform specific utility functions.
+///
+//===----------------------------------------------------------------------===//
+
+#include "scudo_utils.h"
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdarg.h>
+#include <unistd.h>
+
+#include <cstring>
+
+// TODO(kostyak): remove __sanitizer *Printf uses in favor for our own less
+//                complicated string formatting code. The following is a
+//                temporary workaround to be able to use __sanitizer::VSNPrintf.
+namespace __sanitizer {
+
+extern int VSNPrintf(char *buff, int buff_length, const char *format,
+                     va_list args);
+
+} // namespace __sanitizer
+
+namespace __scudo {
+
+FORMAT(1, 2)
+void dieWithMessage(const char *Format, ...) {
+  // Our messages are tiny, 128 characters is more than enough.
+  char Message[128];
+  va_list Args;
+  va_start(Args, Format);
+  __sanitizer::VSNPrintf(Message, sizeof(Message), Format, Args);
+  va_end(Args);
+  RawWrite(Message);
+  Die();
+}
+
+typedef struct {
+  u32 Eax;
+  u32 Ebx;
+  u32 Ecx;
+  u32 Edx;
+} CPUIDInfo;
+
+static void getCPUID(CPUIDInfo *info, u32 leaf, u32 subleaf)
+{
+  asm volatile("cpuid"
+      : "=a" (info->Eax), "=b" (info->Ebx), "=c" (info->Ecx), "=d" (info->Edx)
+      : "a" (leaf), "c" (subleaf)
+  );
+}
+
+// Returns true if the CPU is a "GenuineIntel" or "AuthenticAMD"
+static bool isSupportedCPU()
+{
+  CPUIDInfo Info;
+
+  getCPUID(&Info, 0, 0);
+  if (memcmp(reinterpret_cast<char *>(&Info.Ebx), "Genu", 4) == 0 &&
+      memcmp(reinterpret_cast<char *>(&Info.Edx), "ineI", 4) == 0 &&
+      memcmp(reinterpret_cast<char *>(&Info.Ecx), "ntel", 4) == 0) {
+      return true;
+  }
+  if (memcmp(reinterpret_cast<char *>(&Info.Ebx), "Auth", 4) == 0 &&
+      memcmp(reinterpret_cast<char *>(&Info.Edx), "enti", 4) == 0 &&
+      memcmp(reinterpret_cast<char *>(&Info.Ecx), "cAMD", 4) == 0) {
+      return true;
+  }
+  return false;
+}
+
+bool testCPUFeature(CPUFeature feature)
+{
+  static bool InfoInitialized = false;
+  static CPUIDInfo CPUInfo = {};
+
+  if (InfoInitialized == false) {
+    if (isSupportedCPU() == true)
+      getCPUID(&CPUInfo, 1, 0);
+    else
+      UNIMPLEMENTED();
+    InfoInitialized = true;
+  }
+  switch (feature) {
+    case SSE4_2:
+      return ((CPUInfo.Ecx >> 20) & 0x1) != 0;
+    default:
+      break;
+  }
+  return false;
+}
+
+// readRetry will attempt to read Count bytes from the Fd specified, and if
+// interrupted will retry to read additional bytes to reach Count.
+static ssize_t readRetry(int Fd, u8 *Buffer, size_t Count) {
+  ssize_t AmountRead = 0;
+  while (static_cast<size_t>(AmountRead) < Count) {
+    ssize_t Result = read(Fd, Buffer + AmountRead, Count - AmountRead);
+    if (Result > 0)
+      AmountRead += Result;
+    else if (!Result)
+      break;
+    else if (errno != EINTR) {
+      AmountRead = -1;
+      break;
+    }
+  }
+  return AmountRead;
+}
+
+// Default constructor for Xorshift128Plus seeds the state with /dev/urandom
+Xorshift128Plus::Xorshift128Plus() {
+  int Fd = open("/dev/urandom", O_RDONLY);
+  bool Success = readRetry(Fd, reinterpret_cast<u8 *>(&State_0_),
+                           sizeof(State_0_)) == sizeof(State_0_);
+  Success &= readRetry(Fd, reinterpret_cast<u8 *>(&State_1_),
+                           sizeof(State_1_)) == sizeof(State_1_);
+  close(Fd);
+  if (!Success) {
+    dieWithMessage("ERROR: failed to read enough data from /dev/urandom.\n");
+  }
+}
+
+} // namespace __scudo
diff --git a/lib/scudo/scudo_utils.h b/lib/scudo/scudo_utils.h
new file mode 100644
index 0000000..07394ff
--- /dev/null
+++ b/lib/scudo/scudo_utils.h
@@ -0,0 +1,59 @@
+//===-- scudo_utils.h -------------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// Header for scudo_utils.cpp.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef SCUDO_UTILS_H_
+#define SCUDO_UTILS_H_
+
+#include <string.h>
+
+#include "sanitizer_common/sanitizer_common.h"
+
+namespace __scudo {
+
+template <class Dest, class Source>
+inline Dest bit_cast(const Source& source) {
+  static_assert(sizeof(Dest) == sizeof(Source), "Sizes are not equal!");
+  Dest dest;
+  memcpy(&dest, &source, sizeof(dest));
+  return dest;
+}
+
+void dieWithMessage(const char *Format, ...);
+
+enum  CPUFeature {
+  SSE4_2 = 0,
+  ENUM_CPUFEATURE_MAX
+};
+bool testCPUFeature(CPUFeature feature);
+
+// Tiny PRNG based on https://en.wikipedia.org/wiki/Xorshift#xorshift.2B
+// The state (128 bits) will be stored in thread local storage.
+struct Xorshift128Plus {
+ public:
+  Xorshift128Plus();
+  u64 Next() {
+    u64 x = State_0_;
+    const u64 y = State_1_;
+    State_0_ = y;
+    x ^= x << 23;
+    State_1_ = x ^ y ^ (x >> 17) ^ (y >> 26);
+    return State_1_ + y;
+  }
+ private:
+  u64 State_0_;
+  u64 State_1_;
+};
+
+} // namespace __scudo
+
+#endif  // SCUDO_UTILS_H_
diff --git a/lib/stats/CMakeLists.txt b/lib/stats/CMakeLists.txt
new file mode 100644
index 0000000..33ab1ae
--- /dev/null
+++ b/lib/stats/CMakeLists.txt
@@ -0,0 +1,28 @@
+include_directories(..)
+
+add_custom_target(stats)
+set_target_properties(stats PROPERTIES FOLDER "Compiler-RT Misc")
+
+if(APPLE)
+  set(STATS_LIB_FLAVOR SHARED)
+else()
+  set(STATS_LIB_FLAVOR STATIC)
+endif()
+
+add_compiler_rt_runtime(clang_rt.stats
+  ${STATS_LIB_FLAVOR}
+  ARCHS ${SANITIZER_COMMON_SUPPORTED_ARCH}
+  OS ${SANITIZER_COMMON_SUPPORTED_OS}
+  SOURCES stats.cc
+  OBJECT_LIBS RTSanitizerCommon
+              RTSanitizerCommonLibc
+  CFLAGS ${SANITIZER_COMMON_CFLAGS}
+  PARENT_TARGET stats)
+
+add_compiler_rt_runtime(clang_rt.stats_client
+  STATIC
+  ARCHS ${SANITIZER_COMMON_SUPPORTED_ARCH}
+  OS ${SANITIZER_COMMON_SUPPORTED_OS}
+  SOURCES stats_client.cc
+  CFLAGS ${SANITIZER_COMMON_CFLAGS}
+  PARENT_TARGET stats)
diff --git a/lib/stats/stats.cc b/lib/stats/stats.cc
new file mode 100644
index 0000000..df9845a
--- /dev/null
+++ b/lib/stats/stats.cc
@@ -0,0 +1,136 @@
+//===-- stats.cc ----------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Sanitizer statistics gathering. Manages statistics for a process and is
+// responsible for writing the report file.
+//
+//===----------------------------------------------------------------------===//
+
+#include "sanitizer_common/sanitizer_common.h"
+#include "sanitizer_common/sanitizer_internal_defs.h"
+#if SANITIZER_POSIX
+#include "sanitizer_common/sanitizer_posix.h"
+#endif
+#include "sanitizer_common/sanitizer_symbolizer.h"
+#include "stats/stats.h"
+#if SANITIZER_POSIX
+#include <signal.h>
+#endif
+
+using namespace __sanitizer;
+
+namespace {
+
+InternalMmapVectorNoCtor<StatModule **> modules;
+StaticSpinMutex modules_mutex;
+
+fd_t stats_fd;
+
+void WriteLE(fd_t fd, uptr val) {
+  char chars[sizeof(uptr)];
+  for (unsigned i = 0; i != sizeof(uptr); ++i) {
+    chars[i] = val >> (i * 8);
+  }
+  WriteToFile(fd, chars, sizeof(uptr));
+}
+
+void OpenStatsFile(const char *path_env) {
+  InternalScopedBuffer<char> path(kMaxPathLength);
+  SubstituteForFlagValue(path_env, path.data(), kMaxPathLength);
+
+  error_t err;
+  stats_fd = OpenFile(path.data(), WrOnly, &err);
+  if (stats_fd == kInvalidFd) {
+    Report("stats: failed to open %s for writing (reason: %d)\n", path.data(),
+           err);
+    return;
+  }
+  char sizeof_uptr = sizeof(uptr);
+  WriteToFile(stats_fd, &sizeof_uptr, 1);
+}
+
+void WriteModuleReport(StatModule **smodp) {
+  CHECK(smodp);
+  const char *path_env = GetEnv("SANITIZER_STATS_PATH");
+  if (!path_env || stats_fd == kInvalidFd)
+    return;
+  if (!stats_fd)
+    OpenStatsFile(path_env);
+  const LoadedModule *mod = Symbolizer::GetOrInit()->FindModuleForAddress(
+      reinterpret_cast<uptr>(smodp));
+  WriteToFile(stats_fd, mod->full_name(),
+              internal_strlen(mod->full_name()) + 1);
+  for (StatModule *smod = *smodp; smod; smod = smod->next) {
+    for (u32 i = 0; i != smod->size; ++i) {
+      StatInfo *s = &smod->infos[i];
+      if (!s->addr)
+        continue;
+      WriteLE(stats_fd, s->addr - mod->base_address());
+      WriteLE(stats_fd, s->data);
+    }
+  }
+  WriteLE(stats_fd, 0);
+  WriteLE(stats_fd, 0);
+}
+
+} // namespace
+
+extern "C"
+SANITIZER_INTERFACE_ATTRIBUTE
+unsigned __sanitizer_stats_register(StatModule **mod) {
+  SpinMutexLock l(&modules_mutex);
+  modules.push_back(mod);
+  return modules.size() - 1;
+}
+
+extern "C"
+SANITIZER_INTERFACE_ATTRIBUTE
+void __sanitizer_stats_unregister(unsigned index) {
+  SpinMutexLock l(&modules_mutex);
+  WriteModuleReport(modules[index]);
+  modules[index] = 0;
+}
+
+namespace {
+
+void WriteFullReport() {
+  SpinMutexLock l(&modules_mutex);
+  for (StatModule **mod : modules) {
+    if (!mod)
+      continue;
+    WriteModuleReport(mod);
+  }
+  if (stats_fd != 0 && stats_fd != kInvalidFd) {
+    CloseFile(stats_fd);
+    stats_fd = kInvalidFd;
+  }
+}
+
+#if SANITIZER_POSIX
+void USR2Handler(int sig) {
+  WriteFullReport();
+}
+#endif
+
+struct WriteReportOnExitOrSignal {
+  WriteReportOnExitOrSignal() {
+#if SANITIZER_POSIX
+    struct sigaction sigact;
+    internal_memset(&sigact, 0, sizeof(sigact));
+    sigact.sa_handler = USR2Handler;
+    internal_sigaction(SIGUSR2, &sigact, nullptr);
+#endif
+  }
+
+  ~WriteReportOnExitOrSignal() {
+    WriteFullReport();
+  }
+} wr;
+
+} // namespace
diff --git a/lib/stats/stats.h b/lib/stats/stats.h
new file mode 100644
index 0000000..6194706
--- /dev/null
+++ b/lib/stats/stats.h
@@ -0,0 +1,43 @@
+//===-- stats.h -------------------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Data definitions for sanitizer statistics gathering.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SANITIZER_STATS_STATS_H
+#define SANITIZER_STATS_STATS_H
+
+#include "sanitizer_common/sanitizer_internal_defs.h"
+
+namespace __sanitizer {
+
+// Number of bits in data that are used for the sanitizer kind. Needs to match
+// llvm::kSanitizerStatKindBits in
+// llvm/include/llvm/Transforms/Utils/SanitizerStats.h
+enum { kKindBits = 3 };
+
+struct StatInfo {
+  uptr addr;
+  uptr data;
+};
+
+struct StatModule {
+  StatModule *next;
+  u32 size;
+  StatInfo infos[1];
+};
+
+inline uptr CountFromData(uptr data) {
+  return data & ((1ull << (sizeof(uptr) * 8 - kKindBits)) - 1);
+}
+
+}
+
+#endif
diff --git a/lib/stats/stats_client.cc b/lib/stats/stats_client.cc
new file mode 100644
index 0000000..fa4b2d9
--- /dev/null
+++ b/lib/stats/stats_client.cc
@@ -0,0 +1,83 @@
+//===-- stats_client.cc ---------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Sanitizer statistics gathering. Manages statistics for a module (executable
+// or DSO) and registers statistics with the process.
+//
+// This is linked into each individual module and cannot directly use functions
+// declared in sanitizer_common.
+//
+//===----------------------------------------------------------------------===//
+
+#ifdef _WIN32
+#include <windows.h>
+#else
+#include <dlfcn.h>
+#endif
+#include <stdint.h>
+#include <stdio.h>
+
+#include "sanitizer_common/sanitizer_internal_defs.h"
+#include "stats/stats.h"
+
+using namespace __sanitizer;
+
+namespace {
+
+void *LookupSymbolFromMain(const char *name) {
+#ifdef _WIN32
+  return reinterpret_cast<void *>(GetProcAddress(GetModuleHandle(0), name));
+#else
+  return dlsym(RTLD_DEFAULT, name);
+#endif
+}
+
+StatModule *list;
+
+struct RegisterSanStats {
+  unsigned module_id;
+
+  RegisterSanStats() {
+    typedef unsigned (*reg_func_t)(StatModule **);
+    reg_func_t reg_func = reinterpret_cast<reg_func_t>(
+        LookupSymbolFromMain("__sanitizer_stats_register"));
+    if (reg_func)
+      module_id = reg_func(&list);
+  }
+
+  ~RegisterSanStats() {
+    typedef void (*unreg_func_t)(unsigned);
+    unreg_func_t unreg_func = reinterpret_cast<unreg_func_t>(
+        LookupSymbolFromMain("__sanitizer_stats_unregister"));
+    if (unreg_func)
+      unreg_func(module_id);
+  }
+} reg;
+
+}
+
+extern "C" void __sanitizer_stat_init(StatModule *mod) {
+  mod->next = list;
+  list = mod;
+}
+
+extern "C" void __sanitizer_stat_report(StatInfo *s) {
+  s->addr = GET_CALLER_PC();
+#if defined(_WIN64) && !defined(__clang__)
+  uptr old_data = InterlockedIncrement64(reinterpret_cast<LONG64 *>(&s->data));
+#elif defined(_WIN32) && !defined(__clang__)
+  uptr old_data = InterlockedIncrement(&s->data);
+#else
+  uptr old_data = __sync_fetch_and_add(&s->data, 1);
+#endif
+
+  // Overflow check.
+  if (CountFromData(old_data + 1) == 0)
+    Trap();
+}
diff --git a/lib/tsan/Android.bp b/lib/tsan/Android.bp
index 2d93818..23247d6 100644
--- a/lib/tsan/Android.bp
+++ b/lib/tsan/Android.bp
@@ -31,31 +31,7 @@
     include_dirs: ["external/compiler-rt/lib"],
     cppflags: tsan_rtl_cppflags,
     srcs: [
-        "rtl/tsan_clock.cc",
-        "rtl/tsan_flags.cc",
-        "rtl/tsan_fd.cc",
-        "rtl/tsan_ignoreset.cc",
-        "rtl/tsan_interceptors.cc",
-        "rtl/tsan_interface_ann.cc",
-        "rtl/tsan_interface_atomic.cc",
-        "rtl/tsan_interface.cc",
-        "rtl/tsan_interface_java.cc",
-        "rtl/tsan_md5.cc",
-        "rtl/tsan_mman.cc",
-        "rtl/tsan_mutex.cc",
-        "rtl/tsan_mutexset.cc",
-        "rtl/tsan_report.cc",
-        "rtl/tsan_rtl.cc",
-        "rtl/tsan_rtl_mutex.cc",
-        "rtl/tsan_rtl_report.cc",
-        "rtl/tsan_rtl_thread.cc",
-        "rtl/tsan_stack_trace.cc",
-        "rtl/tsan_stat.cc",
-        "rtl/tsan_suppressions.cc",
-        "rtl/tsan_symbolize.cc",
-        "rtl/tsan_sync.cc",
-        "rtl/tsan_platform_linux.cc",
-        "rtl/tsan_platform_posix.cc",
+        "rtl/*.cc",
         "rtl/tsan_rtl_amd64.S",
     ],
     stl: "none",
diff --git a/lib/tsan/CMakeLists.txt b/lib/tsan/CMakeLists.txt
index 0e60cd3..7c84e0a 100644
--- a/lib/tsan/CMakeLists.txt
+++ b/lib/tsan/CMakeLists.txt
@@ -6,7 +6,7 @@
 # SANITIZER_COMMON_CFLAGS contains -fPIC, but it's performance-critical for
 # TSan runtime to be built with -fPIE to reduce the number of register spills.
 append_list_if(COMPILER_RT_HAS_FPIE_FLAG -fPIE TSAN_CFLAGS)
-append_no_rtti_flag(TSAN_CFLAGS)
+append_rtti_flag(OFF TSAN_CFLAGS)
 
 if(COMPILER_RT_TSAN_DEBUG_OUTPUT)
   # Add extra debug information to TSan runtime. This configuration is rarely
@@ -24,22 +24,25 @@
 
 set(TSAN_SOURCES
   rtl/tsan_clock.cc
-  rtl/tsan_flags.cc
+  rtl/tsan_debugging.cc
   rtl/tsan_fd.cc
+  rtl/tsan_flags.cc
   rtl/tsan_ignoreset.cc
   rtl/tsan_interceptors.cc
+  rtl/tsan_interface.cc
   rtl/tsan_interface_ann.cc
   rtl/tsan_interface_atomic.cc
-  rtl/tsan_interface.cc
   rtl/tsan_interface_java.cc
   rtl/tsan_malloc_mac.cc
   rtl/tsan_md5.cc
   rtl/tsan_mman.cc
   rtl/tsan_mutex.cc
   rtl/tsan_mutexset.cc
+  rtl/tsan_preinit.cc
   rtl/tsan_report.cc
   rtl/tsan_rtl.cc
   rtl/tsan_rtl_mutex.cc
+  rtl/tsan_rtl_proc.cc
   rtl/tsan_rtl_report.cc
   rtl/tsan_rtl_thread.cc
   rtl/tsan_stack_trace.cc
@@ -94,6 +97,7 @@
 
 set(TSAN_RUNTIME_LIBRARIES)
 add_custom_target(tsan)
+set_target_properties(tsan PROPERTIES FOLDER "Compiler-RT Misc")
 
 if(APPLE)
   set(TSAN_ASM_SOURCES rtl/tsan_rtl_amd64.S)
@@ -115,7 +119,7 @@
                 RTUbsan
     CFLAGS ${TSAN_RTL_CFLAGS}
     PARENT_TARGET tsan)
-  add_compiler_rt_object_libraries(RTTsan_dynamic 
+  add_compiler_rt_object_libraries(RTTsan_dynamic
     OS ${TSAN_SUPPORTED_OS}
     ARCHS ${TSAN_SUPPORTED_ARCH}
     SOURCES ${TSAN_SOURCES} ${TSAN_CXX_SOURCES} ${TSAN_ASM_SOURCES}
@@ -192,7 +196,11 @@
 add_dependencies(compiler-rt tsan)
 
 # Make sure that non-platform-specific files don't include any system headers.
-if(COMPILER_RT_HAS_SYSROOT_FLAG)
+# FreeBSD does not install a number of Clang-provided headers for the compiler
+# in the base system due to incompatibilities between FreeBSD's and Clang's
+# versions. As a workaround do not use --sysroot=. on FreeBSD until this is
+# addressed.
+if(COMPILER_RT_HAS_SYSROOT_FLAG AND NOT CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
   file(GLOB _tsan_generic_sources rtl/tsan*)
   file(GLOB _tsan_platform_sources rtl/tsan*posix* rtl/tsan*mac*
                                    rtl/tsan*linux*)
@@ -204,10 +212,17 @@
 # Build libcxx instrumented with TSan.
 if(COMPILER_RT_HAS_LIBCXX_SOURCES AND
    COMPILER_RT_TEST_COMPILER_ID STREQUAL "Clang")
-  set(LIBCXX_PREFIX ${CMAKE_CURRENT_BINARY_DIR}/libcxx_tsan)
-  add_custom_libcxx(libcxx_tsan ${LIBCXX_PREFIX}
-    DEPS ${TSAN_RUNTIME_LIBRARIES}
-    CFLAGS -fsanitize=thread)
+  set(libcxx_tsan_deps)
+  foreach(arch ${TSAN_SUPPORTED_ARCH})
+    get_target_flags_for_arch(${arch} TARGET_CFLAGS)
+    set(LIBCXX_PREFIX ${CMAKE_CURRENT_BINARY_DIR}/libcxx_tsan_${arch})
+    add_custom_libcxx(libcxx_tsan_${arch} ${LIBCXX_PREFIX}
+      DEPS ${TSAN_RUNTIME_LIBRARIES}
+      CFLAGS ${TARGET_CFLAGS} -fsanitize=thread)
+    list(APPEND libcxx_tsan_deps libcxx_tsan_${arch})
+  endforeach()
+
+  add_custom_target(libcxx_tsan DEPENDS ${libcxx_tsan_deps})
 endif()
 
 if(COMPILER_RT_INCLUDE_TESTS)
diff --git a/lib/tsan/check_analyze.sh b/lib/tsan/check_analyze.sh
index 0f6cc06..a5d3632 100755
--- a/lib/tsan/check_analyze.sh
+++ b/lib/tsan/check_analyze.sh
@@ -32,13 +32,7 @@
   check $f pop 2
 done
 
-for f in write2 write4; do
-  check $f rsp 1
-  check $f push 4
-  check $f pop 4
-done
-
-for f in write8; do
+for f in write2 write4 write8; do
   check $f rsp 1
   check $f push 3
   check $f pop 3
diff --git a/lib/tsan/dd/CMakeLists.txt b/lib/tsan/dd/CMakeLists.txt
index 6330bd9..bcff35f 100644
--- a/lib/tsan/dd/CMakeLists.txt
+++ b/lib/tsan/dd/CMakeLists.txt
@@ -3,7 +3,7 @@
 include_directories(../..)
 
 set(DD_CFLAGS ${SANITIZER_COMMON_CFLAGS})
-append_no_rtti_flag(DD_CFLAGS)
+append_rtti_flag(OFF DD_CFLAGS)
 
 set(DD_SOURCES
   dd_rtl.cc
diff --git a/lib/tsan/dd/dd_interceptors.cc b/lib/tsan/dd/dd_interceptors.cc
index 8151f7f..97c72dd 100644
--- a/lib/tsan/dd/dd_interceptors.cc
+++ b/lib/tsan/dd/dd_interceptors.cc
@@ -15,9 +15,6 @@
 
 using namespace __dsan;
 
-extern "C" void *__libc_malloc(uptr size);
-extern "C" void __libc_free(void *ptr);
-
 __attribute__((tls_model("initial-exec")))
 static __thread Thread *thr;
 __attribute__((tls_model("initial-exec")))
diff --git a/lib/tsan/go/build.bat b/lib/tsan/go/build.bat
index 7d393dc..3ada9ab 100644
--- a/lib/tsan/go/build.bat
+++ b/lib/tsan/go/build.bat
@@ -1,4 +1,4 @@
-type tsan_go.cc ..\rtl\tsan_interface_atomic.cc ..\rtl\tsan_clock.cc ..\rtl\tsan_flags.cc ..\rtl\tsan_md5.cc ..\rtl\tsan_mutex.cc ..\rtl\tsan_report.cc ..\rtl\tsan_rtl.cc ..\rtl\tsan_rtl_mutex.cc ..\rtl\tsan_rtl_report.cc ..\rtl\tsan_rtl_thread.cc ..\rtl\tsan_stat.cc ..\rtl\tsan_suppressions.cc ..\rtl\tsan_sync.cc ..\rtl\tsan_stack_trace.cc ..\..\sanitizer_common\sanitizer_allocator.cc ..\..\sanitizer_common\sanitizer_common.cc ..\..\sanitizer_common\sanitizer_flags.cc ..\..\sanitizer_common\sanitizer_stacktrace.cc ..\..\sanitizer_common\sanitizer_libc.cc ..\..\sanitizer_common\sanitizer_printf.cc ..\..\sanitizer_common\sanitizer_suppressions.cc ..\..\sanitizer_common\sanitizer_thread_registry.cc ..\rtl\tsan_platform_windows.cc ..\..\sanitizer_common\sanitizer_win.cc ..\..\sanitizer_common\sanitizer_deadlock_detector1.cc ..\..\sanitizer_common\sanitizer_stackdepot.cc ..\..\sanitizer_common\sanitizer_persistent_allocator.cc ..\..\sanitizer_common\sanitizer_flag_parser.cc ..\..\sanitizer_common\sanitizer_symbolizer.cc > gotsan.cc
+type tsan_go.cc ..\rtl\tsan_interface_atomic.cc ..\rtl\tsan_clock.cc ..\rtl\tsan_flags.cc ..\rtl\tsan_md5.cc ..\rtl\tsan_mutex.cc ..\rtl\tsan_report.cc ..\rtl\tsan_rtl.cc ..\rtl\tsan_rtl_mutex.cc ..\rtl\tsan_rtl_report.cc ..\rtl\tsan_rtl_thread.cc ..\rtl\tsan_rtl_proc.cc ..\rtl\tsan_stat.cc ..\rtl\tsan_suppressions.cc ..\rtl\tsan_sync.cc ..\rtl\tsan_stack_trace.cc ..\..\sanitizer_common\sanitizer_allocator.cc ..\..\sanitizer_common\sanitizer_common.cc ..\..\sanitizer_common\sanitizer_flags.cc ..\..\sanitizer_common\sanitizer_stacktrace.cc ..\..\sanitizer_common\sanitizer_libc.cc ..\..\sanitizer_common\sanitizer_printf.cc ..\..\sanitizer_common\sanitizer_suppressions.cc ..\..\sanitizer_common\sanitizer_thread_registry.cc ..\rtl\tsan_platform_windows.cc ..\..\sanitizer_common\sanitizer_win.cc ..\..\sanitizer_common\sanitizer_deadlock_detector1.cc ..\..\sanitizer_common\sanitizer_stackdepot.cc ..\..\sanitizer_common\sanitizer_persistent_allocator.cc ..\..\sanitizer_common\sanitizer_flag_parser.cc ..\..\sanitizer_common\sanitizer_symbolizer.cc ..\..\sanitizer_common\sanitizer_termination.cc > gotsan.cc
 
 gcc -c -o race_windows_amd64.syso gotsan.cc -I..\rtl -I..\.. -I..\..\sanitizer_common -I..\..\..\include -m64 -Wall -fno-exceptions -fno-rtti -DSANITIZER_GO -Wno-error=attributes -Wno-attributes -Wno-format -Wno-maybe-uninitialized -DSANITIZER_DEBUG=0 -O3 -fomit-frame-pointer -std=c++11
 
diff --git a/lib/tsan/go/buildgo.sh b/lib/tsan/go/buildgo.sh
index fdbd405..834e325 100755
--- a/lib/tsan/go/buildgo.sh
+++ b/lib/tsan/go/buildgo.sh
@@ -14,6 +14,7 @@
 	../rtl/tsan_rtl_mutex.cc
 	../rtl/tsan_rtl_report.cc
 	../rtl/tsan_rtl_thread.cc
+	../rtl/tsan_rtl_proc.cc
 	../rtl/tsan_stack_trace.cc
 	../rtl/tsan_stat.cc
 	../rtl/tsan_suppressions.cc
@@ -32,6 +33,7 @@
 	../../sanitizer_common/sanitizer_stackdepot.cc
 	../../sanitizer_common/sanitizer_stacktrace.cc
 	../../sanitizer_common/sanitizer_symbolizer.cc
+	../../sanitizer_common/sanitizer_termination.cc
 "
 
 if [ "`uname -a | grep Linux`" != "" ]; then
@@ -50,23 +52,24 @@
 		../../sanitizer_common/sanitizer_stoptheworld_linux_libcdep.cc
 	"
 elif [ "`uname -a | grep FreeBSD`" != "" ]; then
-        SUFFIX="freebsd_amd64"
-        OSCFLAGS="-fno-strict-aliasing -fPIC -Werror"
-        OSLDFLAGS="-lpthread -fPIC -fpie"
-        SRCS="
-                $SRCS
-                ../rtl/tsan_platform_linux.cc
-                ../../sanitizer_common/sanitizer_posix.cc
-                ../../sanitizer_common/sanitizer_posix_libcdep.cc
-                ../../sanitizer_common/sanitizer_procmaps_common.cc
-                ../../sanitizer_common/sanitizer_procmaps_freebsd.cc
-                ../../sanitizer_common/sanitizer_linux.cc
-                ../../sanitizer_common/sanitizer_stoptheworld_linux_libcdep.cc
-        "
+	SUFFIX="freebsd_amd64"
+	OSCFLAGS="-fno-strict-aliasing -fPIC -Werror"
+	OSLDFLAGS="-lpthread -fPIC -fpie"
+	SRCS="
+		$SRCS
+		../rtl/tsan_platform_linux.cc
+		../../sanitizer_common/sanitizer_posix.cc
+		../../sanitizer_common/sanitizer_posix_libcdep.cc
+		../../sanitizer_common/sanitizer_procmaps_common.cc
+		../../sanitizer_common/sanitizer_procmaps_freebsd.cc
+		../../sanitizer_common/sanitizer_linux.cc
+		../../sanitizer_common/sanitizer_linux_libcdep.cc
+		../../sanitizer_common/sanitizer_stoptheworld_linux_libcdep.cc
+	"
 elif [ "`uname -a | grep Darwin`" != "" ]; then
 	SUFFIX="darwin_amd64"
-	OSCFLAGS="-fPIC -Wno-unused-const-variable -Wno-unknown-warning-option"
-	OSLDFLAGS="-lpthread -fPIC -fpie"
+	OSCFLAGS="-fPIC -Wno-unused-const-variable -Wno-unknown-warning-option -mmacosx-version-min=10.7"
+	OSLDFLAGS="-lpthread -fPIC -fpie -mmacosx-version-min=10.7"
 	SRCS="
 		$SRCS
 		../rtl/tsan_platform_mac.cc
@@ -122,7 +125,7 @@
 fi
 $CC $DIR/gotsan.cc -c -o $DIR/race_$SUFFIX.syso $FLAGS $CFLAGS
 
-$CC test.c $DIR/race_$SUFFIX.syso -m64 -o $DIR/test $OSLDFLAGS
+$CC $OSCFLAGS test.c $DIR/race_$SUFFIX.syso -m64 -g -o $DIR/test $OSLDFLAGS
 
 export GORACE="exitcode=0 atexit_sleep_ms=0"
 if [ "$SILENT" != "1" ]; then
diff --git a/lib/tsan/go/test.c b/lib/tsan/go/test.c
index 94433f1..b3e31b1 100644
--- a/lib/tsan/go/test.c
+++ b/lib/tsan/go/test.c
@@ -12,22 +12,37 @@
 //===----------------------------------------------------------------------===//
 
 #include <stdio.h>
+#include <stdlib.h>
 
-void __tsan_init(void **thr, void (*cb)(void*));
+void __tsan_init(void **thr, void **proc, void (*cb)(long, void*));
 void __tsan_fini();
 void __tsan_map_shadow(void *addr, unsigned long size);
 void __tsan_go_start(void *thr, void **chthr, void *pc);
 void __tsan_go_end(void *thr);
+void __tsan_proc_create(void **pproc);
+void __tsan_proc_destroy(void *proc);
+void __tsan_proc_wire(void *proc, void *thr);
+void __tsan_proc_unwire(void *proc, void *thr);
 void __tsan_read(void *thr, void *addr, void *pc);
 void __tsan_write(void *thr, void *addr, void *pc);
 void __tsan_func_enter(void *thr, void *pc);
 void __tsan_func_exit(void *thr);
-void __tsan_malloc(void *p, unsigned long sz);
+void __tsan_malloc(void *thr, void *pc, void *p, unsigned long sz);
+void __tsan_free(void *p, unsigned long sz);
 void __tsan_acquire(void *thr, void *addr);
 void __tsan_release(void *thr, void *addr);
 void __tsan_release_merge(void *thr, void *addr);
 
-void symbolize_cb(void *ctx) {}
+void *current_proc;
+
+void symbolize_cb(long cmd, void *ctx) {
+  switch (cmd) {
+  case 0:
+    if (current_proc == 0)
+      abort();
+    *(void**)ctx = current_proc;
+  }
+}
 
 char buf0[100<<10];
 
@@ -36,18 +51,22 @@
 
 int main(void) {
   void *thr0 = 0;
+  void *proc0 = 0;
+  __tsan_init(&thr0, &proc0, symbolize_cb);
+  current_proc = proc0;
   char *buf = (char*)((unsigned long)buf0 + (64<<10) - 1 & ~((64<<10) - 1));
-  __tsan_malloc(buf, 10);
-  __tsan_init(&thr0, symbolize_cb);
   __tsan_map_shadow(buf, 4096);
+  __tsan_malloc(thr0, (char*)&barfoo + 1, buf, 10);
+  __tsan_free(buf, 10);
   __tsan_func_enter(thr0, (char*)&main + 1);
-  __tsan_malloc(buf, 10);
+  __tsan_malloc(thr0, (char*)&barfoo + 1, buf, 10);
   __tsan_release(thr0, buf);
   __tsan_release_merge(thr0, buf);
   void *thr1 = 0;
   __tsan_go_start(thr0, &thr1, (char*)&barfoo + 1);
   void *thr2 = 0;
   __tsan_go_start(thr0, &thr2, (char*)&barfoo + 1);
+  __tsan_func_exit(thr0);
   __tsan_func_enter(thr1, (char*)&foobar + 1);
   __tsan_func_enter(thr1, (char*)&foobar + 1);
   __tsan_write(thr1, buf, (char*)&barfoo + 1);
@@ -55,11 +74,16 @@
   __tsan_func_exit(thr1);
   __tsan_func_exit(thr1);
   __tsan_go_end(thr1);
+  void *proc1 = 0;
+  __tsan_proc_create(&proc1);
+  current_proc = proc1;
   __tsan_func_enter(thr2, (char*)&foobar + 1);
   __tsan_read(thr2, buf, (char*)&barfoo + 1);
+  __tsan_free(buf, 10);
   __tsan_func_exit(thr2);
   __tsan_go_end(thr2);
-  __tsan_func_exit(thr0);
+  __tsan_proc_destroy(proc1);
+  current_proc = proc0;
   __tsan_fini();
   return 0;
 }
diff --git a/lib/tsan/go/tsan_go.cc b/lib/tsan/go/tsan_go.cc
index ea0beb7..bc0d553 100644
--- a/lib/tsan/go/tsan_go.cc
+++ b/lib/tsan/go/tsan_go.cc
@@ -28,10 +28,6 @@
   return false;
 }
 
-ReportLocation *SymbolizeData(uptr addr) {
-  return 0;
-}
-
 void *internal_alloc(MBlockType typ, uptr sz) {
   return InternalAlloc(sz);
 }
@@ -40,7 +36,16 @@
   InternalFree(p);
 }
 
-struct SymbolizeContext {
+// Callback into Go.
+static void (*go_runtime_cb)(uptr cmd, void *ctx);
+
+enum {
+  CallbackGetProc = 0,
+  CallbackSymbolizeCode = 1,
+  CallbackSymbolizeData = 2,
+};
+
+struct SymbolizeCodeContext {
   uptr pc;
   char *func;
   char *file;
@@ -49,31 +54,83 @@
   uptr res;
 };
 
-// Callback into Go.
-static void (*symbolize_cb)(SymbolizeContext *ctx);
-
 SymbolizedStack *SymbolizeCode(uptr addr) {
   SymbolizedStack *s = SymbolizedStack::New(addr);
-  SymbolizeContext ctx;
-  internal_memset(&ctx, 0, sizeof(ctx));
-  ctx.pc = addr;
-  symbolize_cb(&ctx);
-  if (ctx.res) {
+  SymbolizeCodeContext cbctx;
+  internal_memset(&cbctx, 0, sizeof(cbctx));
+  cbctx.pc = addr;
+  go_runtime_cb(CallbackSymbolizeCode, &cbctx);
+  if (cbctx.res) {
     AddressInfo &info = s->info;
-    info.module_offset = ctx.off;
-    info.function = internal_strdup(ctx.func ? ctx.func : "??");
-    info.file = internal_strdup(ctx.file ? ctx.file : "-");
-    info.line = ctx.line;
+    info.module_offset = cbctx.off;
+    info.function = internal_strdup(cbctx.func ? cbctx.func : "??");
+    info.file = internal_strdup(cbctx.file ? cbctx.file : "-");
+    info.line = cbctx.line;
     info.column = 0;
   }
   return s;
 }
 
-extern "C" {
+struct SymbolizeDataContext {
+  uptr addr;
+  uptr heap;
+  uptr start;
+  uptr size;
+  char *name;
+  char *file;
+  uptr line;
+  uptr res;
+};
+
+ReportLocation *SymbolizeData(uptr addr) {
+  SymbolizeDataContext cbctx;
+  internal_memset(&cbctx, 0, sizeof(cbctx));
+  cbctx.addr = addr;
+  go_runtime_cb(CallbackSymbolizeData, &cbctx);
+  if (!cbctx.res)
+    return 0;
+  if (cbctx.heap) {
+    MBlock *b = ctx->metamap.GetBlock(cbctx.start);
+    if (!b)
+      return 0;
+    ReportLocation *loc = ReportLocation::New(ReportLocationHeap);
+    loc->heap_chunk_start = cbctx.start;
+    loc->heap_chunk_size = b->siz;
+    loc->tid = b->tid;
+    loc->stack = SymbolizeStackId(b->stk);
+    return loc;
+  } else {
+    ReportLocation *loc = ReportLocation::New(ReportLocationGlobal);
+    loc->global.name = internal_strdup(cbctx.name ? cbctx.name : "??");
+    loc->global.file = internal_strdup(cbctx.file ? cbctx.file : "??");
+    loc->global.line = cbctx.line;
+    loc->global.start = cbctx.start;
+    loc->global.size = cbctx.size;
+    return loc;
+  }
+}
 
 static ThreadState *main_thr;
 static bool inited;
 
+static Processor* get_cur_proc() {
+  if (UNLIKELY(!inited)) {
+    // Running Initialize().
+    // We have not yet returned the Processor to Go, so we cannot ask it back.
+    // Currently, Initialize() does not use the Processor, so return nullptr.
+    return nullptr;
+  }
+  Processor *proc;
+  go_runtime_cb(CallbackGetProc, &proc);
+  return proc;
+}
+
+Processor *ThreadState::proc() {
+  return get_cur_proc();
+}
+
+extern "C" {
+
 static ThreadState *AllocGoroutine() {
   ThreadState *thr = (ThreadState*)internal_alloc(MBlockThreadContex,
       sizeof(ThreadState));
@@ -81,11 +138,13 @@
   return thr;
 }
 
-void __tsan_init(ThreadState **thrp, void (*cb)(SymbolizeContext *cb)) {
-  symbolize_cb = cb;
+void __tsan_init(ThreadState **thrp, Processor **procp,
+                 void (*cb)(uptr cmd, void *cb)) {
+  go_runtime_cb = cb;
   ThreadState *thr = AllocGoroutine();
   main_thr = *thrp = thr;
   Initialize(thr);
+  *procp = thr->proc1;
   inited = true;
 }
 
@@ -140,12 +199,17 @@
   FuncExit(thr);
 }
 
-void __tsan_malloc(void *p, uptr sz) {
-  if (!inited)
-    return;
+void __tsan_malloc(ThreadState *thr, uptr pc, uptr p, uptr sz) {
+  CHECK(inited);
+  if (thr && pc)
+    ctx->metamap.AllocBlock(thr, pc, p, sz);
   MemoryResetRange(0, 0, (uptr)p, sz);
 }
 
+void __tsan_free(uptr p, uptr sz) {
+  ctx->metamap.FreeRange(get_cur_proc(), p, sz);
+}
+
 void __tsan_go_start(ThreadState *parent, ThreadState **pthr, void *pc) {
   ThreadState *thr = AllocGoroutine();
   *pthr = thr;
@@ -158,6 +222,14 @@
   internal_free(thr);
 }
 
+void __tsan_proc_create(Processor **pproc) {
+  *pproc = ProcCreate();
+}
+
+void __tsan_proc_destroy(Processor *proc) {
+  ProcDestroy(proc);
+}
+
 void __tsan_acquire(ThreadState *thr, void *addr) {
   Acquire(thr, 0, (uptr)addr);
 }
diff --git a/lib/tsan/rtl/tsan_debugging.cc b/lib/tsan/rtl/tsan_debugging.cc
new file mode 100644
index 0000000..ac24c89
--- /dev/null
+++ b/lib/tsan/rtl/tsan_debugging.cc
@@ -0,0 +1,162 @@
+//===-- tsan_debugging.cc -------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of ThreadSanitizer (TSan), a race detector.
+//
+// TSan debugging API implementation.
+//===----------------------------------------------------------------------===//
+#include "tsan_interface.h"
+#include "tsan_report.h"
+#include "tsan_rtl.h"
+
+using namespace __tsan;
+
+static const char *ReportTypeDescription(ReportType typ) {
+  if (typ == ReportTypeRace) return "data-race";
+  if (typ == ReportTypeVptrRace) return "data-race-vptr";
+  if (typ == ReportTypeUseAfterFree) return "heap-use-after-free";
+  if (typ == ReportTypeVptrUseAfterFree) return "heap-use-after-free-vptr";
+  if (typ == ReportTypeThreadLeak) return "thread-leak";
+  if (typ == ReportTypeMutexDestroyLocked) return "locked-mutex-destroy";
+  if (typ == ReportTypeMutexDoubleLock) return "mutex-double-lock";
+  if (typ == ReportTypeMutexInvalidAccess) return "mutex-invalid-access";
+  if (typ == ReportTypeMutexBadUnlock) return "mutex-bad-unlock";
+  if (typ == ReportTypeMutexBadReadLock) return "mutex-bad-read-lock";
+  if (typ == ReportTypeMutexBadReadUnlock) return "mutex-bad-read-unlock";
+  if (typ == ReportTypeSignalUnsafe) return "signal-unsafe-call";
+  if (typ == ReportTypeErrnoInSignal) return "errno-in-signal-handler";
+  if (typ == ReportTypeDeadlock) return "lock-order-inversion";
+  return "";
+}
+
+static const char *ReportLocationTypeDescription(ReportLocationType typ) {
+  if (typ == ReportLocationGlobal) return "global";
+  if (typ == ReportLocationHeap) return "heap";
+  if (typ == ReportLocationStack) return "stack";
+  if (typ == ReportLocationTLS) return "tls";
+  if (typ == ReportLocationFD) return "fd";
+  return "";
+}
+
+static void CopyTrace(SymbolizedStack *first_frame, void **trace,
+                      uptr trace_size) {
+  uptr i = 0;
+  for (SymbolizedStack *frame = first_frame; frame != nullptr;
+       frame = frame->next) {
+    trace[i++] = (void *)frame->info.address;
+    if (i >= trace_size) break;
+  }
+}
+
+// Meant to be called by the debugger.
+SANITIZER_INTERFACE_ATTRIBUTE
+void *__tsan_get_current_report() {
+  return const_cast<ReportDesc*>(cur_thread()->current_report);
+}
+
+SANITIZER_INTERFACE_ATTRIBUTE
+int __tsan_get_report_data(void *report, const char **description, int *count,
+                           int *stack_count, int *mop_count, int *loc_count,
+                           int *mutex_count, int *thread_count,
+                           int *unique_tid_count, void **sleep_trace,
+                           uptr trace_size) {
+  const ReportDesc *rep = (ReportDesc *)report;
+  *description = ReportTypeDescription(rep->typ);
+  *count = rep->count;
+  *stack_count = rep->stacks.Size();
+  *mop_count = rep->mops.Size();
+  *loc_count = rep->locs.Size();
+  *mutex_count = rep->mutexes.Size();
+  *thread_count = rep->threads.Size();
+  *unique_tid_count = rep->unique_tids.Size();
+  if (rep->sleep) CopyTrace(rep->sleep->frames, sleep_trace, trace_size);
+  return 1;
+}
+
+SANITIZER_INTERFACE_ATTRIBUTE
+int __tsan_get_report_stack(void *report, uptr idx, void **trace,
+                            uptr trace_size) {
+  const ReportDesc *rep = (ReportDesc *)report;
+  CHECK_LT(idx, rep->stacks.Size());
+  ReportStack *stack = rep->stacks[idx];
+  if (stack) CopyTrace(stack->frames, trace, trace_size);
+  return stack ? 1 : 0;
+}
+
+SANITIZER_INTERFACE_ATTRIBUTE
+int __tsan_get_report_mop(void *report, uptr idx, int *tid, void **addr,
+                          int *size, int *write, int *atomic, void **trace,
+                          uptr trace_size) {
+  const ReportDesc *rep = (ReportDesc *)report;
+  CHECK_LT(idx, rep->mops.Size());
+  ReportMop *mop = rep->mops[idx];
+  *tid = mop->tid;
+  *addr = (void *)mop->addr;
+  *size = mop->size;
+  *write = mop->write ? 1 : 0;
+  *atomic = mop->atomic ? 1 : 0;
+  if (mop->stack) CopyTrace(mop->stack->frames, trace, trace_size);
+  return 1;
+}
+
+SANITIZER_INTERFACE_ATTRIBUTE
+int __tsan_get_report_loc(void *report, uptr idx, const char **type,
+                          void **addr, uptr *start, uptr *size, int *tid,
+                          int *fd, int *suppressable, void **trace,
+                          uptr trace_size) {
+  const ReportDesc *rep = (ReportDesc *)report;
+  CHECK_LT(idx, rep->locs.Size());
+  ReportLocation *loc = rep->locs[idx];
+  *type = ReportLocationTypeDescription(loc->type);
+  *addr = (void *)loc->global.start;
+  *start = loc->heap_chunk_start;
+  *size = loc->heap_chunk_size;
+  *tid = loc->tid;
+  *fd = loc->fd;
+  *suppressable = loc->suppressable;
+  if (loc->stack) CopyTrace(loc->stack->frames, trace, trace_size);
+  return 1;
+}
+
+SANITIZER_INTERFACE_ATTRIBUTE
+int __tsan_get_report_mutex(void *report, uptr idx, uptr *mutex_id, void **addr,
+                            int *destroyed, void **trace, uptr trace_size) {
+  const ReportDesc *rep = (ReportDesc *)report;
+  CHECK_LT(idx, rep->mutexes.Size());
+  ReportMutex *mutex = rep->mutexes[idx];
+  *mutex_id = mutex->id;
+  *addr = (void *)mutex->addr;
+  *destroyed = mutex->destroyed;
+  if (mutex->stack) CopyTrace(mutex->stack->frames, trace, trace_size);
+  return 1;
+}
+
+SANITIZER_INTERFACE_ATTRIBUTE
+int __tsan_get_report_thread(void *report, uptr idx, int *tid, uptr *os_id,
+                             int *running, const char **name, int *parent_tid,
+                             void **trace, uptr trace_size) {
+  const ReportDesc *rep = (ReportDesc *)report;
+  CHECK_LT(idx, rep->threads.Size());
+  ReportThread *thread = rep->threads[idx];
+  *tid = thread->id;
+  *os_id = thread->os_id;
+  *running = thread->running;
+  *name = thread->name;
+  *parent_tid = thread->parent_tid;
+  if (thread->stack) CopyTrace(thread->stack->frames, trace, trace_size);
+  return 1;
+}
+
+SANITIZER_INTERFACE_ATTRIBUTE
+int __tsan_get_report_unique_tid(void *report, uptr idx, int *tid) {
+  const ReportDesc *rep = (ReportDesc *)report;
+  CHECK_LT(idx, rep->unique_tids.Size());
+  *tid = rep->unique_tids[idx];
+  return 1;
+}
diff --git a/lib/tsan/rtl/tsan_defs.h b/lib/tsan/rtl/tsan_defs.h
index 9c7b329..cdc23d0 100644
--- a/lib/tsan/rtl/tsan_defs.h
+++ b/lib/tsan/rtl/tsan_defs.h
@@ -29,7 +29,11 @@
 #endif
 
 #ifndef TSAN_CONTAINS_UBSAN
-# define TSAN_CONTAINS_UBSAN (CAN_SANITIZE_UB && !defined(SANITIZER_GO))
+# if CAN_SANITIZE_UB && !defined(SANITIZER_GO)
+#  define TSAN_CONTAINS_UBSAN 1
+# else
+#  define TSAN_CONTAINS_UBSAN 0
+# endif
 #endif
 
 namespace __tsan {
@@ -145,6 +149,7 @@
 
 MD5Hash md5_hash(const void *data, uptr size);
 
+struct Processor;
 struct ThreadState;
 class ThreadContext;
 struct Context;
diff --git a/lib/tsan/rtl/tsan_flags.cc b/lib/tsan/rtl/tsan_flags.cc
index 7615231..93f5986 100644
--- a/lib/tsan/rtl/tsan_flags.cc
+++ b/lib/tsan/rtl/tsan_flags.cc
@@ -71,6 +71,7 @@
     cf.print_suppressions = false;
     cf.stack_trace_format = "    #%n %f %S %M";
     cf.exitcode = 66;
+    cf.intercept_tls_get_addr = true;
     OverrideCommonFlags(cf);
   }
 
@@ -108,7 +109,7 @@
     f->report_signal_unsafe = false;
   }
 
-  SetVerbosity(common_flags()->verbosity);
+  InitializeCommonFlags();
 
   if (Verbosity()) ReportUnrecognizedFlags();
 
diff --git a/lib/tsan/rtl/tsan_flags.inc b/lib/tsan/rtl/tsan_flags.inc
index ab9ca99..4fb4436 100644
--- a/lib/tsan/rtl/tsan_flags.inc
+++ b/lib/tsan/rtl/tsan_flags.inc
@@ -76,3 +76,7 @@
 TSAN_FLAG(bool, die_after_fork, true,
           "Die after multi-threaded fork if the child creates new threads.")
 TSAN_FLAG(const char *, suppressions, "", "Suppressions file name.")
+TSAN_FLAG(bool, ignore_interceptors_accesses, false,
+          "Ignore reads and writes from all interceptors.")
+TSAN_FLAG(bool, shared_ptr_interceptor, true,
+          "Track atomic reference counting in libc++ shared_ptr and weak_ptr.")
diff --git a/lib/tsan/rtl/tsan_interceptors.cc b/lib/tsan/rtl/tsan_interceptors.cc
index 62c96cb..fb62276 100644
--- a/lib/tsan/rtl/tsan_interceptors.cc
+++ b/lib/tsan/rtl/tsan_interceptors.cc
@@ -19,6 +19,7 @@
 #include "sanitizer_common/sanitizer_platform_limits_posix.h"
 #include "sanitizer_common/sanitizer_placement_new.h"
 #include "sanitizer_common/sanitizer_stacktrace.h"
+#include "sanitizer_common/sanitizer_tls_get_addr.h"
 #include "interception/interception.h"
 #include "tsan_interceptors.h"
 #include "tsan_interface.h"
@@ -40,20 +41,8 @@
 #define stderr __stderrp
 #endif
 
-#if SANITIZER_FREEBSD
-#define __libc_realloc __realloc
-#define __libc_calloc __calloc
-#elif SANITIZER_MAC
-#define __libc_malloc REAL(malloc)
-#define __libc_realloc REAL(realloc)
-#define __libc_calloc REAL(calloc)
-#define __libc_free REAL(free)
-#elif SANITIZER_ANDROID
+#if SANITIZER_ANDROID
 #define __errno_location __errno
-#define __libc_malloc REAL(malloc)
-#define __libc_realloc REAL(realloc)
-#define __libc_calloc REAL(calloc)
-#define __libc_free REAL(free)
 #define mallopt(a, b)
 #endif
 
@@ -86,11 +75,9 @@
 };
 #endif
 
-#if defined(__x86_64__) || defined(__mips__) \
-  || (defined(__powerpc64__) && defined(__BIG_ENDIAN__))
+#if defined(__x86_64__) || defined(__mips__) || SANITIZER_PPC64V1
 #define PTHREAD_ABI_BASE  "GLIBC_2.3.2"
-#elif defined(__aarch64__) || (defined(__powerpc64__) \
-  && defined(__LITTLE_ENDIAN__))
+#elif defined(__aarch64__) || SANITIZER_PPC64V2
 #define PTHREAD_ABI_BASE  "GLIBC_2.17"
 #endif
 
@@ -103,8 +90,6 @@
 DECLARE_REAL(int, pthread_mutexattr_gettype, void *, void *)
 extern "C" int pthread_sigmask(int how, const __sanitizer_sigset_t *set,
                                __sanitizer_sigset_t *oldset);
-// REAL(sigfillset) defined in common interceptors.
-DECLARE_REAL(int, sigfillset, __sanitizer_sigset_t *set)
 DECLARE_REAL(int, fflush, __sanitizer_FILE *fp)
 DECLARE_REAL_AND_INTERCEPTOR(void *, malloc, uptr size)
 DECLARE_REAL_AND_INTERCEPTOR(void, free, void *ptr)
@@ -112,21 +97,22 @@
 extern "C" void _exit(int status);
 extern "C" int *__errno_location();
 extern "C" int fileno_unlocked(void *stream);
-#if !SANITIZER_ANDROID
-extern "C" void *__libc_calloc(uptr size, uptr n);
-extern "C" void *__libc_realloc(void *ptr, uptr size);
-#endif
 extern "C" int dirfd(void *dirp);
 #if !SANITIZER_FREEBSD && !SANITIZER_ANDROID
 extern "C" int mallopt(int param, int value);
 #endif
 extern __sanitizer_FILE *stdout, *stderr;
+#if !SANITIZER_FREEBSD && !SANITIZER_MAC
 const int PTHREAD_MUTEX_RECURSIVE = 1;
 const int PTHREAD_MUTEX_RECURSIVE_NP = 1;
+#else
+const int PTHREAD_MUTEX_RECURSIVE = 2;
+const int PTHREAD_MUTEX_RECURSIVE_NP = 2;
+#endif
 const int EINVAL = 22;
 const int EBUSY = 16;
 const int EOWNERDEAD = 130;
-#if !SANITIZER_MAC
+#if !SANITIZER_FREEBSD && !SANITIZER_MAC
 const int EPOLL_CTL_ADD = 1;
 #endif
 const int SIGILL = 4;
@@ -135,7 +121,7 @@
 const int SIGSEGV = 11;
 const int SIGPIPE = 13;
 const int SIGTERM = 15;
-#if defined(__mips__) || SANITIZER_MAC
+#if defined(__mips__) || SANITIZER_FREEBSD || SANITIZER_MAC
 const int SIGBUS = 10;
 const int SIGSYS = 12;
 #else
@@ -165,7 +151,7 @@
   u32 sa_flags;
   union {
     sighandler_t sa_handler;
-    sigactionhandler_t sa_sgiaction;
+    sigactionhandler_t sa_sigaction;
   };
   __sanitizer_sigset_t sa_mask;
   void (*sa_restorer)();
@@ -271,19 +257,24 @@
     : thr_(thr)
     , pc_(pc)
     , in_ignored_lib_(false) {
-  if (!thr_->ignore_interceptors) {
-    Initialize(thr);
+  Initialize(thr);
+  if (!thr_->is_inited)
+    return;
+  if (!thr_->ignore_interceptors)
     FuncEntry(thr, pc);
-  }
   DPrintf("#%d: intercept %s()\n", thr_->tid, fname);
   if (!thr_->in_ignored_lib && libignore()->IsIgnored(pc)) {
     in_ignored_lib_ = true;
     thr_->in_ignored_lib = true;
     ThreadIgnoreBegin(thr_, pc_);
   }
+  if (flags()->ignore_interceptors_accesses) ThreadIgnoreBegin(thr_, pc_);
 }
 
 ScopedInterceptor::~ScopedInterceptor() {
+  if (!thr_->is_inited)
+    return;
+  if (flags()->ignore_interceptors_accesses) ThreadIgnoreEnd(thr_, pc_);
   if (in_ignored_lib_) {
     thr_->in_ignored_lib = false;
     ThreadIgnoreEnd(thr_, pc_);
@@ -296,6 +287,7 @@
 }
 
 void ScopedInterceptor::UserCallbackStart() {
+  if (flags()->ignore_interceptors_accesses) ThreadIgnoreEnd(thr_, pc_);
   if (in_ignored_lib_) {
     thr_->in_ignored_lib = false;
     ThreadIgnoreEnd(thr_, pc_);
@@ -307,6 +299,7 @@
     thr_->in_ignored_lib = true;
     ThreadIgnoreBegin(thr_, pc_);
   }
+  if (flags()->ignore_interceptors_accesses) ThreadIgnoreBegin(thr_, pc_);
 }
 
 #define TSAN_INTERCEPT(func) INTERCEPT_FUNCTION(func)
@@ -387,7 +380,7 @@
   Acquire(thr, pc, (uptr)arg);
   AtExitCtx *ctx = (AtExitCtx*)arg;
   ((void(*)(void *arg))ctx->f)(ctx->arg);
-  __libc_free(ctx);
+  InternalFree(ctx);
 }
 
 static int setup_at_exit_wrapper(ThreadState *thr, uptr pc, void(*f)(),
@@ -413,7 +406,7 @@
 
 static int setup_at_exit_wrapper(ThreadState *thr, uptr pc, void(*f)(),
       void *arg, void *dso) {
-  AtExitCtx *ctx = (AtExitCtx*)__libc_malloc(sizeof(AtExitCtx));
+  AtExitCtx *ctx = (AtExitCtx*)InternalAlloc(sizeof(AtExitCtx));
   ctx->f = f;
   ctx->arg = arg;
   Release(thr, pc, (uptr)ctx);
@@ -432,14 +425,14 @@
   Acquire(thr, pc, (uptr)arg);
   AtExitCtx *ctx = (AtExitCtx*)arg;
   ((void(*)(int status, void *arg))ctx->f)(status, ctx->arg);
-  __libc_free(ctx);
+  InternalFree(ctx);
 }
 
 TSAN_INTERCEPTOR(int, on_exit, void(*f)(int, void*), void *arg) {
   if (cur_thread()->in_symbolizer)
     return 0;
   SCOPED_TSAN_INTERCEPTOR(on_exit, f, arg);
-  AtExitCtx *ctx = (AtExitCtx*)__libc_malloc(sizeof(AtExitCtx));
+  AtExitCtx *ctx = (AtExitCtx*)InternalAlloc(sizeof(AtExitCtx));
   ctx->f = (void(*)())f;
   ctx->arg = arg;
   Release(thr, pc, (uptr)ctx);
@@ -571,8 +564,11 @@
 #endif  // SANITIZER_MAC
 
 TSAN_INTERCEPTOR(void, longjmp, uptr *env, int val) {
+  // Note: if we call REAL(longjmp) in the context of ScopedInterceptor,
+  // bad things will happen. We will jump over ScopedInterceptor dtor and can
+  // leave thr->in_ignored_lib set.
   {
-    SCOPED_TSAN_INTERCEPTOR(longjmp, env, val);
+    SCOPED_INTERCEPTOR_RAW(longjmp, env, val);
   }
   LongJmp(cur_thread(), env);
   REAL(longjmp)(env, val);
@@ -580,7 +576,7 @@
 
 TSAN_INTERCEPTOR(void, siglongjmp, uptr *env, int val) {
   {
-    SCOPED_TSAN_INTERCEPTOR(siglongjmp, env, val);
+    SCOPED_INTERCEPTOR_RAW(siglongjmp, env, val);
   }
   LongJmp(cur_thread(), env);
   REAL(siglongjmp)(env, val);
@@ -589,7 +585,7 @@
 #if !SANITIZER_MAC
 TSAN_INTERCEPTOR(void*, malloc, uptr size) {
   if (cur_thread()->in_symbolizer)
-    return __libc_malloc(size);
+    return InternalAlloc(size);
   void *p = 0;
   {
     SCOPED_INTERCEPTOR_RAW(malloc, size);
@@ -606,7 +602,7 @@
 
 TSAN_INTERCEPTOR(void*, calloc, uptr size, uptr n) {
   if (cur_thread()->in_symbolizer)
-    return __libc_calloc(size, n);
+    return InternalCalloc(size, n);
   void *p = 0;
   {
     SCOPED_INTERCEPTOR_RAW(calloc, size, n);
@@ -618,7 +614,7 @@
 
 TSAN_INTERCEPTOR(void*, realloc, void *p, uptr size) {
   if (cur_thread()->in_symbolizer)
-    return __libc_realloc(p, size);
+    return InternalRealloc(p, size);
   if (p)
     invoke_free_hook(p);
   {
@@ -633,7 +629,7 @@
   if (p == 0)
     return;
   if (cur_thread()->in_symbolizer)
-    return __libc_free(p);
+    return InternalFree(p);
   invoke_free_hook(p);
   SCOPED_INTERCEPTOR_RAW(free, p);
   user_free(thr, pc, p);
@@ -643,7 +639,7 @@
   if (p == 0)
     return;
   if (cur_thread()->in_symbolizer)
-    return __libc_free(p);
+    return InternalFree(p);
   invoke_free_hook(p);
   SCOPED_INTERCEPTOR_RAW(cfree, p);
   user_free(thr, pc, p);
@@ -655,69 +651,6 @@
 }
 #endif
 
-TSAN_INTERCEPTOR(uptr, strlen, const char *s) {
-  SCOPED_TSAN_INTERCEPTOR(strlen, s);
-  uptr len = internal_strlen(s);
-  MemoryAccessRange(thr, pc, (uptr)s, len + 1, false);
-  return len;
-}
-
-TSAN_INTERCEPTOR(void*, memset, void *dst, int v, uptr size) {
-  // On FreeBSD we get here from libthr internals on thread initialization.
-  if (!COMMON_INTERCEPTOR_NOTHING_IS_INITIALIZED) {
-    SCOPED_TSAN_INTERCEPTOR(memset, dst, v, size);
-    MemoryAccessRange(thr, pc, (uptr)dst, size, true);
-  }
-  return internal_memset(dst, v, size);
-}
-
-TSAN_INTERCEPTOR(void*, memcpy, void *dst, const void *src, uptr size) {
-  // On FreeBSD we get here from libthr internals on thread initialization.
-  if (!COMMON_INTERCEPTOR_NOTHING_IS_INITIALIZED) {
-    SCOPED_TSAN_INTERCEPTOR(memcpy, dst, src, size);
-    MemoryAccessRange(thr, pc, (uptr)dst, size, true);
-    MemoryAccessRange(thr, pc, (uptr)src, size, false);
-  }
-  // On OS X, calling internal_memcpy here will cause memory corruptions,
-  // because memcpy and memmove are actually aliases of the same implementation.
-  // We need to use internal_memmove here.
-  return internal_memmove(dst, src, size);
-}
-
-TSAN_INTERCEPTOR(void*, memmove, void *dst, void *src, uptr n) {
-  if (!COMMON_INTERCEPTOR_NOTHING_IS_INITIALIZED) {
-    SCOPED_TSAN_INTERCEPTOR(memmove, dst, src, n);
-    MemoryAccessRange(thr, pc, (uptr)dst, n, true);
-    MemoryAccessRange(thr, pc, (uptr)src, n, false);
-  }
-  return REAL(memmove)(dst, src, n);
-}
-
-TSAN_INTERCEPTOR(char*, strchr, char *s, int c) {
-  SCOPED_TSAN_INTERCEPTOR(strchr, s, c);
-  char *res = REAL(strchr)(s, c);
-  uptr len = internal_strlen(s);
-  uptr n = res ? (char*)res - (char*)s + 1 : len + 1;
-  READ_STRING_OF_LEN(thr, pc, s, len, n);
-  return res;
-}
-
-#if !SANITIZER_MAC
-TSAN_INTERCEPTOR(char*, strchrnul, char *s, int c) {
-  SCOPED_TSAN_INTERCEPTOR(strchrnul, s, c);
-  char *res = REAL(strchrnul)(s, c);
-  uptr len = (char*)res - (char*)s + 1;
-  READ_STRING(thr, pc, s, len);
-  return res;
-}
-#endif
-
-TSAN_INTERCEPTOR(char*, strrchr, char *s, int c) {
-  SCOPED_TSAN_INTERCEPTOR(strrchr, s, c);
-  MemoryAccessRange(thr, pc, (uptr)s, internal_strlen(s) + 1, false);
-  return REAL(strrchr)(s, c);
-}
-
 TSAN_INTERCEPTOR(char*, strcpy, char *dst, const char *src) {  // NOLINT
   SCOPED_TSAN_INTERCEPTOR(strcpy, dst, src);  // NOLINT
   uptr srclen = internal_strlen(src);
@@ -763,7 +696,11 @@
   if (res != MAP_FAILED) {
     if (fd > 0)
       FdAccess(thr, pc, fd);
-    MemoryRangeImitateWrite(thr, pc, (uptr)res, sz);
+
+    if (thr->ignore_reads_and_writes == 0)
+      MemoryRangeImitateWrite(thr, pc, (uptr)res, sz);
+    else
+      MemoryResetRange(thr, pc, (uptr)res, sz);
   }
   return res;
 }
@@ -778,7 +715,11 @@
   if (res != MAP_FAILED) {
     if (fd > 0)
       FdAccess(thr, pc, fd);
-    MemoryRangeImitateWrite(thr, pc, (uptr)res, sz);
+
+    if (thr->ignore_reads_and_writes == 0)
+      MemoryRangeImitateWrite(thr, pc, (uptr)res, sz);
+    else
+      MemoryResetRange(thr, pc, (uptr)res, sz);
   }
   return res;
 }
@@ -792,7 +733,8 @@
   if (sz != 0) {
     // If sz == 0, munmap will return EINVAL and don't unmap any memory.
     DontNeedShadowFor((uptr)addr, sz);
-    ctx->metamap.ResetRange(thr, pc, (uptr)addr, (uptr)sz);
+    ScopedGlobalProcessor sgp;
+    ctx->metamap.ResetRange(thr->proc(), (uptr)addr, (uptr)sz);
   }
   int res = REAL(munmap)(addr, sz);
   return res;
@@ -887,12 +829,16 @@
 namespace __tsan {
 void DestroyThreadState() {
   ThreadState *thr = cur_thread();
+  Processor *proc = thr->proc();
   ThreadFinish(thr);
+  ProcUnwire(proc, thr);
+  ProcDestroy(proc);
   ThreadSignalContext *sctx = thr->signal_ctx;
   if (sctx) {
     thr->signal_ctx = 0;
     UnmapOrDie(sctx, sizeof(*sctx));
   }
+  DTLS_Destroy();
   cur_thread_finalize();
 }
 }  // namespace __tsan
@@ -938,6 +884,8 @@
 #endif
     while ((tid = atomic_load(&p->tid, memory_order_acquire)) == 0)
       internal_sched_yield();
+    Processor *proc = ProcCreate();
+    ProcWire(proc, thr);
     ThreadStart(thr, tid, GetTid());
     atomic_store(&p->tid, 0, memory_order_release);
   }
@@ -1095,12 +1043,12 @@
   return REAL(pthread_cond_init)(cond, a);
 }
 
-INTERCEPTOR(int, pthread_cond_wait, void *c, void *m) {
-  void *cond = init_cond(c);
-  SCOPED_TSAN_INTERCEPTOR(pthread_cond_wait, cond, m);
+static int cond_wait(ThreadState *thr, uptr pc, ScopedInterceptor *si,
+                     int (*fn)(void *c, void *m, void *abstime), void *c,
+                     void *m, void *t) {
   MemoryAccessRange(thr, pc, (uptr)c, sizeof(uptr), false);
   MutexUnlock(thr, pc, (uptr)m);
-  CondMutexUnlockCtx arg = {&si, thr, pc, m};
+  CondMutexUnlockCtx arg = {si, thr, pc, m};
   int res = 0;
   // This ensures that we handle mutex lock even in case of pthread_cancel.
   // See test/tsan/cond_cancel.cc.
@@ -1108,36 +1056,38 @@
     // Enable signal delivery while the thread is blocked.
     BlockingCall bc(thr);
     res = call_pthread_cancel_with_cleanup(
-        (int(*)(void *c, void *m, void *abstime))REAL(pthread_cond_wait),
-        cond, m, 0, (void(*)(void *arg))cond_mutex_unlock, &arg);
+        fn, c, m, t, (void (*)(void *arg))cond_mutex_unlock, &arg);
   }
-  if (res == errno_EOWNERDEAD)
-    MutexRepair(thr, pc, (uptr)m);
+  if (res == errno_EOWNERDEAD) MutexRepair(thr, pc, (uptr)m);
   MutexLock(thr, pc, (uptr)m);
   return res;
 }
 
+INTERCEPTOR(int, pthread_cond_wait, void *c, void *m) {
+  void *cond = init_cond(c);
+  SCOPED_TSAN_INTERCEPTOR(pthread_cond_wait, cond, m);
+  return cond_wait(thr, pc, &si, (int (*)(void *c, void *m, void *abstime))REAL(
+                                     pthread_cond_wait),
+                   cond, m, 0);
+}
+
 INTERCEPTOR(int, pthread_cond_timedwait, void *c, void *m, void *abstime) {
   void *cond = init_cond(c);
   SCOPED_TSAN_INTERCEPTOR(pthread_cond_timedwait, cond, m, abstime);
-  MemoryAccessRange(thr, pc, (uptr)c, sizeof(uptr), false);
-  MutexUnlock(thr, pc, (uptr)m);
-  CondMutexUnlockCtx arg = {&si, thr, pc, m};
-  int res = 0;
-  // This ensures that we handle mutex lock even in case of pthread_cancel.
-  // See test/tsan/cond_cancel.cc.
-  {
-    BlockingCall bc(thr);
-    res = call_pthread_cancel_with_cleanup(
-        REAL(pthread_cond_timedwait), cond, m, abstime,
-        (void(*)(void *arg))cond_mutex_unlock, &arg);
-  }
-  if (res == errno_EOWNERDEAD)
-    MutexRepair(thr, pc, (uptr)m);
-  MutexLock(thr, pc, (uptr)m);
-  return res;
+  return cond_wait(thr, pc, &si, REAL(pthread_cond_timedwait), cond, m,
+                   abstime);
 }
 
+#if SANITIZER_MAC
+INTERCEPTOR(int, pthread_cond_timedwait_relative_np, void *c, void *m,
+            void *reltime) {
+  void *cond = init_cond(c);
+  SCOPED_TSAN_INTERCEPTOR(pthread_cond_timedwait_relative_np, cond, m, reltime);
+  return cond_wait(thr, pc, &si, REAL(pthread_cond_timedwait_relative_np), cond,
+                   m, reltime);
+}
+#endif
+
 INTERCEPTOR(int, pthread_cond_signal, void *c) {
   void *cond = init_cond(c);
   SCOPED_TSAN_INTERCEPTOR(pthread_cond_signal, cond);
@@ -1395,96 +1345,6 @@
 }
 
 #if SANITIZER_LINUX && !SANITIZER_ANDROID
-TSAN_INTERCEPTOR(int, __xstat, int version, const char *path, void *buf) {
-  SCOPED_TSAN_INTERCEPTOR(__xstat, version, path, buf);
-  READ_STRING(thr, pc, path, 0);
-  return REAL(__xstat)(version, path, buf);
-}
-#define TSAN_MAYBE_INTERCEPT___XSTAT TSAN_INTERCEPT(__xstat)
-#else
-#define TSAN_MAYBE_INTERCEPT___XSTAT
-#endif
-
-TSAN_INTERCEPTOR(int, stat, const char *path, void *buf) {
-#if SANITIZER_FREEBSD || SANITIZER_MAC || SANITIZER_ANDROID
-  SCOPED_TSAN_INTERCEPTOR(stat, path, buf);
-  READ_STRING(thr, pc, path, 0);
-  return REAL(stat)(path, buf);
-#else
-  SCOPED_TSAN_INTERCEPTOR(__xstat, 0, path, buf);
-  READ_STRING(thr, pc, path, 0);
-  return REAL(__xstat)(0, path, buf);
-#endif
-}
-
-#if SANITIZER_LINUX && !SANITIZER_ANDROID
-TSAN_INTERCEPTOR(int, __xstat64, int version, const char *path, void *buf) {
-  SCOPED_TSAN_INTERCEPTOR(__xstat64, version, path, buf);
-  READ_STRING(thr, pc, path, 0);
-  return REAL(__xstat64)(version, path, buf);
-}
-#define TSAN_MAYBE_INTERCEPT___XSTAT64 TSAN_INTERCEPT(__xstat64)
-#else
-#define TSAN_MAYBE_INTERCEPT___XSTAT64
-#endif
-
-#if SANITIZER_LINUX && !SANITIZER_ANDROID
-TSAN_INTERCEPTOR(int, stat64, const char *path, void *buf) {
-  SCOPED_TSAN_INTERCEPTOR(__xstat64, 0, path, buf);
-  READ_STRING(thr, pc, path, 0);
-  return REAL(__xstat64)(0, path, buf);
-}
-#define TSAN_MAYBE_INTERCEPT_STAT64 TSAN_INTERCEPT(stat64)
-#else
-#define TSAN_MAYBE_INTERCEPT_STAT64
-#endif
-
-#if SANITIZER_LINUX && !SANITIZER_ANDROID
-TSAN_INTERCEPTOR(int, __lxstat, int version, const char *path, void *buf) {
-  SCOPED_TSAN_INTERCEPTOR(__lxstat, version, path, buf);
-  READ_STRING(thr, pc, path, 0);
-  return REAL(__lxstat)(version, path, buf);
-}
-#define TSAN_MAYBE_INTERCEPT___LXSTAT TSAN_INTERCEPT(__lxstat)
-#else
-#define TSAN_MAYBE_INTERCEPT___LXSTAT
-#endif
-
-TSAN_INTERCEPTOR(int, lstat, const char *path, void *buf) {
-#if SANITIZER_FREEBSD || SANITIZER_MAC || SANITIZER_ANDROID
-  SCOPED_TSAN_INTERCEPTOR(lstat, path, buf);
-  READ_STRING(thr, pc, path, 0);
-  return REAL(lstat)(path, buf);
-#else
-  SCOPED_TSAN_INTERCEPTOR(__lxstat, 0, path, buf);
-  READ_STRING(thr, pc, path, 0);
-  return REAL(__lxstat)(0, path, buf);
-#endif
-}
-
-#if SANITIZER_LINUX && !SANITIZER_ANDROID
-TSAN_INTERCEPTOR(int, __lxstat64, int version, const char *path, void *buf) {
-  SCOPED_TSAN_INTERCEPTOR(__lxstat64, version, path, buf);
-  READ_STRING(thr, pc, path, 0);
-  return REAL(__lxstat64)(version, path, buf);
-}
-#define TSAN_MAYBE_INTERCEPT___LXSTAT64 TSAN_INTERCEPT(__lxstat64)
-#else
-#define TSAN_MAYBE_INTERCEPT___LXSTAT64
-#endif
-
-#if SANITIZER_LINUX && !SANITIZER_ANDROID
-TSAN_INTERCEPTOR(int, lstat64, const char *path, void *buf) {
-  SCOPED_TSAN_INTERCEPTOR(__lxstat64, 0, path, buf);
-  READ_STRING(thr, pc, path, 0);
-  return REAL(__lxstat64)(0, path, buf);
-}
-#define TSAN_MAYBE_INTERCEPT_LSTAT64 TSAN_INTERCEPT(lstat64)
-#else
-#define TSAN_MAYBE_INTERCEPT_LSTAT64
-#endif
-
-#if SANITIZER_LINUX && !SANITIZER_ANDROID
 TSAN_INTERCEPTOR(int, __fxstat, int version, int fd, void *buf) {
   SCOPED_TSAN_INTERCEPTOR(__fxstat, version, fd, buf);
   if (fd > 0)
@@ -1701,32 +1561,6 @@
   return res;
 }
 
-#if SANITIZER_LINUX
-TSAN_INTERCEPTOR(int, epoll_create, int size) {
-  SCOPED_TSAN_INTERCEPTOR(epoll_create, size);
-  int fd = REAL(epoll_create)(size);
-  if (fd >= 0)
-    FdPollCreate(thr, pc, fd);
-  return fd;
-}
-#define TSAN_MAYBE_INTERCEPT_EPOLL_CREATE TSAN_INTERCEPT(epoll_create)
-#else
-#define TSAN_MAYBE_INTERCEPT_EPOLL_CREATE
-#endif
-
-#if SANITIZER_LINUX
-TSAN_INTERCEPTOR(int, epoll_create1, int flags) {
-  SCOPED_TSAN_INTERCEPTOR(epoll_create1, flags);
-  int fd = REAL(epoll_create1)(flags);
-  if (fd >= 0)
-    FdPollCreate(thr, pc, fd);
-  return fd;
-}
-#define TSAN_MAYBE_INTERCEPT_EPOLL_CREATE1 TSAN_INTERCEPT(epoll_create1)
-#else
-#define TSAN_MAYBE_INTERCEPT_EPOLL_CREATE1
-#endif
-
 TSAN_INTERCEPTOR(int, close, int fd) {
   SCOPED_TSAN_INTERCEPTOR(close, fd);
   if (fd >= 0)
@@ -1781,37 +1615,6 @@
 }
 #endif
 
-TSAN_INTERCEPTOR(long_t, send, int fd, void *buf, long_t len, int flags) {
-  SCOPED_TSAN_INTERCEPTOR(send, fd, buf, len, flags);
-  if (fd >= 0) {
-    FdAccess(thr, pc, fd);
-    FdRelease(thr, pc, fd);
-  }
-  int res = REAL(send)(fd, buf, len, flags);
-  return res;
-}
-
-TSAN_INTERCEPTOR(long_t, sendmsg, int fd, void *msg, int flags) {
-  SCOPED_TSAN_INTERCEPTOR(sendmsg, fd, msg, flags);
-  if (fd >= 0) {
-    FdAccess(thr, pc, fd);
-    FdRelease(thr, pc, fd);
-  }
-  int res = REAL(sendmsg)(fd, msg, flags);
-  return res;
-}
-
-TSAN_INTERCEPTOR(long_t, recv, int fd, void *buf, long_t len, int flags) {
-  SCOPED_TSAN_INTERCEPTOR(recv, fd, buf, len, flags);
-  if (fd >= 0)
-    FdAccess(thr, pc, fd);
-  int res = REAL(recv)(fd, buf, len, flags);
-  if (res >= 0 && fd >= 0) {
-    FdAcquire(thr, pc, fd);
-  }
-  return res;
-}
-
 TSAN_INTERCEPTOR(int, unlink, char *path) {
   SCOPED_TSAN_INTERCEPTOR(unlink, path);
   Release(thr, pc, File2addr(path));
@@ -1892,12 +1695,30 @@
 
 TSAN_INTERCEPTOR(int, closedir, void *dirp) {
   SCOPED_TSAN_INTERCEPTOR(closedir, dirp);
-  int fd = dirfd(dirp);
-  FdClose(thr, pc, fd);
+  if (dirp) {
+    int fd = dirfd(dirp);
+    FdClose(thr, pc, fd);
+  }
   return REAL(closedir)(dirp);
 }
 
 #if SANITIZER_LINUX
+TSAN_INTERCEPTOR(int, epoll_create, int size) {
+  SCOPED_TSAN_INTERCEPTOR(epoll_create, size);
+  int fd = REAL(epoll_create)(size);
+  if (fd >= 0)
+    FdPollCreate(thr, pc, fd);
+  return fd;
+}
+
+TSAN_INTERCEPTOR(int, epoll_create1, int flags) {
+  SCOPED_TSAN_INTERCEPTOR(epoll_create1, flags);
+  int fd = REAL(epoll_create1)(flags);
+  if (fd >= 0)
+    FdPollCreate(thr, pc, fd);
+  return fd;
+}
+
 TSAN_INTERCEPTOR(int, epoll_ctl, int epfd, int op, int fd, void *ev) {
   SCOPED_TSAN_INTERCEPTOR(epoll_ctl, epfd, op, fd, ev);
   if (epfd >= 0)
@@ -1909,12 +1730,7 @@
   int res = REAL(epoll_ctl)(epfd, op, fd, ev);
   return res;
 }
-#define TSAN_MAYBE_INTERCEPT_EPOLL_CTL TSAN_INTERCEPT(epoll_ctl)
-#else
-#define TSAN_MAYBE_INTERCEPT_EPOLL_CTL
-#endif
 
-#if SANITIZER_LINUX
 TSAN_INTERCEPTOR(int, epoll_wait, int epfd, void *ev, int cnt, int timeout) {
   SCOPED_TSAN_INTERCEPTOR(epoll_wait, epfd, ev, cnt, timeout);
   if (epfd >= 0)
@@ -1924,9 +1740,26 @@
     FdAcquire(thr, pc, epfd);
   return res;
 }
-#define TSAN_MAYBE_INTERCEPT_EPOLL_WAIT TSAN_INTERCEPT(epoll_wait)
+
+TSAN_INTERCEPTOR(int, epoll_pwait, int epfd, void *ev, int cnt, int timeout,
+                 void *sigmask) {
+  SCOPED_TSAN_INTERCEPTOR(epoll_pwait, epfd, ev, cnt, timeout, sigmask);
+  if (epfd >= 0)
+    FdAccess(thr, pc, epfd);
+  int res = BLOCK_REAL(epoll_pwait)(epfd, ev, cnt, timeout, sigmask);
+  if (res > 0 && epfd >= 0)
+    FdAcquire(thr, pc, epfd);
+  return res;
+}
+
+#define TSAN_MAYBE_INTERCEPT_EPOLL \
+    TSAN_INTERCEPT(epoll_create); \
+    TSAN_INTERCEPT(epoll_create1); \
+    TSAN_INTERCEPT(epoll_ctl); \
+    TSAN_INTERCEPT(epoll_wait); \
+    TSAN_INTERCEPT(epoll_pwait)
 #else
-#define TSAN_MAYBE_INTERCEPT_EPOLL_WAIT
+#define TSAN_MAYBE_INTERCEPT_EPOLL
 #endif
 
 namespace __tsan {
@@ -1935,6 +1768,19 @@
     bool sigact, int sig, my_siginfo_t *info, void *uctx) {
   if (acquire)
     Acquire(thr, 0, (uptr)&sigactions[sig]);
+  // Signals are generally asynchronous, so if we receive a signal when
+  // ignores are enabled we should disable ignores. This is critical for sync
+  // and interceptors, because otherwise we can miss synchronization and report
+  // false races.
+  int ignore_reads_and_writes = thr->ignore_reads_and_writes;
+  int ignore_interceptors = thr->ignore_interceptors;
+  int ignore_sync = thr->ignore_sync;
+  if (!ctx->after_multithreaded_fork) {
+    thr->ignore_reads_and_writes = 0;
+    thr->fast_state.ClearIgnoreBit();
+    thr->ignore_interceptors = 0;
+    thr->ignore_sync = 0;
+  }
   // Ensure that the handler does not spoil errno.
   const int saved_errno = errno;
   errno = 99;
@@ -1950,6 +1796,13 @@
     else
       ((sighandler_t)pc)(sig);
   }
+  if (!ctx->after_multithreaded_fork) {
+    thr->ignore_reads_and_writes = ignore_reads_and_writes;
+    if (ignore_reads_and_writes)
+      thr->fast_state.SetIgnoreBit();
+    thr->ignore_interceptors = ignore_interceptors;
+    thr->ignore_sync = ignore_sync;
+  }
   // We do not detect errno spoiling for SIGTERM,
   // because some SIGTERM handlers do spoil errno but reraise SIGTERM,
   // tsan reports false positive in such case.
@@ -1979,7 +1832,7 @@
     return;
   atomic_store(&sctx->have_pending_signals, 0, memory_order_relaxed);
   atomic_fetch_add(&thr->in_signal_handler, 1, memory_order_relaxed);
-  CHECK_EQ(0, REAL(sigfillset)(&sctx->emptyset));
+  internal_sigfillset(&sctx->emptyset);
   CHECK_EQ(0, pthread_sigmask(SIG_SETMASK, &sctx->emptyset, &sctx->oldset));
   for (int sig = 0; sig < kSigCount; sig++) {
     SignalDesc *signal = &sctx->pending_signals[sig];
@@ -2019,13 +1872,8 @@
       (sctx && atomic_load(&sctx->in_blocking_func, memory_order_relaxed))) {
     atomic_fetch_add(&thr->in_signal_handler, 1, memory_order_relaxed);
     if (sctx && atomic_load(&sctx->in_blocking_func, memory_order_relaxed)) {
-      // We ignore interceptors in blocking functions,
-      // temporary enbled them again while we are calling user function.
-      int const i = thr->ignore_interceptors;
-      thr->ignore_interceptors = 0;
       atomic_store(&sctx->in_blocking_func, 0, memory_order_relaxed);
       CallUserSignalHandler(thr, sync, true, sigact, sig, info, ctx);
-      thr->ignore_interceptors = i;
       atomic_store(&sctx->in_blocking_func, 1, memory_order_relaxed);
     } else {
       // Be very conservative with when we do acquire in this case.
@@ -2063,7 +1911,10 @@
 }
 
 TSAN_INTERCEPTOR(int, sigaction, int sig, sigaction_t *act, sigaction_t *old) {
-  SCOPED_TSAN_INTERCEPTOR(sigaction, sig, act, old);
+  // Note: if we call REAL(sigaction) directly for any reason without proxying
+  // the signal handler through rtl_sigaction, very bad things will happen.
+  // The handler will run synchronously and corrupt tsan per-thread state.
+  SCOPED_INTERCEPTOR_RAW(sigaction, sig, act, old);
   if (old)
     internal_memcpy(old, &sigactions[sig], sizeof(*old));
   if (act == 0)
@@ -2083,7 +1934,7 @@
 #endif
   sigaction_t newact;
   internal_memcpy(&newact, act, sizeof(newact));
-  REAL(sigfillset)(&newact.sa_mask);
+  internal_sigfillset(&newact.sa_mask);
   if (act->sa_handler != SIG_IGN && act->sa_handler != SIG_DFL) {
     if (newact.sa_flags & SA_SIGINFO)
       newact.sa_sigaction = rtl_sigaction;
@@ -2098,7 +1949,7 @@
 TSAN_INTERCEPTOR(sighandler_t, signal, int sig, sighandler_t h) {
   sigaction_t act;
   act.sa_handler = h;
-  REAL(memset)(&act.sa_mask, -1, sizeof(act.sa_mask));
+  internal_memset(&act.sa_mask, -1, sizeof(act.sa_mask));
   act.sa_flags = 0;
   sigaction_t old;
   int res = sigaction(sig, &act, &old);
@@ -2179,7 +2030,13 @@
     return REAL(fork)(fake);
   SCOPED_INTERCEPTOR_RAW(fork, fake);
   ForkBefore(thr, pc);
-  int pid = REAL(fork)(fake);
+  int pid;
+  {
+    // On OS X, REAL(fork) can call intercepted functions (OSSpinLockLock), and
+    // we'll assert in CheckNoLocks() unless we ignore interceptors.
+    ScopedIgnoreInterceptors ignore;
+    pid = REAL(fork)(fake);
+  }
   if (pid == 0) {
     // child
     ForkChildAfter(thr, pc);
@@ -2294,18 +2151,15 @@
 #undef SANITIZER_INTERCEPT_FGETPWENT
 #undef SANITIZER_INTERCEPT_GETPWNAM_AND_FRIENDS
 #undef SANITIZER_INTERCEPT_GETPWNAM_R_AND_FRIENDS
-// __tls_get_addr can be called with mis-aligned stack due to:
-// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
-// There are two potential issues:
-// 1. Sanitizer code contains a MOVDQA spill (it does not seem to be the case
-// right now). or 2. ProcessPendingSignal calls user handler which contains
-// MOVDQA spill (this happens right now).
-// Since the interceptor only initializes memory for msan, the simplest solution
-// is to disable the interceptor in tsan (other sanitizers do not call
-// signal handlers from COMMON_INTERCEPTOR_ENTER).
+// We define our own.
+#if SANITIZER_INTERCEPT_TLS_GET_ADDR
+#define NEED_TLS_GET_ADDR
+#endif
 #undef SANITIZER_INTERCEPT_TLS_GET_ADDR
 
 #define COMMON_INTERCEPT_FUNCTION(name) INTERCEPT_FUNCTION(name)
+#define COMMON_INTERCEPT_FUNCTION_VER(name, ver)                          \
+  INTERCEPT_FUNCTION_VER(name, ver)
 
 #define COMMON_INTERCEPTOR_WRITE_RANGE(ctx, ptr, size)                    \
   MemoryAccessRange(((TsanInterceptorContext *)ctx)->thr,                 \
@@ -2392,6 +2246,10 @@
   MutexRepair(((TsanInterceptorContext *)ctx)->thr, \
             ((TsanInterceptorContext *)ctx)->pc, (uptr)m)
 
+#define COMMON_INTERCEPTOR_MUTEX_INVALID(ctx, m) \
+  MutexInvalidAccess(((TsanInterceptorContext *)ctx)->thr, \
+                     ((TsanInterceptorContext *)ctx)->pc, (uptr)m)
+
 #if !SANITIZER_MAC
 #define COMMON_INTERCEPTOR_HANDLE_RECVMSG(ctx, msg) \
   HandleRecvmsg(((TsanInterceptorContext *)ctx)->thr, \
@@ -2406,6 +2264,12 @@
     *begin = *end = 0;                                                         \
   }
 
+#define COMMON_INTERCEPTOR_USER_CALLBACK_START() \
+  SCOPED_TSAN_INTERCEPTOR_USER_CALLBACK_START()
+
+#define COMMON_INTERCEPTOR_USER_CALLBACK_END() \
+  SCOPED_TSAN_INTERCEPTOR_USER_CALLBACK_END()
+
 #include "sanitizer_common/sanitizer_common_interceptors.inc"
 
 #define TSAN_SYSCALL() \
@@ -2428,7 +2292,7 @@
   }
 };
 
-#if !SANITIZER_MAC
+#if !SANITIZER_FREEBSD && !SANITIZER_MAC
 static void syscall_access_range(uptr pc, uptr p, uptr s, bool write) {
   TSAN_SYSCALL();
   MemoryAccessRange(thr, pc, p, s, write);
@@ -2522,6 +2386,31 @@
 
 #include "sanitizer_common/sanitizer_common_syscalls.inc"
 
+#ifdef NEED_TLS_GET_ADDR
+// Define own interceptor instead of sanitizer_common's for three reasons:
+// 1. It must not process pending signals.
+//    Signal handlers may contain MOVDQA instruction (see below).
+// 2. It must be as simple as possible to not contain MOVDQA.
+// 3. Sanitizer_common version uses COMMON_INTERCEPTOR_INITIALIZE_RANGE which
+//    is empty for tsan (meant only for msan).
+// Note: __tls_get_addr can be called with mis-aligned stack due to:
+// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
+// So the interceptor must work with mis-aligned stack, in particular, does not
+// execute MOVDQA with stack addresses.
+TSAN_INTERCEPTOR(void *, __tls_get_addr, void *arg) {
+  void *res = REAL(__tls_get_addr)(arg);
+  ThreadState *thr = cur_thread();
+  if (!thr)
+    return res;
+  DTLS::DTV *dtv = DTLS_on_tls_get_addr(arg, res, thr->tls_addr, thr->tls_size);
+  if (!dtv)
+    return res;
+  // New DTLS block has been allocated.
+  MemoryResetRange(thr, 0, dtv->beg, dtv->size);
+  return res;
+}
+#endif
+
 namespace __tsan {
 
 static void finalize(void *arg) {
@@ -2582,13 +2471,6 @@
   TSAN_MAYBE_INTERCEPT_PVALLOC;
   TSAN_INTERCEPT(posix_memalign);
 
-  TSAN_INTERCEPT(strlen);
-  TSAN_INTERCEPT(memset);
-  TSAN_INTERCEPT(memcpy);
-  TSAN_INTERCEPT(memmove);
-  TSAN_INTERCEPT(strchr);
-  TSAN_INTERCEPT(strchrnul);
-  TSAN_INTERCEPT(strrchr);
   TSAN_INTERCEPT(strcpy);  // NOLINT
   TSAN_INTERCEPT(strncpy);
   TSAN_INTERCEPT(strdup);
@@ -2631,14 +2513,6 @@
 
   TSAN_INTERCEPT(pthread_once);
 
-  TSAN_INTERCEPT(stat);
-  TSAN_MAYBE_INTERCEPT___XSTAT;
-  TSAN_MAYBE_INTERCEPT_STAT64;
-  TSAN_MAYBE_INTERCEPT___XSTAT64;
-  TSAN_INTERCEPT(lstat);
-  TSAN_MAYBE_INTERCEPT___LXSTAT;
-  TSAN_MAYBE_INTERCEPT_LSTAT64;
-  TSAN_MAYBE_INTERCEPT___LXSTAT64;
   TSAN_INTERCEPT(fstat);
   TSAN_MAYBE_INTERCEPT___FXSTAT;
   TSAN_MAYBE_INTERCEPT_FSTAT64;
@@ -2659,18 +2533,13 @@
   TSAN_INTERCEPT(connect);
   TSAN_INTERCEPT(bind);
   TSAN_INTERCEPT(listen);
-  TSAN_MAYBE_INTERCEPT_EPOLL_CREATE;
-  TSAN_MAYBE_INTERCEPT_EPOLL_CREATE1;
+  TSAN_MAYBE_INTERCEPT_EPOLL;
   TSAN_INTERCEPT(close);
   TSAN_MAYBE_INTERCEPT___CLOSE;
   TSAN_MAYBE_INTERCEPT___RES_ICLOSE;
   TSAN_INTERCEPT(pipe);
   TSAN_INTERCEPT(pipe2);
 
-  TSAN_INTERCEPT(send);
-  TSAN_INTERCEPT(sendmsg);
-  TSAN_INTERCEPT(recv);
-
   TSAN_INTERCEPT(unlink);
   TSAN_INTERCEPT(tmpfile);
   TSAN_MAYBE_INTERCEPT_TMPFILE64;
@@ -2681,9 +2550,6 @@
   TSAN_INTERCEPT(rmdir);
   TSAN_INTERCEPT(closedir);
 
-  TSAN_MAYBE_INTERCEPT_EPOLL_CTL;
-  TSAN_MAYBE_INTERCEPT_EPOLL_WAIT;
-
   TSAN_INTERCEPT(sigaction);
   TSAN_INTERCEPT(signal);
   TSAN_INTERCEPT(sigsuspend);
@@ -2705,6 +2571,10 @@
   TSAN_INTERCEPT(__cxa_atexit);
   TSAN_INTERCEPT(_exit);
 
+#ifdef NEED_TLS_GET_ADDR
+  TSAN_INTERCEPT(__tls_get_addr);
+#endif
+
 #if !SANITIZER_MAC && !SANITIZER_ANDROID
   // Need to setup it, because interceptors check that the function is resolved.
   // But atexit is emitted directly into the module, so can't be resolved.
diff --git a/lib/tsan/rtl/tsan_interceptors.h b/lib/tsan/rtl/tsan_interceptors.h
index d831620..a0f9a07 100644
--- a/lib/tsan/rtl/tsan_interceptors.h
+++ b/lib/tsan/rtl/tsan_interceptors.h
@@ -34,7 +34,7 @@
       Report("FATAL: ThreadSanitizer: failed to intercept %s\n", #func); \
       Die(); \
     }                                                    \
-    if (thr->ignore_interceptors || thr->in_ignored_lib) \
+    if (!thr->is_inited || thr->ignore_interceptors || thr->in_ignored_lib) \
       return REAL(func)(__VA_ARGS__); \
 /**/
 
@@ -46,12 +46,4 @@
 
 #define TSAN_INTERCEPTOR(ret, func, ...) INTERCEPTOR(ret, func, __VA_ARGS__)
 
-#if SANITIZER_FREEBSD
-#define __libc_free __free
-#define __libc_malloc __malloc
-#endif
-
-extern "C" void __libc_free(void *ptr);
-extern "C" void *__libc_malloc(uptr size);
-
 #endif  // TSAN_INTERCEPTORS_H
diff --git a/lib/tsan/rtl/tsan_interceptors_mac.cc b/lib/tsan/rtl/tsan_interceptors_mac.cc
index 2bf7ad9..5939638 100644
--- a/lib/tsan/rtl/tsan_interceptors_mac.cc
+++ b/lib/tsan/rtl/tsan_interceptors_mac.cc
@@ -17,11 +17,161 @@
 
 #include "interception/interception.h"
 #include "tsan_interceptors.h"
+#include "tsan_interface.h"
+#include "tsan_interface_ann.h"
 
 #include <libkern/OSAtomic.h>
+#include <xpc/xpc.h>
+
+typedef long long_t;  // NOLINT
 
 namespace __tsan {
 
+// The non-barrier versions of OSAtomic* functions are semantically mo_relaxed,
+// but the two variants (e.g. OSAtomicAdd32 and OSAtomicAdd32Barrier) are
+// actually aliases of each other, and we cannot have different interceptors for
+// them, because they're actually the same function.  Thus, we have to stay
+// conservative and treat the non-barrier versions as mo_acq_rel.
+static const morder kMacOrderBarrier = mo_acq_rel;
+static const morder kMacOrderNonBarrier = mo_acq_rel;
+
+#define OSATOMIC_INTERCEPTOR(return_t, t, tsan_t, f, tsan_atomic_f, mo) \
+  TSAN_INTERCEPTOR(return_t, f, t x, volatile t *ptr) {                 \
+    SCOPED_TSAN_INTERCEPTOR(f, x, ptr);                                 \
+    return tsan_atomic_f((volatile tsan_t *)ptr, x, mo);                \
+  }
+
+#define OSATOMIC_INTERCEPTOR_PLUS_X(return_t, t, tsan_t, f, tsan_atomic_f, mo) \
+  TSAN_INTERCEPTOR(return_t, f, t x, volatile t *ptr) {                        \
+    SCOPED_TSAN_INTERCEPTOR(f, x, ptr);                                        \
+    return tsan_atomic_f((volatile tsan_t *)ptr, x, mo) + x;                   \
+  }
+
+#define OSATOMIC_INTERCEPTOR_PLUS_1(return_t, t, tsan_t, f, tsan_atomic_f, mo) \
+  TSAN_INTERCEPTOR(return_t, f, volatile t *ptr) {                             \
+    SCOPED_TSAN_INTERCEPTOR(f, ptr);                                           \
+    return tsan_atomic_f((volatile tsan_t *)ptr, 1, mo) + 1;                   \
+  }
+
+#define OSATOMIC_INTERCEPTOR_MINUS_1(return_t, t, tsan_t, f, tsan_atomic_f, \
+                                     mo)                                    \
+  TSAN_INTERCEPTOR(return_t, f, volatile t *ptr) {                          \
+    SCOPED_TSAN_INTERCEPTOR(f, ptr);                                        \
+    return tsan_atomic_f((volatile tsan_t *)ptr, 1, mo) - 1;                \
+  }
+
+#define OSATOMIC_INTERCEPTORS_ARITHMETIC(f, tsan_atomic_f, m)                  \
+  m(int32_t, int32_t, a32, f##32, __tsan_atomic32_##tsan_atomic_f,             \
+    kMacOrderNonBarrier)                                                       \
+  m(int32_t, int32_t, a32, f##32##Barrier, __tsan_atomic32_##tsan_atomic_f,    \
+    kMacOrderBarrier)                                                          \
+  m(int64_t, int64_t, a64, f##64, __tsan_atomic64_##tsan_atomic_f,             \
+    kMacOrderNonBarrier)                                                       \
+  m(int64_t, int64_t, a64, f##64##Barrier, __tsan_atomic64_##tsan_atomic_f,    \
+    kMacOrderBarrier)
+
+#define OSATOMIC_INTERCEPTORS_BITWISE(f, tsan_atomic_f, m, m_orig)             \
+  m(int32_t, uint32_t, a32, f##32, __tsan_atomic32_##tsan_atomic_f,            \
+    kMacOrderNonBarrier)                                                       \
+  m(int32_t, uint32_t, a32, f##32##Barrier, __tsan_atomic32_##tsan_atomic_f,   \
+    kMacOrderBarrier)                                                          \
+  m_orig(int32_t, uint32_t, a32, f##32##Orig, __tsan_atomic32_##tsan_atomic_f, \
+    kMacOrderNonBarrier)                                                       \
+  m_orig(int32_t, uint32_t, a32, f##32##OrigBarrier,                           \
+    __tsan_atomic32_##tsan_atomic_f, kMacOrderBarrier)
+
+OSATOMIC_INTERCEPTORS_ARITHMETIC(OSAtomicAdd, fetch_add,
+                                 OSATOMIC_INTERCEPTOR_PLUS_X)
+OSATOMIC_INTERCEPTORS_ARITHMETIC(OSAtomicIncrement, fetch_add,
+                                 OSATOMIC_INTERCEPTOR_PLUS_1)
+OSATOMIC_INTERCEPTORS_ARITHMETIC(OSAtomicDecrement, fetch_sub,
+                                 OSATOMIC_INTERCEPTOR_MINUS_1)
+OSATOMIC_INTERCEPTORS_BITWISE(OSAtomicOr, fetch_or, OSATOMIC_INTERCEPTOR_PLUS_X,
+                              OSATOMIC_INTERCEPTOR)
+OSATOMIC_INTERCEPTORS_BITWISE(OSAtomicAnd, fetch_and,
+                              OSATOMIC_INTERCEPTOR_PLUS_X, OSATOMIC_INTERCEPTOR)
+OSATOMIC_INTERCEPTORS_BITWISE(OSAtomicXor, fetch_xor,
+                              OSATOMIC_INTERCEPTOR_PLUS_X, OSATOMIC_INTERCEPTOR)
+
+#define OSATOMIC_INTERCEPTORS_CAS(f, tsan_atomic_f, tsan_t, t)              \
+  TSAN_INTERCEPTOR(bool, f, t old_value, t new_value, t volatile *ptr) {    \
+    SCOPED_TSAN_INTERCEPTOR(f, old_value, new_value, ptr);                  \
+    return tsan_atomic_f##_compare_exchange_strong(                         \
+        (tsan_t *)ptr, (tsan_t *)&old_value, (tsan_t)new_value,             \
+        kMacOrderNonBarrier, kMacOrderNonBarrier);                          \
+  }                                                                         \
+                                                                            \
+  TSAN_INTERCEPTOR(bool, f##Barrier, t old_value, t new_value,              \
+                   t volatile *ptr) {                                       \
+    SCOPED_TSAN_INTERCEPTOR(f##Barrier, old_value, new_value, ptr);         \
+    return tsan_atomic_f##_compare_exchange_strong(                         \
+        (tsan_t *)ptr, (tsan_t *)&old_value, (tsan_t)new_value,             \
+        kMacOrderBarrier, kMacOrderNonBarrier);                             \
+  }
+
+OSATOMIC_INTERCEPTORS_CAS(OSAtomicCompareAndSwapInt, __tsan_atomic32, a32, int)
+OSATOMIC_INTERCEPTORS_CAS(OSAtomicCompareAndSwapLong, __tsan_atomic64, a64,
+                          long_t)
+OSATOMIC_INTERCEPTORS_CAS(OSAtomicCompareAndSwapPtr, __tsan_atomic64, a64,
+                          void *)
+OSATOMIC_INTERCEPTORS_CAS(OSAtomicCompareAndSwap32, __tsan_atomic32, a32,
+                          int32_t)
+OSATOMIC_INTERCEPTORS_CAS(OSAtomicCompareAndSwap64, __tsan_atomic64, a64,
+                          int64_t)
+
+#define OSATOMIC_INTERCEPTOR_BITOP(f, op, m, mo)              \
+  TSAN_INTERCEPTOR(bool, f, uint32_t n, volatile void *ptr) { \
+    SCOPED_TSAN_INTERCEPTOR(f, n, ptr);                       \
+    char *byte_ptr = ((char *)ptr) + (n >> 3);                \
+    char bit_index = n & 7;                                   \
+    char mask = m;                                            \
+    char orig_byte = op((a8 *)byte_ptr, mask, mo);            \
+    return orig_byte & mask;                                  \
+  }
+
+#define OSATOMIC_INTERCEPTORS_BITOP(f, op, m)                     \
+  OSATOMIC_INTERCEPTOR_BITOP(f, op, m, kMacOrderNonBarrier)       \
+  OSATOMIC_INTERCEPTOR_BITOP(f##Barrier, op, m, kMacOrderBarrier)
+
+OSATOMIC_INTERCEPTORS_BITOP(OSAtomicTestAndSet, __tsan_atomic8_fetch_or,
+                            0x80u >> bit_index)
+OSATOMIC_INTERCEPTORS_BITOP(OSAtomicTestAndClear, __tsan_atomic8_fetch_and,
+                            ~(0x80u >> bit_index))
+
+TSAN_INTERCEPTOR(void, OSAtomicEnqueue, OSQueueHead *list, void *item,
+                 size_t offset) {
+  SCOPED_TSAN_INTERCEPTOR(OSAtomicEnqueue, list, item, offset);
+  __tsan_release(item);
+  REAL(OSAtomicEnqueue)(list, item, offset);
+}
+
+TSAN_INTERCEPTOR(void *, OSAtomicDequeue, OSQueueHead *list, size_t offset) {
+  SCOPED_TSAN_INTERCEPTOR(OSAtomicDequeue, list, offset);
+  void *item = REAL(OSAtomicDequeue)(list, offset);
+  if (item) __tsan_acquire(item);
+  return item;
+}
+
+// OSAtomicFifoEnqueue and OSAtomicFifoDequeue are only on OS X.
+#if !SANITIZER_IOS
+
+TSAN_INTERCEPTOR(void, OSAtomicFifoEnqueue, OSFifoQueueHead *list, void *item,
+                 size_t offset) {
+  SCOPED_TSAN_INTERCEPTOR(OSAtomicFifoEnqueue, list, item, offset);
+  __tsan_release(item);
+  REAL(OSAtomicFifoEnqueue)(list, item, offset);
+}
+
+TSAN_INTERCEPTOR(void *, OSAtomicFifoDequeue, OSFifoQueueHead *list,
+                 size_t offset) {
+  SCOPED_TSAN_INTERCEPTOR(OSAtomicFifoDequeue, list, offset);
+  void *item = REAL(OSAtomicFifoDequeue)(list, offset);
+  if (item) __tsan_acquire(item);
+  return item;
+}
+
+#endif
+
 TSAN_INTERCEPTOR(void, OSSpinLockLock, volatile OSSpinLock *lock) {
   CHECK(!cur_thread()->is_dead);
   if (!cur_thread()->is_inited) {
@@ -86,6 +236,98 @@
   REAL(os_lock_unlock)(lock);
 }
 
+TSAN_INTERCEPTOR(void, xpc_connection_set_event_handler,
+                 xpc_connection_t connection, xpc_handler_t handler) {
+  SCOPED_TSAN_INTERCEPTOR(xpc_connection_set_event_handler, connection,
+                          handler);
+  Release(thr, pc, (uptr)connection);
+  xpc_handler_t new_handler = ^(xpc_object_t object) {
+    {
+      SCOPED_INTERCEPTOR_RAW(xpc_connection_set_event_handler);
+      Acquire(thr, pc, (uptr)connection);
+    }
+    handler(object);
+  };
+  REAL(xpc_connection_set_event_handler)(connection, new_handler);
+}
+
+TSAN_INTERCEPTOR(void, xpc_connection_send_barrier, xpc_connection_t connection,
+                 dispatch_block_t barrier) {
+  SCOPED_TSAN_INTERCEPTOR(xpc_connection_send_barrier, connection, barrier);
+  Release(thr, pc, (uptr)connection);
+  dispatch_block_t new_barrier = ^() {
+    {
+      SCOPED_INTERCEPTOR_RAW(xpc_connection_send_barrier);
+      Acquire(thr, pc, (uptr)connection);
+    }
+    barrier();
+  };
+  REAL(xpc_connection_send_barrier)(connection, new_barrier);
+}
+
+TSAN_INTERCEPTOR(void, xpc_connection_send_message_with_reply,
+                 xpc_connection_t connection, xpc_object_t message,
+                 dispatch_queue_t replyq, xpc_handler_t handler) {
+  SCOPED_TSAN_INTERCEPTOR(xpc_connection_send_message_with_reply, connection,
+                          message, replyq, handler);
+  Release(thr, pc, (uptr)connection);
+  xpc_handler_t new_handler = ^(xpc_object_t object) {
+    {
+      SCOPED_INTERCEPTOR_RAW(xpc_connection_send_message_with_reply);
+      Acquire(thr, pc, (uptr)connection);
+    }
+    handler(object);
+  };
+  REAL(xpc_connection_send_message_with_reply)
+  (connection, message, replyq, new_handler);
+}
+
+// On macOS, libc++ is always linked dynamically, so intercepting works the
+// usual way.
+#define STDCXX_INTERCEPTOR TSAN_INTERCEPTOR
+
+namespace {
+struct fake_shared_weak_count {
+  volatile a64 shared_owners;
+  volatile a64 shared_weak_owners;
+  virtual void _unused_0x0() = 0;
+  virtual void _unused_0x8() = 0;
+  virtual void on_zero_shared() = 0;
+  virtual void _unused_0x18() = 0;
+  virtual void on_zero_shared_weak() = 0;
+};
+}  // namespace
+
+// This adds a libc++ interceptor for:
+//     void __shared_weak_count::__release_shared() _NOEXCEPT;
+// Shared and weak pointers in C++ maintain reference counts via atomics in
+// libc++.dylib, which are TSan-invisible, and this leads to false positives in
+// destructor code.  This interceptor re-implements the whole function so that
+// the mo_acq_rel semantics of the atomic decrement are visible.
+//
+// Unfortunately, this interceptor cannot simply Acquire/Release some sync
+// object and call the original function, because it would have a race between
+// the sync and the destruction of the object.  Calling both under a lock will
+// not work because the destructor can invoke this interceptor again (and even
+// in a different thread, so recursive locks don't help).
+STDCXX_INTERCEPTOR(void, _ZNSt3__119__shared_weak_count16__release_sharedEv,
+                   fake_shared_weak_count *o) {
+  if (!flags()->shared_ptr_interceptor)
+    return REAL(_ZNSt3__119__shared_weak_count16__release_sharedEv)(o);
+
+  SCOPED_TSAN_INTERCEPTOR(_ZNSt3__119__shared_weak_count16__release_sharedEv,
+                          o);
+  if (__tsan_atomic64_fetch_add(&o->shared_owners, -1, mo_release) == 0) {
+    Acquire(thr, pc, (uptr)&o->shared_owners);
+    o->on_zero_shared();
+    if (__tsan_atomic64_fetch_add(&o->shared_weak_owners, -1, mo_release) ==
+        0) {
+      Acquire(thr, pc, (uptr)&o->shared_weak_owners);
+      o->on_zero_shared_weak();
+    }
+  }
+}
+
 }  // namespace __tsan
 
 #endif  // SANITIZER_MAC
diff --git a/lib/tsan/rtl/tsan_interface.h b/lib/tsan/rtl/tsan_interface.h
index 41e084b..fbb099d 100644
--- a/lib/tsan/rtl/tsan_interface.h
+++ b/lib/tsan/rtl/tsan_interface.h
@@ -25,6 +25,8 @@
 extern "C" {
 #endif
 
+#ifndef SANITIZER_GO
+
 // This function should be called at the very beginning of the process,
 // before any instrumented code is executed and before any call to malloc.
 SANITIZER_INTERFACE_ATTRIBUTE void __tsan_init();
@@ -75,8 +77,296 @@
 SANITIZER_INTERFACE_ATTRIBUTE
 void __tsan_write_range(void *addr, unsigned long size);  // NOLINT
 
+// User may provide function that would be called right when TSan detects
+// an error. The argument 'report' is an opaque pointer that can be used to
+// gather additional information using other TSan report API functions.
+SANITIZER_INTERFACE_ATTRIBUTE
+void __tsan_on_report(void *report);
+
+// If TSan is currently reporting a detected issue on the current thread,
+// returns an opaque pointer to the current report. Otherwise returns NULL.
+SANITIZER_INTERFACE_ATTRIBUTE
+void *__tsan_get_current_report();
+
+// Returns a report's description (issue type), number of duplicate issues
+// found, counts of array data (stack traces, memory operations, locations,
+// mutexes, threads, unique thread IDs) and a stack trace of a sleep() call (if
+// one was involved in the issue).
+SANITIZER_INTERFACE_ATTRIBUTE
+int __tsan_get_report_data(void *report, const char **description, int *count,
+                           int *stack_count, int *mop_count, int *loc_count,
+                           int *mutex_count, int *thread_count,
+                           int *unique_tid_count, void **sleep_trace,
+                           uptr trace_size);
+
+// Returns information about stack traces included in the report.
+SANITIZER_INTERFACE_ATTRIBUTE
+int __tsan_get_report_stack(void *report, uptr idx, void **trace,
+                            uptr trace_size);
+
+// Returns information about memory operations included in the report.
+SANITIZER_INTERFACE_ATTRIBUTE
+int __tsan_get_report_mop(void *report, uptr idx, int *tid, void **addr,
+                          int *size, int *write, int *atomic, void **trace,
+                          uptr trace_size);
+
+// Returns information about locations included in the report.
+SANITIZER_INTERFACE_ATTRIBUTE
+int __tsan_get_report_loc(void *report, uptr idx, const char **type,
+                          void **addr, uptr *start, uptr *size, int *tid,
+                          int *fd, int *suppressable, void **trace,
+                          uptr trace_size);
+
+// Returns information about mutexes included in the report.
+SANITIZER_INTERFACE_ATTRIBUTE
+int __tsan_get_report_mutex(void *report, uptr idx, uptr *mutex_id, void **addr,
+                            int *destroyed, void **trace, uptr trace_size);
+
+// Returns information about threads included in the report.
+SANITIZER_INTERFACE_ATTRIBUTE
+int __tsan_get_report_thread(void *report, uptr idx, int *tid, uptr *os_id,
+                             int *running, const char **name, int *parent_tid,
+                             void **trace, uptr trace_size);
+
+// Returns information about unique thread IDs included in the report.
+SANITIZER_INTERFACE_ATTRIBUTE
+int __tsan_get_report_unique_tid(void *report, uptr idx, int *tid);
+
+#endif  // SANITIZER_GO
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
+namespace __tsan {
+
+// These should match declarations from public tsan_interface_atomic.h header.
+typedef unsigned char      a8;
+typedef unsigned short     a16;  // NOLINT
+typedef unsigned int       a32;
+typedef unsigned long long a64;  // NOLINT
+#if !defined(SANITIZER_GO) && (defined(__SIZEOF_INT128__) \
+    || (__clang_major__ * 100 + __clang_minor__ >= 302)) && !defined(__mips64)
+__extension__ typedef __int128 a128;
+# define __TSAN_HAS_INT128 1
+#else
+# define __TSAN_HAS_INT128 0
+#endif
+
+// Part of ABI, do not change.
+// http://llvm.org/viewvc/llvm-project/libcxx/trunk/include/atomic?view=markup
+typedef enum {
+  mo_relaxed,
+  mo_consume,
+  mo_acquire,
+  mo_release,
+  mo_acq_rel,
+  mo_seq_cst
+} morder;
+
+struct ThreadState;
+
+extern "C" {
+SANITIZER_INTERFACE_ATTRIBUTE
+a8 __tsan_atomic8_load(const volatile a8 *a, morder mo);
+SANITIZER_INTERFACE_ATTRIBUTE
+a16 __tsan_atomic16_load(const volatile a16 *a, morder mo);
+SANITIZER_INTERFACE_ATTRIBUTE
+a32 __tsan_atomic32_load(const volatile a32 *a, morder mo);
+SANITIZER_INTERFACE_ATTRIBUTE
+a64 __tsan_atomic64_load(const volatile a64 *a, morder mo);
+#if __TSAN_HAS_INT128
+SANITIZER_INTERFACE_ATTRIBUTE
+a128 __tsan_atomic128_load(const volatile a128 *a, morder mo);
+#endif
+
+SANITIZER_INTERFACE_ATTRIBUTE
+void __tsan_atomic8_store(volatile a8 *a, a8 v, morder mo);
+SANITIZER_INTERFACE_ATTRIBUTE
+void __tsan_atomic16_store(volatile a16 *a, a16 v, morder mo);
+SANITIZER_INTERFACE_ATTRIBUTE
+void __tsan_atomic32_store(volatile a32 *a, a32 v, morder mo);
+SANITIZER_INTERFACE_ATTRIBUTE
+void __tsan_atomic64_store(volatile a64 *a, a64 v, morder mo);
+#if __TSAN_HAS_INT128
+SANITIZER_INTERFACE_ATTRIBUTE
+void __tsan_atomic128_store(volatile a128 *a, a128 v, morder mo);
+#endif
+
+SANITIZER_INTERFACE_ATTRIBUTE
+a8 __tsan_atomic8_exchange(volatile a8 *a, a8 v, morder mo);
+SANITIZER_INTERFACE_ATTRIBUTE
+a16 __tsan_atomic16_exchange(volatile a16 *a, a16 v, morder mo);
+SANITIZER_INTERFACE_ATTRIBUTE
+a32 __tsan_atomic32_exchange(volatile a32 *a, a32 v, morder mo);
+SANITIZER_INTERFACE_ATTRIBUTE
+a64 __tsan_atomic64_exchange(volatile a64 *a, a64 v, morder mo);
+#if __TSAN_HAS_INT128
+SANITIZER_INTERFACE_ATTRIBUTE
+a128 __tsan_atomic128_exchange(volatile a128 *a, a128 v, morder mo);
+#endif
+
+SANITIZER_INTERFACE_ATTRIBUTE
+a8 __tsan_atomic8_fetch_add(volatile a8 *a, a8 v, morder mo);
+SANITIZER_INTERFACE_ATTRIBUTE
+a16 __tsan_atomic16_fetch_add(volatile a16 *a, a16 v, morder mo);
+SANITIZER_INTERFACE_ATTRIBUTE
+a32 __tsan_atomic32_fetch_add(volatile a32 *a, a32 v, morder mo);
+SANITIZER_INTERFACE_ATTRIBUTE
+a64 __tsan_atomic64_fetch_add(volatile a64 *a, a64 v, morder mo);
+#if __TSAN_HAS_INT128
+SANITIZER_INTERFACE_ATTRIBUTE
+a128 __tsan_atomic128_fetch_add(volatile a128 *a, a128 v, morder mo);
+#endif
+
+SANITIZER_INTERFACE_ATTRIBUTE
+a8 __tsan_atomic8_fetch_sub(volatile a8 *a, a8 v, morder mo);
+SANITIZER_INTERFACE_ATTRIBUTE
+a16 __tsan_atomic16_fetch_sub(volatile a16 *a, a16 v, morder mo);
+SANITIZER_INTERFACE_ATTRIBUTE
+a32 __tsan_atomic32_fetch_sub(volatile a32 *a, a32 v, morder mo);
+SANITIZER_INTERFACE_ATTRIBUTE
+a64 __tsan_atomic64_fetch_sub(volatile a64 *a, a64 v, morder mo);
+#if __TSAN_HAS_INT128
+SANITIZER_INTERFACE_ATTRIBUTE
+a128 __tsan_atomic128_fetch_sub(volatile a128 *a, a128 v, morder mo);
+#endif
+
+SANITIZER_INTERFACE_ATTRIBUTE
+a8 __tsan_atomic8_fetch_and(volatile a8 *a, a8 v, morder mo);
+SANITIZER_INTERFACE_ATTRIBUTE
+a16 __tsan_atomic16_fetch_and(volatile a16 *a, a16 v, morder mo);
+SANITIZER_INTERFACE_ATTRIBUTE
+a32 __tsan_atomic32_fetch_and(volatile a32 *a, a32 v, morder mo);
+SANITIZER_INTERFACE_ATTRIBUTE
+a64 __tsan_atomic64_fetch_and(volatile a64 *a, a64 v, morder mo);
+#if __TSAN_HAS_INT128
+SANITIZER_INTERFACE_ATTRIBUTE
+a128 __tsan_atomic128_fetch_and(volatile a128 *a, a128 v, morder mo);
+#endif
+
+SANITIZER_INTERFACE_ATTRIBUTE
+a8 __tsan_atomic8_fetch_or(volatile a8 *a, a8 v, morder mo);
+SANITIZER_INTERFACE_ATTRIBUTE
+a16 __tsan_atomic16_fetch_or(volatile a16 *a, a16 v, morder mo);
+SANITIZER_INTERFACE_ATTRIBUTE
+a32 __tsan_atomic32_fetch_or(volatile a32 *a, a32 v, morder mo);
+SANITIZER_INTERFACE_ATTRIBUTE
+a64 __tsan_atomic64_fetch_or(volatile a64 *a, a64 v, morder mo);
+#if __TSAN_HAS_INT128
+SANITIZER_INTERFACE_ATTRIBUTE
+a128 __tsan_atomic128_fetch_or(volatile a128 *a, a128 v, morder mo);
+#endif
+
+SANITIZER_INTERFACE_ATTRIBUTE
+a8 __tsan_atomic8_fetch_xor(volatile a8 *a, a8 v, morder mo);
+SANITIZER_INTERFACE_ATTRIBUTE
+a16 __tsan_atomic16_fetch_xor(volatile a16 *a, a16 v, morder mo);
+SANITIZER_INTERFACE_ATTRIBUTE
+a32 __tsan_atomic32_fetch_xor(volatile a32 *a, a32 v, morder mo);
+SANITIZER_INTERFACE_ATTRIBUTE
+a64 __tsan_atomic64_fetch_xor(volatile a64 *a, a64 v, morder mo);
+#if __TSAN_HAS_INT128
+SANITIZER_INTERFACE_ATTRIBUTE
+a128 __tsan_atomic128_fetch_xor(volatile a128 *a, a128 v, morder mo);
+#endif
+
+SANITIZER_INTERFACE_ATTRIBUTE
+a8 __tsan_atomic8_fetch_nand(volatile a8 *a, a8 v, morder mo);
+SANITIZER_INTERFACE_ATTRIBUTE
+a16 __tsan_atomic16_fetch_nand(volatile a16 *a, a16 v, morder mo);
+SANITIZER_INTERFACE_ATTRIBUTE
+a32 __tsan_atomic32_fetch_nand(volatile a32 *a, a32 v, morder mo);
+SANITIZER_INTERFACE_ATTRIBUTE
+a64 __tsan_atomic64_fetch_nand(volatile a64 *a, a64 v, morder mo);
+#if __TSAN_HAS_INT128
+SANITIZER_INTERFACE_ATTRIBUTE
+a128 __tsan_atomic128_fetch_nand(volatile a128 *a, a128 v, morder mo);
+#endif
+
+SANITIZER_INTERFACE_ATTRIBUTE
+int __tsan_atomic8_compare_exchange_strong(volatile a8 *a, a8 *c, a8 v,
+                                           morder mo, morder fmo);
+SANITIZER_INTERFACE_ATTRIBUTE
+int __tsan_atomic16_compare_exchange_strong(volatile a16 *a, a16 *c, a16 v,
+                                            morder mo, morder fmo);
+SANITIZER_INTERFACE_ATTRIBUTE
+int __tsan_atomic32_compare_exchange_strong(volatile a32 *a, a32 *c, a32 v,
+                                            morder mo, morder fmo);
+SANITIZER_INTERFACE_ATTRIBUTE
+int __tsan_atomic64_compare_exchange_strong(volatile a64 *a, a64 *c, a64 v,
+                                            morder mo, morder fmo);
+#if __TSAN_HAS_INT128
+SANITIZER_INTERFACE_ATTRIBUTE
+int __tsan_atomic128_compare_exchange_strong(volatile a128 *a, a128 *c, a128 v,
+                                             morder mo, morder fmo);
+#endif
+
+SANITIZER_INTERFACE_ATTRIBUTE
+int __tsan_atomic8_compare_exchange_weak(volatile a8 *a, a8 *c, a8 v, morder mo,
+                                         morder fmo);
+SANITIZER_INTERFACE_ATTRIBUTE
+int __tsan_atomic16_compare_exchange_weak(volatile a16 *a, a16 *c, a16 v,
+                                          morder mo, morder fmo);
+SANITIZER_INTERFACE_ATTRIBUTE
+int __tsan_atomic32_compare_exchange_weak(volatile a32 *a, a32 *c, a32 v,
+                                          morder mo, morder fmo);
+SANITIZER_INTERFACE_ATTRIBUTE
+int __tsan_atomic64_compare_exchange_weak(volatile a64 *a, a64 *c, a64 v,
+                                          morder mo, morder fmo);
+#if __TSAN_HAS_INT128
+SANITIZER_INTERFACE_ATTRIBUTE
+int __tsan_atomic128_compare_exchange_weak(volatile a128 *a, a128 *c, a128 v,
+                                           morder mo, morder fmo);
+#endif
+
+SANITIZER_INTERFACE_ATTRIBUTE
+a8 __tsan_atomic8_compare_exchange_val(volatile a8 *a, a8 c, a8 v, morder mo,
+                                       morder fmo);
+SANITIZER_INTERFACE_ATTRIBUTE
+a16 __tsan_atomic16_compare_exchange_val(volatile a16 *a, a16 c, a16 v,
+                                         morder mo, morder fmo);
+SANITIZER_INTERFACE_ATTRIBUTE
+a32 __tsan_atomic32_compare_exchange_val(volatile a32 *a, a32 c, a32 v,
+                                         morder mo, morder fmo);
+SANITIZER_INTERFACE_ATTRIBUTE
+a64 __tsan_atomic64_compare_exchange_val(volatile a64 *a, a64 c, a64 v,
+                                         morder mo, morder fmo);
+#if __TSAN_HAS_INT128
+SANITIZER_INTERFACE_ATTRIBUTE
+a128 __tsan_atomic128_compare_exchange_val(volatile a128 *a, a128 c, a128 v,
+                                           morder mo, morder fmo);
+#endif
+
+SANITIZER_INTERFACE_ATTRIBUTE
+void __tsan_atomic_thread_fence(morder mo);
+SANITIZER_INTERFACE_ATTRIBUTE
+void __tsan_atomic_signal_fence(morder mo);
+
+SANITIZER_INTERFACE_ATTRIBUTE
+void __tsan_go_atomic32_load(ThreadState *thr, uptr cpc, uptr pc, u8 *a);
+SANITIZER_INTERFACE_ATTRIBUTE
+void __tsan_go_atomic64_load(ThreadState *thr, uptr cpc, uptr pc, u8 *a);
+SANITIZER_INTERFACE_ATTRIBUTE
+void __tsan_go_atomic32_store(ThreadState *thr, uptr cpc, uptr pc, u8 *a);
+SANITIZER_INTERFACE_ATTRIBUTE
+void __tsan_go_atomic64_store(ThreadState *thr, uptr cpc, uptr pc, u8 *a);
+SANITIZER_INTERFACE_ATTRIBUTE
+void __tsan_go_atomic32_fetch_add(ThreadState *thr, uptr cpc, uptr pc, u8 *a);
+SANITIZER_INTERFACE_ATTRIBUTE
+void __tsan_go_atomic64_fetch_add(ThreadState *thr, uptr cpc, uptr pc, u8 *a);
+SANITIZER_INTERFACE_ATTRIBUTE
+void __tsan_go_atomic32_exchange(ThreadState *thr, uptr cpc, uptr pc, u8 *a);
+SANITIZER_INTERFACE_ATTRIBUTE
+void __tsan_go_atomic64_exchange(ThreadState *thr, uptr cpc, uptr pc, u8 *a);
+SANITIZER_INTERFACE_ATTRIBUTE
+void __tsan_go_atomic32_compare_exchange(ThreadState *thr, uptr cpc, uptr pc,
+                                         u8 *a);
+SANITIZER_INTERFACE_ATTRIBUTE
+void __tsan_go_atomic64_compare_exchange(ThreadState *thr, uptr cpc, uptr pc,
+                                         u8 *a);
+}  // extern "C"
+
+}  // namespace __tsan
+
 #endif  // TSAN_INTERFACE_H
diff --git a/lib/tsan/rtl/tsan_interface_atomic.cc b/lib/tsan/rtl/tsan_interface_atomic.cc
index 2703199..dc0873f 100644
--- a/lib/tsan/rtl/tsan_interface_atomic.cc
+++ b/lib/tsan/rtl/tsan_interface_atomic.cc
@@ -23,39 +23,16 @@
 #include "sanitizer_common/sanitizer_stacktrace.h"
 #include "sanitizer_common/sanitizer_mutex.h"
 #include "tsan_flags.h"
+#include "tsan_interface.h"
 #include "tsan_rtl.h"
 
 using namespace __tsan;  // NOLINT
 
-// These should match declarations from public tsan_interface_atomic.h header.
-typedef unsigned char      a8;
-typedef unsigned short     a16;  // NOLINT
-typedef unsigned int       a32;
-typedef unsigned long long a64;  // NOLINT
-#if !defined(SANITIZER_GO) && (defined(__SIZEOF_INT128__) \
-    || (__clang_major__ * 100 + __clang_minor__ >= 302)) && !defined(__mips64)
-__extension__ typedef __int128 a128;
-# define __TSAN_HAS_INT128 1
-#else
-# define __TSAN_HAS_INT128 0
-#endif
-
 #if !defined(SANITIZER_GO) && __TSAN_HAS_INT128
 // Protects emulation of 128-bit atomic operations.
 static StaticSpinMutex mutex128;
 #endif
 
-// Part of ABI, do not change.
-// http://llvm.org/viewvc/llvm-project/libcxx/trunk/include/atomic?view=markup
-typedef enum {
-  mo_relaxed,
-  mo_consume,
-  mo_acquire,
-  mo_release,
-  mo_acq_rel,
-  mo_seq_cst
-} morder;
-
 static bool IsLoadOrder(morder mo) {
   return mo == mo_relaxed || mo == mo_consume
       || mo == mo_acquire || mo == mo_seq_cst;
diff --git a/lib/tsan/rtl/tsan_interface_java.cc b/lib/tsan/rtl/tsan_interface_java.cc
index 0aea63d..95be859 100644
--- a/lib/tsan/rtl/tsan_interface_java.cc
+++ b/lib/tsan/rtl/tsan_interface_java.cc
@@ -111,7 +111,7 @@
   CHECK_GE(ptr, jctx->heap_begin);
   CHECK_LE(ptr + size, jctx->heap_begin + jctx->heap_size);
 
-  ctx->metamap.FreeRange(thr, pc, ptr, size);
+  ctx->metamap.FreeRange(thr->proc(), ptr, size);
 }
 
 void __tsan_java_move(jptr src, jptr dst, jptr size) {
diff --git a/lib/tsan/rtl/tsan_libdispatch_mac.cc b/lib/tsan/rtl/tsan_libdispatch_mac.cc
index 617dc91..529cedb 100644
--- a/lib/tsan/rtl/tsan_libdispatch_mac.cc
+++ b/lib/tsan/rtl/tsan_libdispatch_mac.cc
@@ -33,8 +33,10 @@
   dispatch_queue_t queue;
   void *orig_context;
   dispatch_function_t orig_work;
-  uptr object_to_acquire;
-  dispatch_object_t object_to_release;
+  bool free_context_in_callback;
+  bool submitted_synchronously;
+  bool is_barrier_block;
+  uptr non_queue_sync_object;
 } tsan_block_context_t;
 
 // The offsets of different fields of the dispatch_queue_t structure, exported
@@ -66,6 +68,15 @@
   return width == 1;
 }
 
+static dispatch_queue_t GetTargetQueueFromSource(dispatch_source_t source) {
+  CHECK_EQ(dispatch_queue_offsets.dqo_target_queue_size, 8);
+  dispatch_queue_t target_queue =
+      *(dispatch_queue_t *)(((uptr)source) +
+                            dispatch_queue_offsets.dqo_target_queue);
+  CHECK_NE(target_queue, 0);
+  return target_queue;
+}
+
 static tsan_block_context_t *AllocContext(ThreadState *thr, uptr pc,
                                           dispatch_queue_t queue,
                                           void *orig_context,
@@ -75,30 +86,40 @@
   new_context->queue = queue;
   new_context->orig_context = orig_context;
   new_context->orig_work = orig_work;
-  new_context->object_to_acquire = (uptr)new_context;
-  new_context->object_to_release = nullptr;
+  new_context->free_context_in_callback = true;
+  new_context->submitted_synchronously = false;
+  new_context->is_barrier_block = false;
   return new_context;
 }
 
-static void dispatch_callback_wrap_acquire(void *param) {
-  SCOPED_INTERCEPTOR_RAW(dispatch_async_f_callback_wrap);
+static void dispatch_callback_wrap(void *param) {
+  SCOPED_INTERCEPTOR_RAW(dispatch_callback_wrap);
   tsan_block_context_t *context = (tsan_block_context_t *)param;
-  Acquire(thr, pc, context->object_to_acquire);
+  bool is_queue_serial = context->queue && IsQueueSerial(context->queue);
+  uptr sync_ptr = (uptr)context->queue ?: context->non_queue_sync_object;
 
-  // Extra retain/release is required for dispatch groups. We use the group
-  // itself to synchronize, but in a notification (dispatch_group_notify
-  // callback), it may be disposed already. To solve this, we retain the group
-  // and release it here.
-  if (context->object_to_release) dispatch_release(context->object_to_release);
+  uptr serial_sync = (uptr)sync_ptr;
+  uptr concurrent_sync = ((uptr)sync_ptr) + sizeof(uptr);
+  uptr submit_sync = (uptr)context;
+  bool serial_task = context->is_barrier_block || is_queue_serial;
 
-  // In serial queues, work items can be executed on different threads, we need
-  // to explicitly synchronize on the queue itself.
-  if (IsQueueSerial(context->queue)) Acquire(thr, pc, (uptr)context->queue);
+  Acquire(thr, pc, submit_sync);
+  Acquire(thr, pc, serial_sync);
+  if (serial_task) Acquire(thr, pc, concurrent_sync);
+
   SCOPED_TSAN_INTERCEPTOR_USER_CALLBACK_START();
   context->orig_work(context->orig_context);
   SCOPED_TSAN_INTERCEPTOR_USER_CALLBACK_END();
-  if (IsQueueSerial(context->queue)) Release(thr, pc, (uptr)context->queue);
-  user_free(thr, pc, context);
+
+  Release(thr, pc, serial_task ? serial_sync : concurrent_sync);
+  if (context->submitted_synchronously) Release(thr, pc, submit_sync);
+
+  if (context->free_context_in_callback) user_free(thr, pc, context);
+}
+
+static void invoke_block(void *param) {
+  dispatch_block_t block = (dispatch_block_t)param;
+  block();
 }
 
 static void invoke_and_release_block(void *param) {
@@ -107,44 +128,97 @@
   Block_release(block);
 }
 
-#define DISPATCH_INTERCEPT_B(name)                                           \
+#define DISPATCH_INTERCEPT_B(name, barrier)                                  \
   TSAN_INTERCEPTOR(void, name, dispatch_queue_t q, dispatch_block_t block) { \
     SCOPED_TSAN_INTERCEPTOR(name, q, block);                                 \
-    SCOPED_TSAN_INTERCEPTOR_USER_CALLBACK_START(); \
+    SCOPED_TSAN_INTERCEPTOR_USER_CALLBACK_START();                           \
     dispatch_block_t heap_block = Block_copy(block);                         \
-    SCOPED_TSAN_INTERCEPTOR_USER_CALLBACK_END(); \
+    SCOPED_TSAN_INTERCEPTOR_USER_CALLBACK_END();                             \
     tsan_block_context_t *new_context =                                      \
         AllocContext(thr, pc, q, heap_block, &invoke_and_release_block);     \
+    new_context->is_barrier_block = barrier;                                 \
     Release(thr, pc, (uptr)new_context);                                     \
-    SCOPED_TSAN_INTERCEPTOR_USER_CALLBACK_START(); \
-    REAL(name##_f)(q, new_context, dispatch_callback_wrap_acquire);          \
-    SCOPED_TSAN_INTERCEPTOR_USER_CALLBACK_END(); \
+    SCOPED_TSAN_INTERCEPTOR_USER_CALLBACK_START();                           \
+    REAL(name##_f)(q, new_context, dispatch_callback_wrap);                  \
+    SCOPED_TSAN_INTERCEPTOR_USER_CALLBACK_END();                             \
   }
 
-#define DISPATCH_INTERCEPT_F(name)                                \
+#define DISPATCH_INTERCEPT_SYNC_B(name, barrier)                             \
+  TSAN_INTERCEPTOR(void, name, dispatch_queue_t q, dispatch_block_t block) { \
+    SCOPED_TSAN_INTERCEPTOR(name, q, block);                                 \
+    SCOPED_TSAN_INTERCEPTOR_USER_CALLBACK_START();                           \
+    dispatch_block_t heap_block = Block_copy(block);                         \
+    SCOPED_TSAN_INTERCEPTOR_USER_CALLBACK_END();                             \
+    tsan_block_context_t new_context = {                                     \
+        q, heap_block, &invoke_and_release_block, false, true, barrier, 0};  \
+    Release(thr, pc, (uptr)&new_context);                                    \
+    SCOPED_TSAN_INTERCEPTOR_USER_CALLBACK_START();                           \
+    REAL(name##_f)(q, &new_context, dispatch_callback_wrap);                 \
+    SCOPED_TSAN_INTERCEPTOR_USER_CALLBACK_END();                             \
+    Acquire(thr, pc, (uptr)&new_context);                                    \
+  }
+
+#define DISPATCH_INTERCEPT_F(name, barrier)                       \
   TSAN_INTERCEPTOR(void, name, dispatch_queue_t q, void *context, \
                    dispatch_function_t work) {                    \
     SCOPED_TSAN_INTERCEPTOR(name, q, context, work);              \
     tsan_block_context_t *new_context =                           \
         AllocContext(thr, pc, q, context, work);                  \
+    new_context->is_barrier_block = barrier;                      \
     Release(thr, pc, (uptr)new_context);                          \
-    SCOPED_TSAN_INTERCEPTOR_USER_CALLBACK_START(); \
-    REAL(name)(q, new_context, dispatch_callback_wrap_acquire);   \
-    SCOPED_TSAN_INTERCEPTOR_USER_CALLBACK_END(); \
+    SCOPED_TSAN_INTERCEPTOR_USER_CALLBACK_START();                \
+    REAL(name)(q, new_context, dispatch_callback_wrap);           \
+    SCOPED_TSAN_INTERCEPTOR_USER_CALLBACK_END();                  \
+  }
+
+#define DISPATCH_INTERCEPT_SYNC_F(name, barrier)                              \
+  TSAN_INTERCEPTOR(void, name, dispatch_queue_t q, void *context,             \
+                   dispatch_function_t work) {                                \
+    SCOPED_TSAN_INTERCEPTOR(name, q, context, work);                          \
+    tsan_block_context_t new_context = {                                      \
+        q, context, work, false, true, barrier, 0};                           \
+    Release(thr, pc, (uptr)&new_context);                                     \
+    SCOPED_TSAN_INTERCEPTOR_USER_CALLBACK_START();                            \
+    REAL(name)(q, &new_context, dispatch_callback_wrap);                      \
+    SCOPED_TSAN_INTERCEPTOR_USER_CALLBACK_END();                              \
+    Acquire(thr, pc, (uptr)&new_context);                                     \
   }
 
 // We wrap dispatch_async, dispatch_sync and friends where we allocate a new
 // context, which is used to synchronize (we release the context before
 // submitting, and the callback acquires it before executing the original
 // callback).
-DISPATCH_INTERCEPT_B(dispatch_async)
-DISPATCH_INTERCEPT_B(dispatch_barrier_async)
-DISPATCH_INTERCEPT_F(dispatch_async_f)
-DISPATCH_INTERCEPT_F(dispatch_barrier_async_f)
-DISPATCH_INTERCEPT_B(dispatch_sync)
-DISPATCH_INTERCEPT_B(dispatch_barrier_sync)
-DISPATCH_INTERCEPT_F(dispatch_sync_f)
-DISPATCH_INTERCEPT_F(dispatch_barrier_sync_f)
+DISPATCH_INTERCEPT_B(dispatch_async, false)
+DISPATCH_INTERCEPT_B(dispatch_barrier_async, true)
+DISPATCH_INTERCEPT_F(dispatch_async_f, false)
+DISPATCH_INTERCEPT_F(dispatch_barrier_async_f, true)
+DISPATCH_INTERCEPT_SYNC_B(dispatch_sync, false)
+DISPATCH_INTERCEPT_SYNC_B(dispatch_barrier_sync, true)
+DISPATCH_INTERCEPT_SYNC_F(dispatch_sync_f, false)
+DISPATCH_INTERCEPT_SYNC_F(dispatch_barrier_sync_f, true)
+
+TSAN_INTERCEPTOR(void, dispatch_after, dispatch_time_t when,
+                 dispatch_queue_t queue, dispatch_block_t block) {
+  SCOPED_TSAN_INTERCEPTOR(dispatch_after, when, queue, block);
+  SCOPED_TSAN_INTERCEPTOR_USER_CALLBACK_START();
+  dispatch_block_t heap_block = Block_copy(block);
+  SCOPED_TSAN_INTERCEPTOR_USER_CALLBACK_END();
+  tsan_block_context_t *new_context =
+      AllocContext(thr, pc, queue, heap_block, &invoke_and_release_block);
+  Release(thr, pc, (uptr)new_context);
+  SCOPED_TSAN_INTERCEPTOR_USER_CALLBACK_START();
+  REAL(dispatch_after_f)(when, queue, new_context, dispatch_callback_wrap);
+  SCOPED_TSAN_INTERCEPTOR_USER_CALLBACK_END();
+}
+
+TSAN_INTERCEPTOR(void, dispatch_after_f, dispatch_time_t when,
+                 dispatch_queue_t queue, void *context,
+                 dispatch_function_t work) {
+  SCOPED_TSAN_INTERCEPTOR(dispatch_after_f, when, queue, context, work);
+  WRAP(dispatch_after)(when, queue, ^(void) {
+    work(context);
+  });
+}
 
 // GCD's dispatch_once implementation has a fast path that contains a racy read
 // and it's inlined into user's code. Furthermore, this fast path doesn't
@@ -161,7 +235,7 @@
 #undef dispatch_once
 TSAN_INTERCEPTOR(void, dispatch_once, dispatch_once_t *predicate,
                  dispatch_block_t block) {
-  SCOPED_TSAN_INTERCEPTOR(dispatch_once, predicate, block);
+  SCOPED_INTERCEPTOR_RAW(dispatch_once, predicate, block);
   atomic_uint32_t *a = reinterpret_cast<atomic_uint32_t *>(predicate);
   u32 v = atomic_load(a, memory_order_acquire);
   if (v == 0 &&
@@ -183,7 +257,7 @@
 #undef dispatch_once_f
 TSAN_INTERCEPTOR(void, dispatch_once_f, dispatch_once_t *predicate,
                  void *context, dispatch_function_t function) {
-  SCOPED_TSAN_INTERCEPTOR(dispatch_once_f, predicate, context, function);
+  SCOPED_INTERCEPTOR_RAW(dispatch_once_f, predicate, context, function);
   SCOPED_TSAN_INTERCEPTOR_USER_CALLBACK_START();
   WRAP(dispatch_once)(predicate, ^(void) {
     function(context);
@@ -216,6 +290,7 @@
 
 TSAN_INTERCEPTOR(void, dispatch_group_leave, dispatch_group_t group) {
   SCOPED_TSAN_INTERCEPTOR(dispatch_group_leave, group);
+  // Acquired in the group noticifaction callback in dispatch_group_notify[_f].
   Release(thr, pc, (uptr)group);
   REAL(dispatch_group_leave)(group);
 }
@@ -225,8 +300,10 @@
   SCOPED_TSAN_INTERCEPTOR(dispatch_group_async, group, queue, block);
   dispatch_retain(group);
   dispatch_group_enter(group);
+  __block dispatch_block_t block_copy = (dispatch_block_t)_Block_copy(block);
   WRAP(dispatch_async)(queue, ^(void) {
-    block();
+    block_copy();
+    _Block_release(block_copy);
     WRAP(dispatch_group_leave)(group);
     dispatch_release(group);
   });
@@ -248,35 +325,355 @@
 TSAN_INTERCEPTOR(void, dispatch_group_notify, dispatch_group_t group,
                  dispatch_queue_t q, dispatch_block_t block) {
   SCOPED_TSAN_INTERCEPTOR(dispatch_group_notify, group, q, block);
+
+  // To make sure the group is still available in the callback (otherwise
+  // it can be already destroyed).  Will be released in the callback.
+  dispatch_retain(group);
+
   SCOPED_TSAN_INTERCEPTOR_USER_CALLBACK_START();
-  dispatch_block_t heap_block = Block_copy(block);
+  dispatch_block_t heap_block = Block_copy(^(void) {
+    {
+      SCOPED_INTERCEPTOR_RAW(dispatch_read_callback);
+      // Released when leaving the group (dispatch_group_leave).
+      Acquire(thr, pc, (uptr)group);
+    }
+    dispatch_release(group);
+    block();
+  });
   SCOPED_TSAN_INTERCEPTOR_USER_CALLBACK_END();
   tsan_block_context_t *new_context =
       AllocContext(thr, pc, q, heap_block, &invoke_and_release_block);
-  new_context->object_to_acquire = (uptr)group;
-
-  // Will be released in dispatch_callback_wrap_acquire.
-  new_context->object_to_release = group;
-  dispatch_retain(group);
-
-  Release(thr, pc, (uptr)group);
-  REAL(dispatch_group_notify_f)(group, q, new_context,
-                                dispatch_callback_wrap_acquire);
+  new_context->is_barrier_block = true;
+  Release(thr, pc, (uptr)new_context);
+  REAL(dispatch_group_notify_f)(group, q, new_context, dispatch_callback_wrap);
 }
 
 TSAN_INTERCEPTOR(void, dispatch_group_notify_f, dispatch_group_t group,
                  dispatch_queue_t q, void *context, dispatch_function_t work) {
-  SCOPED_TSAN_INTERCEPTOR(dispatch_group_notify_f, group, q, context, work);
-  tsan_block_context_t *new_context = AllocContext(thr, pc, q, context, work);
-  new_context->object_to_acquire = (uptr)group;
+  WRAP(dispatch_group_notify)(group, q, ^(void) { work(context); });
+}
 
-  // Will be released in dispatch_callback_wrap_acquire.
-  new_context->object_to_release = group;
-  dispatch_retain(group);
+TSAN_INTERCEPTOR(void, dispatch_source_set_event_handler,
+                 dispatch_source_t source, dispatch_block_t handler) {
+  SCOPED_TSAN_INTERCEPTOR(dispatch_source_set_event_handler, source, handler);
+  if (handler == nullptr)
+    return REAL(dispatch_source_set_event_handler)(source, nullptr);
+  dispatch_queue_t q = GetTargetQueueFromSource(source);
+  __block tsan_block_context_t new_context = {
+      q, handler, &invoke_block, false, false, false, 0 };
+  dispatch_block_t new_handler = Block_copy(^(void) {
+    new_context.orig_context = handler;  // To explicitly capture "handler".
+    dispatch_callback_wrap(&new_context);
+  });
+  uptr submit_sync = (uptr)&new_context;
+  Release(thr, pc, submit_sync);
+  REAL(dispatch_source_set_event_handler)(source, new_handler);
+  Block_release(new_handler);
+}
 
-  Release(thr, pc, (uptr)group);
-  REAL(dispatch_group_notify_f)(group, q, new_context,
-                                dispatch_callback_wrap_acquire);
+TSAN_INTERCEPTOR(void, dispatch_source_set_event_handler_f,
+                 dispatch_source_t source, dispatch_function_t handler) {
+  SCOPED_TSAN_INTERCEPTOR(dispatch_source_set_event_handler_f, source, handler);
+  if (handler == nullptr)
+    return REAL(dispatch_source_set_event_handler)(source, nullptr);
+  dispatch_block_t block = ^(void) {
+    handler(dispatch_get_context(source));
+  };
+  WRAP(dispatch_source_set_event_handler)(source, block);
+}
+
+TSAN_INTERCEPTOR(void, dispatch_source_set_cancel_handler,
+                 dispatch_source_t source, dispatch_block_t handler) {
+  SCOPED_TSAN_INTERCEPTOR(dispatch_source_set_cancel_handler, source, handler);
+  if (handler == nullptr)
+    return REAL(dispatch_source_set_cancel_handler)(source, nullptr);
+  dispatch_queue_t q = GetTargetQueueFromSource(source);
+  __block tsan_block_context_t new_context = {
+      q, handler, &invoke_block, false, false, false, 0};
+  dispatch_block_t new_handler = Block_copy(^(void) {
+    new_context.orig_context = handler;  // To explicitly capture "handler".
+    dispatch_callback_wrap(&new_context);
+  });
+  uptr submit_sync = (uptr)&new_context;
+  Release(thr, pc, submit_sync);
+  REAL(dispatch_source_set_cancel_handler)(source, new_handler);
+  Block_release(new_handler);
+}
+
+TSAN_INTERCEPTOR(void, dispatch_source_set_cancel_handler_f,
+                 dispatch_source_t source, dispatch_function_t handler) {
+  SCOPED_TSAN_INTERCEPTOR(dispatch_source_set_cancel_handler_f, source,
+                          handler);
+  if (handler == nullptr)
+    return REAL(dispatch_source_set_cancel_handler)(source, nullptr);
+  dispatch_block_t block = ^(void) {
+    handler(dispatch_get_context(source));
+  };
+  WRAP(dispatch_source_set_cancel_handler)(source, block);
+}
+
+TSAN_INTERCEPTOR(void, dispatch_source_set_registration_handler,
+                 dispatch_source_t source, dispatch_block_t handler) {
+  SCOPED_TSAN_INTERCEPTOR(dispatch_source_set_registration_handler, source,
+                          handler);
+  if (handler == nullptr)
+    return REAL(dispatch_source_set_registration_handler)(source, nullptr);
+  dispatch_queue_t q = GetTargetQueueFromSource(source);
+  __block tsan_block_context_t new_context = {
+      q, handler, &invoke_block, false, false, false, 0};
+  dispatch_block_t new_handler = Block_copy(^(void) {
+    new_context.orig_context = handler;  // To explicitly capture "handler".
+    dispatch_callback_wrap(&new_context);
+  });
+  uptr submit_sync = (uptr)&new_context;
+  Release(thr, pc, submit_sync);
+  REAL(dispatch_source_set_registration_handler)(source, new_handler);
+  Block_release(new_handler);
+}
+
+TSAN_INTERCEPTOR(void, dispatch_source_set_registration_handler_f,
+                 dispatch_source_t source, dispatch_function_t handler) {
+  SCOPED_TSAN_INTERCEPTOR(dispatch_source_set_registration_handler_f, source,
+                          handler);
+  if (handler == nullptr)
+    return REAL(dispatch_source_set_registration_handler)(source, nullptr);
+  dispatch_block_t block = ^(void) {
+    handler(dispatch_get_context(source));
+  };
+  WRAP(dispatch_source_set_registration_handler)(source, block);
+}
+
+TSAN_INTERCEPTOR(void, dispatch_apply, size_t iterations,
+                 dispatch_queue_t queue, void (^block)(size_t)) {
+  SCOPED_TSAN_INTERCEPTOR(dispatch_apply, iterations, queue, block);
+
+  void *parent_to_child_sync = nullptr;
+  uptr parent_to_child_sync_uptr = (uptr)&parent_to_child_sync;
+  void *child_to_parent_sync = nullptr;
+  uptr child_to_parent_sync_uptr = (uptr)&child_to_parent_sync;
+
+  Release(thr, pc, parent_to_child_sync_uptr);
+  void (^new_block)(size_t) = ^(size_t iteration) {
+    SCOPED_INTERCEPTOR_RAW(dispatch_apply);
+    Acquire(thr, pc, parent_to_child_sync_uptr);
+    SCOPED_TSAN_INTERCEPTOR_USER_CALLBACK_START();
+    block(iteration);
+    SCOPED_TSAN_INTERCEPTOR_USER_CALLBACK_END();
+    Release(thr, pc, child_to_parent_sync_uptr);
+  };
+  SCOPED_TSAN_INTERCEPTOR_USER_CALLBACK_START();
+  REAL(dispatch_apply)(iterations, queue, new_block);
+  SCOPED_TSAN_INTERCEPTOR_USER_CALLBACK_END();
+  Acquire(thr, pc, child_to_parent_sync_uptr);
+}
+
+TSAN_INTERCEPTOR(void, dispatch_apply_f, size_t iterations,
+                 dispatch_queue_t queue, void *context,
+                 void (*work)(void *, size_t)) {
+  SCOPED_TSAN_INTERCEPTOR(dispatch_apply_f, iterations, queue, context, work);
+  void (^new_block)(size_t) = ^(size_t iteration) {
+    work(context, iteration);
+  };
+  WRAP(dispatch_apply)(iterations, queue, new_block);
+}
+
+DECLARE_REAL_AND_INTERCEPTOR(void, free, void *ptr)
+DECLARE_REAL_AND_INTERCEPTOR(int, munmap, void *addr, long_t sz)
+
+TSAN_INTERCEPTOR(dispatch_data_t, dispatch_data_create, const void *buffer,
+                 size_t size, dispatch_queue_t q, dispatch_block_t destructor) {
+  SCOPED_TSAN_INTERCEPTOR(dispatch_data_create, buffer, size, q, destructor);
+  if ((q == nullptr) || (destructor == DISPATCH_DATA_DESTRUCTOR_DEFAULT))
+    return REAL(dispatch_data_create)(buffer, size, q, destructor);
+
+  if (destructor == DISPATCH_DATA_DESTRUCTOR_FREE)
+    destructor = ^(void) { WRAP(free)((void *)buffer); };
+  else if (destructor == DISPATCH_DATA_DESTRUCTOR_MUNMAP)
+    destructor = ^(void) { WRAP(munmap)((void *)buffer, size); };
+
+  SCOPED_TSAN_INTERCEPTOR_USER_CALLBACK_START();
+  dispatch_block_t heap_block = Block_copy(destructor);
+  SCOPED_TSAN_INTERCEPTOR_USER_CALLBACK_END();
+  tsan_block_context_t *new_context =
+      AllocContext(thr, pc, q, heap_block, &invoke_and_release_block);
+  uptr submit_sync = (uptr)new_context;
+  Release(thr, pc, submit_sync);
+  return REAL(dispatch_data_create)(buffer, size, q, ^(void) {
+    dispatch_callback_wrap(new_context);
+  });
+}
+
+typedef void (^fd_handler_t)(dispatch_data_t data, int error);
+typedef void (^cleanup_handler_t)(int error);
+
+TSAN_INTERCEPTOR(void, dispatch_read, dispatch_fd_t fd, size_t length,
+                 dispatch_queue_t q, fd_handler_t h) {
+  SCOPED_TSAN_INTERCEPTOR(dispatch_read, fd, length, q, h);
+  __block tsan_block_context_t new_context = {
+      q, nullptr, &invoke_block, false, false, false, 0};
+  fd_handler_t new_h = Block_copy(^(dispatch_data_t data, int error) {
+    new_context.orig_context = ^(void) {
+      h(data, error);
+    };
+    dispatch_callback_wrap(&new_context);
+  });
+  uptr submit_sync = (uptr)&new_context;
+  Release(thr, pc, submit_sync);
+  REAL(dispatch_read)(fd, length, q, new_h);
+  Block_release(new_h);
+}
+
+TSAN_INTERCEPTOR(void, dispatch_write, dispatch_fd_t fd, dispatch_data_t data,
+                 dispatch_queue_t q, fd_handler_t h) {
+  SCOPED_TSAN_INTERCEPTOR(dispatch_write, fd, data, q, h);
+  __block tsan_block_context_t new_context = {
+      q, nullptr, &invoke_block, false, false, false, 0};
+  fd_handler_t new_h = Block_copy(^(dispatch_data_t data, int error) {
+    new_context.orig_context = ^(void) {
+      h(data, error);
+    };
+    dispatch_callback_wrap(&new_context);
+  });
+  uptr submit_sync = (uptr)&new_context;
+  Release(thr, pc, submit_sync);
+  REAL(dispatch_write)(fd, data, q, new_h);
+  Block_release(new_h);
+}
+
+TSAN_INTERCEPTOR(void, dispatch_io_read, dispatch_io_t channel, off_t offset,
+                 size_t length, dispatch_queue_t q, dispatch_io_handler_t h) {
+  SCOPED_TSAN_INTERCEPTOR(dispatch_io_read, channel, offset, length, q, h);
+  __block tsan_block_context_t new_context = {
+      q, nullptr, &invoke_block, false, false, false, 0};
+  dispatch_io_handler_t new_h =
+      Block_copy(^(bool done, dispatch_data_t data, int error) {
+        new_context.orig_context = ^(void) {
+          h(done, data, error);
+        };
+        dispatch_callback_wrap(&new_context);
+      });
+  uptr submit_sync = (uptr)&new_context;
+  Release(thr, pc, submit_sync);
+  REAL(dispatch_io_read)(channel, offset, length, q, new_h);
+  Block_release(new_h);
+}
+
+TSAN_INTERCEPTOR(void, dispatch_io_write, dispatch_io_t channel, off_t offset,
+                 dispatch_data_t data, dispatch_queue_t q,
+                 dispatch_io_handler_t h) {
+  SCOPED_TSAN_INTERCEPTOR(dispatch_io_write, channel, offset, data, q, h);
+  __block tsan_block_context_t new_context = {
+      q, nullptr, &invoke_block, false, false, false, 0};
+  dispatch_io_handler_t new_h =
+      Block_copy(^(bool done, dispatch_data_t data, int error) {
+        new_context.orig_context = ^(void) {
+          h(done, data, error);
+        };
+        dispatch_callback_wrap(&new_context);
+      });
+  uptr submit_sync = (uptr)&new_context;
+  Release(thr, pc, submit_sync);
+  REAL(dispatch_io_write)(channel, offset, data, q, new_h);
+  Block_release(new_h);
+}
+
+TSAN_INTERCEPTOR(void, dispatch_io_barrier, dispatch_io_t channel,
+                 dispatch_block_t barrier) {
+  SCOPED_TSAN_INTERCEPTOR(dispatch_io_barrier, channel, barrier);
+  __block tsan_block_context_t new_context = {
+      nullptr, nullptr, &invoke_block, false, false, false, 0};
+  new_context.non_queue_sync_object = (uptr)channel;
+  new_context.is_barrier_block = true;
+  dispatch_block_t new_block = Block_copy(^(void) {
+    new_context.orig_context = ^(void) {
+      barrier();
+    };
+    dispatch_callback_wrap(&new_context);
+  });
+  uptr submit_sync = (uptr)&new_context;
+  Release(thr, pc, submit_sync);
+  REAL(dispatch_io_barrier)(channel, new_block);
+  Block_release(new_block);
+}
+
+TSAN_INTERCEPTOR(dispatch_io_t, dispatch_io_create, dispatch_io_type_t type,
+                 dispatch_fd_t fd, dispatch_queue_t q, cleanup_handler_t h) {
+  SCOPED_TSAN_INTERCEPTOR(dispatch_io_create, type, fd, q, h);
+  __block dispatch_io_t new_channel = nullptr;
+  __block tsan_block_context_t new_context = {
+      q, nullptr, &invoke_block, false, false, false, 0};
+  cleanup_handler_t new_h = Block_copy(^(int error) {
+    {
+      SCOPED_INTERCEPTOR_RAW(dispatch_io_create_callback);
+      Acquire(thr, pc, (uptr)new_channel);  // Release() in dispatch_io_close.
+    }
+    new_context.orig_context = ^(void) {
+      h(error);
+    };
+    dispatch_callback_wrap(&new_context);
+  });
+  uptr submit_sync = (uptr)&new_context;
+  Release(thr, pc, submit_sync);
+  new_channel = REAL(dispatch_io_create)(type, fd, q, new_h);
+  Block_release(new_h);
+  return new_channel;
+}
+
+TSAN_INTERCEPTOR(dispatch_io_t, dispatch_io_create_with_path,
+                 dispatch_io_type_t type, const char *path, int oflag,
+                 mode_t mode, dispatch_queue_t q, cleanup_handler_t h) {
+  SCOPED_TSAN_INTERCEPTOR(dispatch_io_create_with_path, type, path, oflag, mode,
+                          q, h);
+  __block dispatch_io_t new_channel = nullptr;
+  __block tsan_block_context_t new_context = {
+      q, nullptr, &invoke_block, false, false, false, 0};
+  cleanup_handler_t new_h = Block_copy(^(int error) {
+    {
+      SCOPED_INTERCEPTOR_RAW(dispatch_io_create_callback);
+      Acquire(thr, pc, (uptr)new_channel);  // Release() in dispatch_io_close.
+    }
+    new_context.orig_context = ^(void) {
+      h(error);
+    };
+    dispatch_callback_wrap(&new_context);
+  });
+  uptr submit_sync = (uptr)&new_context;
+  Release(thr, pc, submit_sync);
+  new_channel =
+      REAL(dispatch_io_create_with_path)(type, path, oflag, mode, q, new_h);
+  Block_release(new_h);
+  return new_channel;
+}
+
+TSAN_INTERCEPTOR(dispatch_io_t, dispatch_io_create_with_io,
+                 dispatch_io_type_t type, dispatch_io_t io, dispatch_queue_t q,
+                 cleanup_handler_t h) {
+  SCOPED_TSAN_INTERCEPTOR(dispatch_io_create_with_io, type, io, q, h);
+  __block dispatch_io_t new_channel = nullptr;
+  __block tsan_block_context_t new_context = {
+      q, nullptr, &invoke_block, false, false, false, 0};
+  cleanup_handler_t new_h = Block_copy(^(int error) {
+    {
+      SCOPED_INTERCEPTOR_RAW(dispatch_io_create_callback);
+      Acquire(thr, pc, (uptr)new_channel);  // Release() in dispatch_io_close.
+    }
+    new_context.orig_context = ^(void) {
+      h(error);
+    };
+    dispatch_callback_wrap(&new_context);
+  });
+  uptr submit_sync = (uptr)&new_context;
+  Release(thr, pc, submit_sync);
+  new_channel = REAL(dispatch_io_create_with_io)(type, io, q, new_h);
+  Block_release(new_h);
+  return new_channel;
+}
+
+TSAN_INTERCEPTOR(void, dispatch_io_close, dispatch_io_t channel,
+                 dispatch_io_close_flags_t flags) {
+  SCOPED_TSAN_INTERCEPTOR(dispatch_io_close, channel, flags);
+  Release(thr, pc, (uptr)channel);  // Acquire() in dispatch_io_create[_*].
+  return REAL(dispatch_io_close)(channel, flags);
 }
 
 }  // namespace __tsan
diff --git a/lib/tsan/rtl/tsan_malloc_mac.cc b/lib/tsan/rtl/tsan_malloc_mac.cc
index 7fd9427..8d31ccb 100644
--- a/lib/tsan/rtl/tsan_malloc_mac.cc
+++ b/lib/tsan/rtl/tsan_malloc_mac.cc
@@ -27,33 +27,28 @@
 #define COMMON_MALLOC_MEMALIGN(alignment, size) \
   void *p =                                     \
       user_alloc(cur_thread(), StackTrace::GetCurrentPc(), size, alignment)
-#define COMMON_MALLOC_MALLOC(size)      \
-  if (cur_thread()->in_symbolizer)      \
-    return REAL(malloc)(size);          \
-  SCOPED_INTERCEPTOR_RAW(malloc, size); \
+#define COMMON_MALLOC_MALLOC(size)                             \
+  if (cur_thread()->in_symbolizer) return InternalAlloc(size); \
+  SCOPED_INTERCEPTOR_RAW(malloc, size);                        \
   void *p = user_alloc(thr, pc, size)
-#define COMMON_MALLOC_REALLOC(ptr, size)      \
-  if (cur_thread()->in_symbolizer)            \
-    return REAL(realloc)(ptr, size);          \
-  SCOPED_INTERCEPTOR_RAW(realloc, ptr, size); \
+#define COMMON_MALLOC_REALLOC(ptr, size)                              \
+  if (cur_thread()->in_symbolizer) return InternalRealloc(ptr, size); \
+  SCOPED_INTERCEPTOR_RAW(realloc, ptr, size);                         \
   void *p = user_realloc(thr, pc, ptr, size)
-#define COMMON_MALLOC_CALLOC(count, size)      \
-  if (cur_thread()->in_symbolizer)             \
-    return REAL(calloc)(count, size);          \
-  SCOPED_INTERCEPTOR_RAW(calloc, size, count); \
+#define COMMON_MALLOC_CALLOC(count, size)                              \
+  if (cur_thread()->in_symbolizer) return InternalCalloc(count, size); \
+  SCOPED_INTERCEPTOR_RAW(calloc, size, count);                         \
   void *p = user_calloc(thr, pc, size, count)
-#define COMMON_MALLOC_VALLOC(size)                          \
-  if (cur_thread()->in_symbolizer)                          \
-    return REAL(valloc)(size);                              \
-  SCOPED_INTERCEPTOR_RAW(valloc, size);                     \
+#define COMMON_MALLOC_VALLOC(size)                            \
+  if (cur_thread()->in_symbolizer)                            \
+    return InternalAlloc(size, nullptr, GetPageSizeCached()); \
+  SCOPED_INTERCEPTOR_RAW(valloc, size);                       \
   void *p = user_alloc(thr, pc, size, GetPageSizeCached())
-#define COMMON_MALLOC_FREE(ptr)      \
-  if (cur_thread()->in_symbolizer)   \
-    return REAL(free)(ptr);          \
-  SCOPED_INTERCEPTOR_RAW(free, ptr); \
+#define COMMON_MALLOC_FREE(ptr)                              \
+  if (cur_thread()->in_symbolizer) return InternalFree(ptr); \
+  SCOPED_INTERCEPTOR_RAW(free, ptr);                         \
   user_free(thr, pc, ptr)
-#define COMMON_MALLOC_SIZE(ptr) \
-  uptr size = user_alloc_usable_size(ptr);
+#define COMMON_MALLOC_SIZE(ptr) uptr size = user_alloc_usable_size(ptr);
 #define COMMON_MALLOC_FILL_STATS(zone, stats)
 #define COMMON_MALLOC_REPORT_UNKNOWN_REALLOC(ptr, zone_ptr, zone_name) \
   (void)zone_name; \
diff --git a/lib/tsan/rtl/tsan_mman.cc b/lib/tsan/rtl/tsan_mman.cc
index 7247c6e..7693077 100644
--- a/lib/tsan/rtl/tsan_mman.cc
+++ b/lib/tsan/rtl/tsan_mman.cc
@@ -63,18 +63,69 @@
   return reinterpret_cast<Allocator*>(&allocator_placeholder);
 }
 
+struct GlobalProc {
+  Mutex mtx;
+  Processor *proc;
+
+  GlobalProc()
+      : mtx(MutexTypeGlobalProc, StatMtxGlobalProc)
+      , proc(ProcCreate()) {
+  }
+};
+
+static char global_proc_placeholder[sizeof(GlobalProc)] ALIGNED(64);
+GlobalProc *global_proc() {
+  return reinterpret_cast<GlobalProc*>(&global_proc_placeholder);
+}
+
+ScopedGlobalProcessor::ScopedGlobalProcessor() {
+  GlobalProc *gp = global_proc();
+  ThreadState *thr = cur_thread();
+  if (thr->proc())
+    return;
+  // If we don't have a proc, use the global one.
+  // There are currently only two known case where this path is triggered:
+  //   __interceptor_free
+  //   __nptl_deallocate_tsd
+  //   start_thread
+  //   clone
+  // and:
+  //   ResetRange
+  //   __interceptor_munmap
+  //   __deallocate_stack
+  //   start_thread
+  //   clone
+  // Ideally, we destroy thread state (and unwire proc) when a thread actually
+  // exits (i.e. when we join/wait it). Then we would not need the global proc
+  gp->mtx.Lock();
+  ProcWire(gp->proc, thr);
+}
+
+ScopedGlobalProcessor::~ScopedGlobalProcessor() {
+  GlobalProc *gp = global_proc();
+  ThreadState *thr = cur_thread();
+  if (thr->proc() != gp->proc)
+    return;
+  ProcUnwire(gp->proc, thr);
+  gp->mtx.Unlock();
+}
+
 void InitializeAllocator() {
   allocator()->Init(common_flags()->allocator_may_return_null);
 }
 
-void AllocatorThreadStart(ThreadState *thr) {
-  allocator()->InitCache(&thr->alloc_cache);
-  internal_allocator()->InitCache(&thr->internal_alloc_cache);
+void InitializeAllocatorLate() {
+  new(global_proc()) GlobalProc();
 }
 
-void AllocatorThreadFinish(ThreadState *thr) {
-  allocator()->DestroyCache(&thr->alloc_cache);
-  internal_allocator()->DestroyCache(&thr->internal_alloc_cache);
+void AllocatorProcStart(Processor *proc) {
+  allocator()->InitCache(&proc->alloc_cache);
+  internal_allocator()->InitCache(&proc->internal_alloc_cache);
+}
+
+void AllocatorProcFinish(Processor *proc) {
+  allocator()->DestroyCache(&proc->alloc_cache);
+  internal_allocator()->DestroyCache(&proc->internal_alloc_cache);
 }
 
 void AllocatorPrintStats() {
@@ -98,7 +149,7 @@
 void *user_alloc(ThreadState *thr, uptr pc, uptr sz, uptr align, bool signal) {
   if ((sz >= (1ull << 40)) || (align >= (1ull << 40)))
     return allocator()->ReturnNullOrDie();
-  void *p = allocator()->Allocate(&thr->alloc_cache, sz, align);
+  void *p = allocator()->Allocate(&thr->proc()->alloc_cache, sz, align);
   if (p == 0)
     return 0;
   if (ctx && ctx->initialized)
@@ -118,9 +169,10 @@
 }
 
 void user_free(ThreadState *thr, uptr pc, void *p, bool signal) {
+  ScopedGlobalProcessor sgp;
   if (ctx && ctx->initialized)
     OnUserFree(thr, pc, (uptr)p, true);
-  allocator()->Deallocate(&thr->alloc_cache, p);
+  allocator()->Deallocate(&thr->proc()->alloc_cache, p);
   if (signal)
     SignalUnsafeCall(thr, pc);
 }
@@ -136,7 +188,7 @@
 
 void OnUserFree(ThreadState *thr, uptr pc, uptr p, bool write) {
   CHECK_NE(p, (void*)0);
-  uptr sz = ctx->metamap.FreeBlock(thr, pc, p);
+  uptr sz = ctx->metamap.FreeBlock(thr->proc(), p);
   DPrintf("#%d: free(%p, %zu)\n", thr->tid, p, sz);
   if (write && thr->ignore_reads_and_writes == 0)
     MemoryRangeFreed(thr, pc, (uptr)p, sz);
@@ -164,7 +216,11 @@
   if (p == 0)
     return 0;
   MBlock *b = ctx->metamap.GetBlock((uptr)p);
-  return b ? b->siz : 0;
+  if (!b)
+    return 0;  // Not a valid pointer.
+  if (b->siz == 0)
+    return 1;  // Zero-sized allocations are actually 1 byte.
+  return b->siz;
 }
 
 void invoke_malloc_hook(void *ptr, uptr size) {
@@ -172,6 +228,7 @@
   if (ctx == 0 || !ctx->initialized || thr->ignore_interceptors)
     return;
   __sanitizer_malloc_hook(ptr, size);
+  RunMallocHooks(ptr, size);
 }
 
 void invoke_free_hook(void *ptr) {
@@ -179,6 +236,7 @@
   if (ctx == 0 || !ctx->initialized || thr->ignore_interceptors)
     return;
   __sanitizer_free_hook(ptr);
+  RunFreeHooks(ptr);
 }
 
 void *internal_alloc(MBlockType typ, uptr sz) {
@@ -187,7 +245,7 @@
     thr->nomalloc = 0;  // CHECK calls internal_malloc().
     CHECK(0);
   }
-  return InternalAlloc(sz, &thr->internal_alloc_cache);
+  return InternalAlloc(sz, &thr->proc()->internal_alloc_cache);
 }
 
 void internal_free(void *p) {
@@ -196,7 +254,7 @@
     thr->nomalloc = 0;  // CHECK calls internal_malloc().
     CHECK(0);
   }
-  InternalFree(p, &thr->internal_alloc_cache);
+  InternalFree(p, &thr->proc()->internal_alloc_cache);
 }
 
 }  // namespace __tsan
@@ -238,8 +296,8 @@
 
 void __tsan_on_thread_idle() {
   ThreadState *thr = cur_thread();
-  allocator()->SwallowCache(&thr->alloc_cache);
-  internal_allocator()->SwallowCache(&thr->internal_alloc_cache);
-  ctx->metamap.OnThreadIdle(thr);
+  allocator()->SwallowCache(&thr->proc()->alloc_cache);
+  internal_allocator()->SwallowCache(&thr->proc()->internal_alloc_cache);
+  ctx->metamap.OnProcIdle(thr->proc());
 }
 }  // extern "C"
diff --git a/lib/tsan/rtl/tsan_mman.h b/lib/tsan/rtl/tsan_mman.h
index b419b58..8cdeeb3 100644
--- a/lib/tsan/rtl/tsan_mman.h
+++ b/lib/tsan/rtl/tsan_mman.h
@@ -20,9 +20,10 @@
 const uptr kDefaultAlignment = 16;
 
 void InitializeAllocator();
+void InitializeAllocatorLate();
 void ReplaceSystemMalloc();
-void AllocatorThreadStart(ThreadState *thr);
-void AllocatorThreadFinish(ThreadState *thr);
+void AllocatorProcStart(Processor *proc);
+void AllocatorProcFinish(Processor *proc);
 void AllocatorPrintStats();
 
 // For user allocations.
diff --git a/lib/tsan/rtl/tsan_mutex.cc b/lib/tsan/rtl/tsan_mutex.cc
index 9dd2480..22afefc 100644
--- a/lib/tsan/rtl/tsan_mutex.cc
+++ b/lib/tsan/rtl/tsan_mutex.cc
@@ -43,6 +43,7 @@
   /*11 MutexTypeDDetector*/   {},
   /*12 MutexTypeFired*/       {MutexTypeLeaf},
   /*13 MutexTypeRacy*/        {MutexTypeLeaf},
+  /*14 MutexTypeGlobalProc*/  {},
 };
 
 static bool CanLockAdj[MutexTypeCount][MutexTypeCount];
diff --git a/lib/tsan/rtl/tsan_mutex.h b/lib/tsan/rtl/tsan_mutex.h
index 27f5538..22ee2f3 100644
--- a/lib/tsan/rtl/tsan_mutex.h
+++ b/lib/tsan/rtl/tsan_mutex.h
@@ -34,6 +34,7 @@
   MutexTypeDDetector,
   MutexTypeFired,
   MutexTypeRacy,
+  MutexTypeGlobalProc,
 
   // This must be the last.
   MutexTypeCount
diff --git a/lib/tsan/rtl/tsan_new_delete.cc b/lib/tsan/rtl/tsan_new_delete.cc
index ebb422c..b6478bb 100644
--- a/lib/tsan/rtl/tsan_new_delete.cc
+++ b/lib/tsan/rtl/tsan_new_delete.cc
@@ -23,14 +23,10 @@
 
 DECLARE_REAL(void *, malloc, uptr size)
 DECLARE_REAL(void, free, void *ptr)
-#if SANITIZER_MAC || SANITIZER_ANDROID
-#define __libc_malloc REAL(malloc)
-#define __libc_free REAL(free)
-#endif
 
 #define OPERATOR_NEW_BODY(mangled_name) \
   if (cur_thread()->in_symbolizer) \
-    return __libc_malloc(size); \
+    return InternalAlloc(size); \
   void *p = 0; \
   {  \
     SCOPED_INTERCEPTOR_RAW(mangled_name, size); \
@@ -66,7 +62,7 @@
 #define OPERATOR_DELETE_BODY(mangled_name) \
   if (ptr == 0) return;  \
   if (cur_thread()->in_symbolizer) \
-    return __libc_free(ptr); \
+    return InternalFree(ptr); \
   invoke_free_hook(ptr);  \
   SCOPED_INTERCEPTOR_RAW(mangled_name, ptr);  \
   user_free(thr, pc, ptr);
diff --git a/lib/tsan/rtl/tsan_platform.h b/lib/tsan/rtl/tsan_platform.h
index c2b4871..2bd6637 100644
--- a/lib/tsan/rtl/tsan_platform.h
+++ b/lib/tsan/rtl/tsan_platform.h
@@ -297,7 +297,7 @@
   static const uptr kShadowEnd     = 0x050000000000ull;
   static const uptr kAppMemBeg     = 0x000000001000ull;
   static const uptr kAppMemEnd     = 0x00e000000000ull;
-}
+};
 
 #else
 # error "Unknown platform"
@@ -587,7 +587,11 @@
   return (((x) & ~(Mapping::kAppMemMsk | (kShadowCell - 1)))
       ^ Mapping::kAppMemXor) * kShadowCnt;
 #else
+# ifndef SANITIZER_WINDOWS
   return ((x & ~(kShadowCell - 1)) * kShadowCnt) | Mapping::kShadowBeg;
+# else
+  return ((x & ~(kShadowCell - 1)) * kShadowCnt) + Mapping::kShadowBeg;
+# endif
 #endif
 }
 
@@ -662,7 +666,6 @@
 # ifndef SANITIZER_WINDOWS
   return (s & ~Mapping::kShadowBeg) / kShadowCnt;
 # else
-  // FIXME(dvyukov): this is most likely wrong as the mapping is not bijection.
   return (s - Mapping::kShadowBeg) / kShadowCnt;
 # endif // SANITIZER_WINDOWS
 #endif
@@ -754,10 +757,6 @@
 void InitializeShadowMemoryPlatform();
 void FlushShadowMemory();
 void WriteMemoryProfile(char *buf, uptr buf_size, uptr nthread, uptr nlive);
-
-// Says whether the addr relates to a global var.
-// Guesses with high probability, may yield both false positives and negatives.
-bool IsGlobalVar(uptr addr);
 int ExtractResolvFDs(void *state, int *fds, int nfd);
 int ExtractRecvmsgFDs(void *msg, int *fds, int nfd);
 
diff --git a/lib/tsan/rtl/tsan_platform_linux.cc b/lib/tsan/rtl/tsan_platform_linux.cc
index 6602561..d7182fd 100644
--- a/lib/tsan/rtl/tsan_platform_linux.cc
+++ b/lib/tsan/rtl/tsan_platform_linux.cc
@@ -18,6 +18,8 @@
 
 #include "sanitizer_common/sanitizer_common.h"
 #include "sanitizer_common/sanitizer_libc.h"
+#include "sanitizer_common/sanitizer_linux.h"
+#include "sanitizer_common/sanitizer_platform_limits_posix.h"
 #include "sanitizer_common/sanitizer_posix.h"
 #include "sanitizer_common/sanitizer_procmaps.h"
 #include "sanitizer_common/sanitizer_stoptheworld.h"
@@ -34,6 +36,9 @@
 #include <string.h>
 #include <stdarg.h>
 #include <sys/mman.h>
+#if SANITIZER_LINUX
+#include <sys/personality.h>
+#endif
 #include <sys/syscall.h>
 #include <sys/socket.h>
 #include <sys/time.h>
@@ -64,9 +69,6 @@
 
 namespace __tsan {
 
-static uptr g_data_start;
-static uptr g_data_end;
-
 #ifdef TSAN_RUNTIME_VMA
 // Runtime detected VMA size.
 uptr vmaSize;
@@ -199,46 +201,6 @@
   MapRodata();
 }
 
-static void InitDataSeg() {
-  MemoryMappingLayout proc_maps(true);
-  uptr start, end, offset;
-  char name[128];
-#if SANITIZER_FREEBSD
-  // On FreeBSD BSS is usually the last block allocated within the
-  // low range and heap is the last block allocated within the range
-  // 0x800000000-0x8ffffffff.
-  while (proc_maps.Next(&start, &end, &offset, name, ARRAY_SIZE(name),
-                        /*protection*/ 0)) {
-    DPrintf("%p-%p %p %s\n", start, end, offset, name);
-    if ((start & 0xffff00000000ULL) == 0 && (end & 0xffff00000000ULL) == 0 &&
-        name[0] == '\0') {
-      g_data_start = start;
-      g_data_end = end;
-    }
-  }
-#else
-  bool prev_is_data = false;
-  while (proc_maps.Next(&start, &end, &offset, name, ARRAY_SIZE(name),
-                        /*protection*/ 0)) {
-    DPrintf("%p-%p %p %s\n", start, end, offset, name);
-    bool is_data = offset != 0 && name[0] != 0;
-    // BSS may get merged with [heap] in /proc/self/maps. This is not very
-    // reliable.
-    bool is_bss = offset == 0 &&
-      (name[0] == 0 || internal_strcmp(name, "[heap]") == 0) && prev_is_data;
-    if (g_data_start == 0 && is_data)
-      g_data_start = start;
-    if (is_bss)
-      g_data_end = end;
-    prev_is_data = is_data;
-  }
-#endif
-  DPrintf("guessed data_start=%p data_end=%p\n",  g_data_start, g_data_end);
-  CHECK_LT(g_data_start, g_data_end);
-  CHECK_GE((uptr)&g_data_start, g_data_start);
-  CHECK_LT((uptr)&g_data_start, g_data_end);
-}
-
 #endif  // #ifndef SANITIZER_GO
 
 void InitializePlatformEarly() {
@@ -289,6 +251,20 @@
       SetAddressSpaceUnlimited();
       reexec = true;
     }
+#if SANITIZER_LINUX && defined(__aarch64__)
+    // After patch "arm64: mm: support ARCH_MMAP_RND_BITS." is introduced in
+    // linux kernel, the random gap between stack and mapped area is increased
+    // from 128M to 36G on 39-bit aarch64. As it is almost impossible to cover
+    // this big range, we should disable randomized virtual space on aarch64.
+    int old_personality = personality(0xffffffff);
+    if (old_personality != -1 && (old_personality & ADDR_NO_RANDOMIZE) == 0) {
+      VReport(1, "WARNING: Program is run with randomized virtual address "
+              "space, which wouldn't work with ThreadSanitizer.\n"
+              "Re-execing with fixed virtual address space.\n");
+      CHECK_NE(personality(old_personality | ADDR_NO_RANDOMIZE), -1);
+      reexec = true;
+    }
+#endif
     if (reexec)
       ReExec();
   }
@@ -296,14 +272,9 @@
 #ifndef SANITIZER_GO
   CheckAndProtect();
   InitTlsSize();
-  InitDataSeg();
 #endif
 }
 
-bool IsGlobalVar(uptr addr) {
-  return g_data_start && addr >= g_data_start && addr < g_data_end;
-}
-
 #ifndef SANITIZER_GO
 // Extract file descriptors passed to glibc internal __res_iclose function.
 // This is required to properly "close" the fds, because we do not see internal
@@ -361,6 +332,69 @@
 void ReplaceSystemMalloc() { }
 #endif
 
+#ifndef SANITIZER_GO
+#if SANITIZER_ANDROID
+
+#if defined(__aarch64__)
+# define __get_tls() \
+    ({ void** __val; __asm__("mrs %0, tpidr_el0" : "=r"(__val)); __val; })
+#elif defined(__x86_64__)
+# define __get_tls() \
+    ({ void** __val; __asm__("mov %%fs:0, %0" : "=r"(__val)); __val; })
+#else
+#error unsupported architecture
+#endif
+
+// On Android, __thread is not supported. So we store the pointer to ThreadState
+// in TLS_SLOT_TSAN, which is the tls slot allocated by Android bionic for tsan.
+static const int TLS_SLOT_TSAN = 8;
+// On Android, one thread can call intercepted functions after
+// DestroyThreadState(), so add a fake thread state for "dead" threads.
+static ThreadState *dead_thread_state = nullptr;
+
+ThreadState *cur_thread() {
+  ThreadState* thr = (ThreadState*)__get_tls()[TLS_SLOT_TSAN];
+  if (thr == nullptr) {
+    __sanitizer_sigset_t emptyset;
+    internal_sigfillset(&emptyset);
+    __sanitizer_sigset_t oldset;
+    CHECK_EQ(0, internal_sigprocmask(SIG_SETMASK, &emptyset, &oldset));
+    thr = reinterpret_cast<ThreadState*>(__get_tls()[TLS_SLOT_TSAN]);
+    if (thr == nullptr) {
+      thr = reinterpret_cast<ThreadState*>(MmapOrDie(sizeof(ThreadState),
+                                                     "ThreadState"));
+      __get_tls()[TLS_SLOT_TSAN] = thr;
+      if (dead_thread_state == nullptr) {
+        dead_thread_state = reinterpret_cast<ThreadState*>(
+            MmapOrDie(sizeof(ThreadState), "ThreadState"));
+        dead_thread_state->fast_state.SetIgnoreBit();
+        dead_thread_state->ignore_interceptors = 1;
+        dead_thread_state->is_dead = true;
+        *const_cast<int*>(&dead_thread_state->tid) = -1;
+        CHECK_EQ(0, internal_mprotect(dead_thread_state, sizeof(ThreadState),
+                                      PROT_READ));
+      }
+    }
+    CHECK_EQ(0, internal_sigprocmask(SIG_SETMASK, &oldset, nullptr));
+  }
+  return thr;
+}
+
+void cur_thread_finalize() {
+  __sanitizer_sigset_t emptyset;
+  internal_sigfillset(&emptyset);
+  __sanitizer_sigset_t oldset;
+  CHECK_EQ(0, internal_sigprocmask(SIG_SETMASK, &emptyset, &oldset));
+  ThreadState* thr = (ThreadState*)__get_tls()[TLS_SLOT_TSAN];
+  if (thr != dead_thread_state) {
+    __get_tls()[TLS_SLOT_TSAN] = dead_thread_state;
+    UnmapOrDie(thr, sizeof(ThreadState));
+  }
+  CHECK_EQ(0, internal_sigprocmask(SIG_SETMASK, &oldset, nullptr));
+}
+#endif  // SANITIZER_ANDROID
+#endif  // ifndef SANITIZER_GO
+
 }  // namespace __tsan
 
 #endif  // SANITIZER_LINUX || SANITIZER_FREEBSD
diff --git a/lib/tsan/rtl/tsan_platform_mac.cc b/lib/tsan/rtl/tsan_platform_mac.cc
index 31caf37..0cc02ab 100644
--- a/lib/tsan/rtl/tsan_platform_mac.cc
+++ b/lib/tsan/rtl/tsan_platform_mac.cc
@@ -67,20 +67,18 @@
 // when TLVs are not accessible (early process startup, thread cleanup, ...).
 // The following provides a "poor man's TLV" implementation, where we use the
 // shadow memory of the pointer returned by pthread_self() to store a pointer to
-// the ThreadState object. The main thread's ThreadState pointer is stored
-// separately in a static variable, because we need to access it even before the
+// the ThreadState object. The main thread's ThreadState is stored separately
+// in a static variable, because we need to access it even before the
 // shadow memory is set up.
 static uptr main_thread_identity = 0;
-static ThreadState *main_thread_state = nullptr;
+ALIGNED(64) static char main_thread_state[sizeof(ThreadState)];
 
 ThreadState *cur_thread() {
-  ThreadState **fake_tls;
   uptr thread_identity = (uptr)pthread_self();
   if (thread_identity == main_thread_identity || main_thread_identity == 0) {
-    fake_tls = &main_thread_state;
-  } else {
-    fake_tls = (ThreadState **)MemToShadow(thread_identity);
+    return (ThreadState *)&main_thread_state;
   }
+  ThreadState **fake_tls = (ThreadState **)MemToShadow(thread_identity);
   ThreadState *thr = (ThreadState *)SignalSafeGetOrAllocate(
       (uptr *)fake_tls, sizeof(ThreadState));
   return thr;
@@ -91,7 +89,11 @@
 // handler will try to access the unmapped ThreadState.
 void cur_thread_finalize() {
   uptr thread_identity = (uptr)pthread_self();
-  CHECK_NE(thread_identity, main_thread_identity);
+  if (thread_identity == main_thread_identity) {
+    // Calling dispatch_main() or xpc_main() actually invokes pthread_exit to
+    // exit the main thread. Let's keep the main thread's ThreadState.
+    return;
+  }
   ThreadState **fake_tls = (ThreadState **)MemToShadow(thread_identity);
   internal_munmap(*fake_tls, sizeof(ThreadState));
   *fake_tls = nullptr;
@@ -131,10 +133,12 @@
   if (event == PTHREAD_INTROSPECTION_THREAD_CREATE) {
     if (thread == pthread_self()) {
       // The current thread is a newly created GCD worker thread.
+      ThreadState *thr = cur_thread();
+      Processor *proc = ProcCreate();
+      ProcWire(proc, thr);
       ThreadState *parent_thread_state = nullptr;  // No parent.
       int tid = ThreadCreate(parent_thread_state, 0, (uptr)thread, true);
       CHECK_NE(tid, 0);
-      ThreadState *thr = cur_thread();
       ThreadStart(thr, tid, GetTid());
     }
   } else if (event == PTHREAD_INTROSPECTION_THREAD_TERMINATE) {
@@ -183,10 +187,6 @@
 }
 #endif
 
-bool IsGlobalVar(uptr addr) {
-  return false;
-}
-
 }  // namespace __tsan
 
 #endif  // SANITIZER_MAC
diff --git a/lib/tsan/rtl/tsan_platform_posix.cc b/lib/tsan/rtl/tsan_platform_posix.cc
index 90476cb..805ce1b 100644
--- a/lib/tsan/rtl/tsan_platform_posix.cc
+++ b/lib/tsan/rtl/tsan_platform_posix.cc
@@ -105,7 +105,7 @@
   CHECK_LE(beg, end);
   if (beg == end)
     return;
-  if (beg != (uptr)MmapNoAccess(beg, end - beg)) {
+  if (beg != (uptr)MmapFixedNoAccess(beg, end - beg)) {
     Printf("FATAL: ThreadSanitizer can not protect [%zx,%zx]\n", beg, end);
     Printf("FATAL: Make sure you are not using unlimited stack\n");
     Die();
diff --git a/lib/tsan/rtl/tsan_preinit.cc b/lib/tsan/rtl/tsan_preinit.cc
new file mode 100644
index 0000000..a96618d
--- /dev/null
+++ b/lib/tsan/rtl/tsan_preinit.cc
@@ -0,0 +1,27 @@
+//===-- tsan_preinit.cc ---------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of ThreadSanitizer.
+//
+// Call __tsan_init at the very early stage of process startup.
+//===----------------------------------------------------------------------===//
+
+#include "sanitizer_common/sanitizer_internal_defs.h"
+#include "tsan_interface.h"
+
+#if SANITIZER_CAN_USE_PREINIT_ARRAY
+
+// The symbol is called __local_tsan_preinit, because it's not intended to be
+// exported.
+// This code linked into the main executable when -fsanitize=thread is in
+// the link flags. It can only use exported interface functions.
+__attribute__((section(".preinit_array"), used))
+void (*__local_tsan_preinit)(void) = __tsan_init;
+
+#endif
diff --git a/lib/tsan/rtl/tsan_report.cc b/lib/tsan/rtl/tsan_report.cc
index c1d2fd0..9360494 100644
--- a/lib/tsan/rtl/tsan_report.cc
+++ b/lib/tsan/rtl/tsan_report.cc
@@ -96,6 +96,8 @@
     return "destroy of a locked mutex";
   if (typ == ReportTypeMutexDoubleLock)
     return "double lock of a mutex";
+  if (typ == ReportTypeMutexInvalidAccess)
+    return "use of an invalid mutex (e.g. uninitialized or destroyed)";
   if (typ == ReportTypeMutexBadUnlock)
     return "unlock of an unlocked mutex (or by a wrong thread)";
   if (typ == ReportTypeMutexBadReadLock)
@@ -234,7 +236,7 @@
     Printf(" '%s'", rt->name);
   char thrbuf[kThreadBufSize];
   Printf(" (tid=%zu, %s) created by %s",
-    rt->pid, rt->running ? "running" : "finished",
+    rt->os_id, rt->running ? "running" : "finished",
     thread_name(thrbuf, rt->parent_tid));
   if (rt->stack)
     Printf(" at:");
@@ -379,9 +381,9 @@
 
 static void PrintMop(const ReportMop *mop, bool first) {
   Printf("\n");
-  Printf("%s by ",
+  Printf("%s at %p by ",
       (first ? (mop->write ? "Write" : "Read")
-             : (mop->write ? "Previous write" : "Previous read")));
+             : (mop->write ? "Previous write" : "Previous read")), mop->addr);
   if (mop->tid == kMainThreadId)
     Printf("main goroutine:\n");
   else
@@ -389,6 +391,31 @@
   PrintStack(mop->stack);
 }
 
+static void PrintLocation(const ReportLocation *loc) {
+  switch (loc->type) {
+  case ReportLocationHeap: {
+    Printf("\n");
+    Printf("Heap block of size %zu at %p allocated by ",
+        loc->heap_chunk_size, loc->heap_chunk_start);
+    if (loc->tid == kMainThreadId)
+      Printf("main goroutine:\n");
+    else
+      Printf("goroutine %d:\n", loc->tid);
+    PrintStack(loc->stack);
+    break;
+  }
+  case ReportLocationGlobal: {
+    Printf("\n");
+    Printf("Global var %s of size %zu at %p declared at %s:%zu\n",
+        loc->global.name, loc->global.size, loc->global.start,
+        loc->global.file, loc->global.line);
+    break;
+  }
+  default:
+    break;
+  }
+}
+
 static void PrintThread(const ReportThread *rt) {
   if (rt->id == kMainThreadId)
     return;
@@ -404,6 +431,8 @@
     Printf("WARNING: DATA RACE");
     for (uptr i = 0; i < rep->mops.Size(); i++)
       PrintMop(rep->mops[i], i == 0);
+    for (uptr i = 0; i < rep->locs.Size(); i++)
+      PrintLocation(rep->locs[i]);
     for (uptr i = 0; i < rep->threads.Size(); i++)
       PrintThread(rep->threads[i]);
   } else if (rep->typ == ReportTypeDeadlock) {
diff --git a/lib/tsan/rtl/tsan_report.h b/lib/tsan/rtl/tsan_report.h
index 3e344a0..d0b9d74 100644
--- a/lib/tsan/rtl/tsan_report.h
+++ b/lib/tsan/rtl/tsan_report.h
@@ -27,6 +27,7 @@
   ReportTypeThreadLeak,
   ReportTypeMutexDestroyLocked,
   ReportTypeMutexDoubleLock,
+  ReportTypeMutexInvalidAccess,
   ReportTypeMutexBadUnlock,
   ReportTypeMutexBadReadLock,
   ReportTypeMutexBadReadUnlock,
@@ -86,7 +87,7 @@
 
 struct ReportThread {
   int id;
-  uptr pid;
+  uptr os_id;
   bool running;
   char *name;
   int parent_tid;
diff --git a/lib/tsan/rtl/tsan_rtl.cc b/lib/tsan/rtl/tsan_rtl.cc
index 4df4db5..629871e 100644
--- a/lib/tsan/rtl/tsan_rtl.cc
+++ b/lib/tsan/rtl/tsan_rtl.cc
@@ -321,6 +321,7 @@
   const char *options = GetEnv(kTsanOptionsEnv);
   CacheBinaryName();
   InitializeFlags(&ctx->flags, options);
+  AvoidCVE_2016_2143();
   InitializePlatformEarly();
 #ifndef SANITIZER_GO
   // Re-exec ourselves if we need to set additional env or command line args.
@@ -329,6 +330,10 @@
   InitializeAllocator();
   ReplaceSystemMalloc();
 #endif
+  if (common_flags()->detect_deadlocks)
+    ctx->dd = DDetector::Create(flags());
+  Processor *proc = ProcCreate();
+  ProcWire(proc, thr);
   InitializeInterceptors();
   CheckShadowMapping();
   InitializePlatform();
@@ -336,6 +341,7 @@
   InitializeDynamicAnnotations();
 #ifndef SANITIZER_GO
   InitializeShadowMemory();
+  InitializeAllocatorLate();
 #endif
   // Setup correct file descriptor for error reports.
   __sanitizer_set_report_path(common_flags()->log_path);
@@ -351,8 +357,6 @@
   SetSandboxingCallback(StopBackgroundThread);
 #endif
 #endif
-  if (common_flags()->detect_deadlocks)
-    ctx->dd = DDetector::Create(flags());
 
   VPrintf(1, "***** Running under ThreadSanitizer v2 (pid %d) *****\n",
           (int)internal_getpid());
@@ -366,6 +370,10 @@
 #endif
   ctx->initialized = true;
 
+#ifndef SANITIZER_GO
+  Symbolizer::LateInitialize();
+#endif
+
   if (flags()->stop_on_start) {
     Printf("ThreadSanitizer is suspended at startup (pid %d)."
            " Call __tsan_resume().\n",
diff --git a/lib/tsan/rtl/tsan_rtl.h b/lib/tsan/rtl/tsan_rtl.h
index 04104b1..ff69015 100644
--- a/lib/tsan/rtl/tsan_rtl.h
+++ b/lib/tsan/rtl/tsan_rtl.h
@@ -325,6 +325,36 @@
   uptr *shadow_stack_pos;
 };
 
+// A Processor represents a physical thread, or a P for Go.
+// It is used to store internal resources like allocate cache, and does not
+// participate in race-detection logic (invisible to end user).
+// In C++ it is tied to an OS thread just like ThreadState, however ideally
+// it should be tied to a CPU (this way we will have fewer allocator caches).
+// In Go it is tied to a P, so there are significantly fewer Processor's than
+// ThreadState's (which are tied to Gs).
+// A ThreadState must be wired with a Processor to handle events.
+struct Processor {
+  ThreadState *thr; // currently wired thread, or nullptr
+#ifndef SANITIZER_GO
+  AllocatorCache alloc_cache;
+  InternalAllocatorCache internal_alloc_cache;
+#endif
+  DenseSlabAllocCache block_cache;
+  DenseSlabAllocCache sync_cache;
+  DenseSlabAllocCache clock_cache;
+  DDPhysicalThread *dd_pt;
+};
+
+#ifndef SANITIZER_GO
+// ScopedGlobalProcessor temporary setups a global processor for the current
+// thread, if it does not have one. Intended for interceptors that can run
+// at the very thread end, when we already destroyed the thread processor.
+struct ScopedGlobalProcessor {
+  ScopedGlobalProcessor();
+  ~ScopedGlobalProcessor();
+};
+#endif
+
 // This struct is stored in TLS.
 struct ThreadState {
   FastState fast_state;
@@ -360,8 +390,6 @@
   MutexSet mset;
   ThreadClock clock;
 #ifndef SANITIZER_GO
-  AllocatorCache alloc_cache;
-  InternalAllocatorCache internal_alloc_cache;
   Vector<JmpBuf> jmp_bufs;
   int ignore_interceptors;
 #endif
@@ -385,16 +413,19 @@
 #if SANITIZER_DEBUG && !SANITIZER_GO
   InternalDeadlockDetector internal_deadlock_detector;
 #endif
-  DDPhysicalThread *dd_pt;
   DDLogicalThread *dd_lt;
 
+  // Current wired Processor, or nullptr. Required to handle any events.
+  Processor *proc1;
+#ifndef SANITIZER_GO
+  Processor *proc() { return proc1; }
+#else
+  Processor *proc();
+#endif
+
   atomic_uintptr_t in_signal_handler;
   ThreadSignalContext *signal_ctx;
 
-  DenseSlabAllocCache block_cache;
-  DenseSlabAllocCache sync_cache;
-  DenseSlabAllocCache clock_cache;
-
 #ifndef SANITIZER_GO
   u32 last_sleep_stack_id;
   ThreadClock last_sleep_clock;
@@ -404,6 +435,8 @@
   // If set, malloc must not be called.
   int nomalloc;
 
+  const ReportDesc *current_report;
+
   explicit ThreadState(Context *ctx, int tid, int unique_id, u64 epoch,
                        unsigned reuse_count,
                        uptr stk_addr, uptr stk_size,
@@ -411,7 +444,7 @@
 };
 
 #ifndef SANITIZER_GO
-#if SANITIZER_MAC
+#if SANITIZER_MAC || SANITIZER_ANDROID
 ThreadState *cur_thread();
 void cur_thread_finalize();
 #else
@@ -421,7 +454,7 @@
   return reinterpret_cast<ThreadState *>(&cur_thread_placeholder);
 }
 INLINE void cur_thread_finalize() { }
-#endif  // SANITIZER_MAC
+#endif  // SANITIZER_MAC || SANITIZER_ANDROID
 #endif  // SANITIZER_GO
 
 class ThreadContext : public ThreadContextBase {
@@ -683,6 +716,11 @@
 int ThreadCount(ThreadState *thr);
 void ProcessPendingSignals(ThreadState *thr);
 
+Processor *ProcCreate();
+void ProcDestroy(Processor *proc);
+void ProcWire(Processor *proc, ThreadState *thr);
+void ProcUnwire(Processor *proc, ThreadState *thr);
+
 void MutexCreate(ThreadState *thr, uptr pc, uptr addr,
                  bool rw, bool recursive, bool linker_init);
 void MutexDestroy(ThreadState *thr, uptr pc, uptr addr);
@@ -693,6 +731,7 @@
 void MutexReadUnlock(ThreadState *thr, uptr pc, uptr addr);
 void MutexReadOrWriteUnlock(ThreadState *thr, uptr pc, uptr addr);
 void MutexRepair(ThreadState *thr, uptr pc, uptr addr);  // call on EOWNERDEAD
+void MutexInvalidAccess(ThreadState *thr, uptr pc, uptr addr);
 
 void Acquire(ThreadState *thr, uptr pc, uptr addr);
 // AcquireGlobal synchronizes the current thread with all other threads.
diff --git a/lib/tsan/rtl/tsan_rtl_mutex.cc b/lib/tsan/rtl/tsan_rtl_mutex.cc
index 62ab7aa..1806acf 100644
--- a/lib/tsan/rtl/tsan_rtl_mutex.cc
+++ b/lib/tsan/rtl/tsan_rtl_mutex.cc
@@ -32,7 +32,7 @@
   Callback(ThreadState *thr, uptr pc)
       : thr(thr)
       , pc(pc) {
-    DDCallback::pt = thr->dd_pt;
+    DDCallback::pt = thr->proc()->dd_pt;
     DDCallback::lt = thr->dd_lt;
   }
 
@@ -84,21 +84,14 @@
 void MutexDestroy(ThreadState *thr, uptr pc, uptr addr) {
   DPrintf("#%d: MutexDestroy %zx\n", thr->tid, addr);
   StatInc(thr, StatMutexDestroy);
-#ifndef SANITIZER_GO
-  // Global mutexes not marked as LINKER_INITIALIZED
-  // cause tons of not interesting reports, so just ignore it.
-  if (IsGlobalVar(addr))
-    return;
-#endif
-  if (IsAppMem(addr)) {
-    CHECK(!thr->is_freeing);
-    thr->is_freeing = true;
-    MemoryWrite(thr, pc, addr, kSizeLog1);
-    thr->is_freeing = false;
-  }
-  SyncVar *s = ctx->metamap.GetIfExistsAndLock(addr);
+  SyncVar *s = ctx->metamap.GetIfExistsAndLock(addr, true);
   if (s == 0)
     return;
+  if (s->is_linker_init) {
+    // Destroy is no-op for linker-initialized mutexes.
+    s->mtx.Unlock();
+    return;
+  }
   if (common_flags()->detect_deadlocks) {
     Callback cb(thr, pc);
     ctx->dd->MutexDestroy(&cb, &s->dd);
@@ -114,7 +107,7 @@
   u64 mid = s->GetId();
   u32 last_lock = s->last_lock;
   if (!unlock_locked)
-    s->Reset(thr);  // must not reset it before the report is printed
+    s->Reset(thr->proc());  // must not reset it before the report is printed
   s->mtx.Unlock();
   if (unlock_locked) {
     ThreadRegistryLock l(ctx->thread_registry);
@@ -128,15 +121,23 @@
     rep.AddStack(trace, true);
     rep.AddLocation(addr, 1);
     OutputReport(thr, rep);
-  }
-  if (unlock_locked) {
-    SyncVar *s = ctx->metamap.GetIfExistsAndLock(addr);
+
+    SyncVar *s = ctx->metamap.GetIfExistsAndLock(addr, true);
     if (s != 0) {
-      s->Reset(thr);
+      s->Reset(thr->proc());
       s->mtx.Unlock();
     }
   }
   thr->mset.Remove(mid);
+  // Imitate a memory write to catch unlock-destroy races.
+  // Do this outside of sync mutex, because it can report a race which locks
+  // sync mutexes.
+  if (IsAppMem(addr)) {
+    CHECK(!thr->is_freeing);
+    thr->is_freeing = true;
+    MemoryWrite(thr, pc, addr, kSizeLog1);
+    thr->is_freeing = false;
+  }
   // s will be destroyed and freed in MetaMap::FreeBlock.
 }
 
@@ -350,11 +351,21 @@
   s->mtx.Unlock();
 }
 
+void MutexInvalidAccess(ThreadState *thr, uptr pc, uptr addr) {
+  DPrintf("#%d: MutexInvalidAccess %zx\n", thr->tid, addr);
+  SyncVar *s = ctx->metamap.GetOrCreateAndLock(thr, pc, addr, true);
+  u64 mid = s->GetId();
+  s->mtx.Unlock();
+  ReportMutexMisuse(thr, pc, ReportTypeMutexInvalidAccess, addr, mid);
+}
+
 void Acquire(ThreadState *thr, uptr pc, uptr addr) {
   DPrintf("#%d: Acquire %zx\n", thr->tid, addr);
   if (thr->ignore_sync)
     return;
-  SyncVar *s = ctx->metamap.GetOrCreateAndLock(thr, pc, addr, false);
+  SyncVar *s = ctx->metamap.GetIfExistsAndLock(addr, false);
+  if (!s)
+    return;
   AcquireImpl(thr, pc, &s->clock);
   s->mtx.ReadUnlock();
 }
@@ -426,7 +437,7 @@
   if (thr->ignore_sync)
     return;
   thr->clock.set(thr->fast_state.epoch());
-  thr->clock.acquire(&thr->clock_cache, c);
+  thr->clock.acquire(&thr->proc()->clock_cache, c);
   StatInc(thr, StatSyncAcquire);
 }
 
@@ -435,7 +446,7 @@
     return;
   thr->clock.set(thr->fast_state.epoch());
   thr->fast_synch_epoch = thr->fast_state.epoch();
-  thr->clock.release(&thr->clock_cache, c);
+  thr->clock.release(&thr->proc()->clock_cache, c);
   StatInc(thr, StatSyncRelease);
 }
 
@@ -444,7 +455,7 @@
     return;
   thr->clock.set(thr->fast_state.epoch());
   thr->fast_synch_epoch = thr->fast_state.epoch();
-  thr->clock.ReleaseStore(&thr->clock_cache, c);
+  thr->clock.ReleaseStore(&thr->proc()->clock_cache, c);
   StatInc(thr, StatSyncRelease);
 }
 
@@ -453,7 +464,7 @@
     return;
   thr->clock.set(thr->fast_state.epoch());
   thr->fast_synch_epoch = thr->fast_state.epoch();
-  thr->clock.acq_rel(&thr->clock_cache, c);
+  thr->clock.acq_rel(&thr->proc()->clock_cache, c);
   StatInc(thr, StatSyncAcquire);
   StatInc(thr, StatSyncRelease);
 }
diff --git a/lib/tsan/rtl/tsan_rtl_proc.cc b/lib/tsan/rtl/tsan_rtl_proc.cc
new file mode 100644
index 0000000..0c838a1
--- /dev/null
+++ b/lib/tsan/rtl/tsan_rtl_proc.cc
@@ -0,0 +1,61 @@
+//===-- tsan_rtl_proc.cc ------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of ThreadSanitizer (TSan), a race detector.
+//
+//===----------------------------------------------------------------------===//
+
+#include "sanitizer_common/sanitizer_placement_new.h"
+#include "tsan_rtl.h"
+#include "tsan_mman.h"
+#include "tsan_flags.h"
+
+namespace __tsan {
+
+Processor *ProcCreate() {
+  void *mem = InternalAlloc(sizeof(Processor));
+  internal_memset(mem, 0, sizeof(Processor));
+  Processor *proc = new(mem) Processor;
+  proc->thr = nullptr;
+#ifndef SANITIZER_GO
+  AllocatorProcStart(proc);
+#endif
+  if (common_flags()->detect_deadlocks)
+    proc->dd_pt = ctx->dd->CreatePhysicalThread();
+  return proc;
+}
+
+void ProcDestroy(Processor *proc) {
+  CHECK_EQ(proc->thr, nullptr);
+#ifndef SANITIZER_GO
+  AllocatorProcFinish(proc);
+#endif
+  ctx->clock_alloc.FlushCache(&proc->clock_cache);
+  ctx->metamap.OnProcIdle(proc);
+  if (common_flags()->detect_deadlocks)
+     ctx->dd->DestroyPhysicalThread(proc->dd_pt);
+  proc->~Processor();
+  InternalFree(proc);
+}
+
+void ProcWire(Processor *proc, ThreadState *thr) {
+  CHECK_EQ(thr->proc1, nullptr);
+  CHECK_EQ(proc->thr, nullptr);
+  thr->proc1 = proc;
+  proc->thr = thr;
+}
+
+void ProcUnwire(Processor *proc, ThreadState *thr) {
+  CHECK_EQ(thr->proc1, proc);
+  CHECK_EQ(proc->thr, thr);
+  thr->proc1 = nullptr;
+  proc->thr = nullptr;
+}
+
+}  // namespace __tsan
diff --git a/lib/tsan/rtl/tsan_rtl_report.cc b/lib/tsan/rtl/tsan_rtl_report.cc
index 5aff6ca..810119b 100644
--- a/lib/tsan/rtl/tsan_rtl_report.cc
+++ b/lib/tsan/rtl/tsan_rtl_report.cc
@@ -56,6 +56,11 @@
 }
 #endif
 
+SANITIZER_WEAK_DEFAULT_IMPL
+void __tsan_on_report(const ReportDesc *rep) {
+  (void)rep;
+}
+
 static void StackStripMain(SymbolizedStack *frames) {
   SymbolizedStack *last_frame = nullptr;
   SymbolizedStack *last_frame2 = nullptr;
@@ -189,7 +194,7 @@
   ReportThread *rt = new(mem) ReportThread;
   rep_->threads.PushBack(rt);
   rt->id = tctx->tid;
-  rt->pid = tctx->os_id;
+  rt->os_id = tctx->os_id;
   rt->running = (tctx->status == ThreadStatusRunning);
   rt->name = internal_strdup(tctx->name);
   rt->parent_tid = tctx->parent_tid;
@@ -268,7 +273,7 @@
   u64 uid = 0;
   u64 mid = id;
   uptr addr = SyncVar::SplitId(id, &uid);
-  SyncVar *s = ctx->metamap.GetIfExistsAndLock(addr);
+  SyncVar *s = ctx->metamap.GetIfExistsAndLock(addr, true);
   // Check that the mutex is still alive.
   // Another mutex can be created at the same address,
   // so check uid as well.
@@ -342,12 +347,12 @@
     rep_->locs.PushBack(loc);
     AddThread(tctx);
   }
+#endif
   if (ReportLocation *loc = SymbolizeData(addr)) {
     loc->suppressable = true;
     rep_->locs.PushBack(loc);
     return;
   }
-#endif
 }
 
 #ifndef SANITIZER_GO
@@ -492,6 +497,8 @@
     return false;
   atomic_store_relaxed(&ctx->last_symbolize_time_ns, NanoTime());
   const ReportDesc *rep = srep.GetReport();
+  CHECK_EQ(thr->current_report, nullptr);
+  thr->current_report = rep;
   Suppression *supp = 0;
   uptr pc_or_addr = 0;
   for (uptr i = 0; pc_or_addr == 0 && i < rep->mops.Size(); i++)
@@ -512,13 +519,17 @@
     thr->is_freeing = false;
     bool suppressed = OnReport(rep, pc_or_addr != 0);
     thr->is_freeing = old_is_freeing;
-    if (suppressed)
+    if (suppressed) {
+      thr->current_report = nullptr;
       return false;
+    }
   }
   PrintReport(rep);
+  __tsan_on_report(rep);
   ctx->nreported++;
   if (flags()->halt_on_error)
     Die();
+  thr->current_report = nullptr;
   return true;
 }
 
@@ -669,6 +680,14 @@
   PrintStack(SymbolizeStack(trace));
 }
 
+// Always inlining PrintCurrentStackSlow, because LocatePcInTrace assumes
+// __sanitizer_print_stack_trace exists in the actual unwinded stack, but
+// tail-call to PrintCurrentStackSlow breaks this assumption because
+// __sanitizer_print_stack_trace disappears after tail-call.
+// However, this solution is not reliable enough, please see dvyukov's comment
+// http://reviews.llvm.org/D19148#406208
+// Also see PR27280 comment 2 and 3 for breaking examples and analysis.
+ALWAYS_INLINE
 void PrintCurrentStackSlow(uptr pc) {
 #ifndef SANITIZER_GO
   BufferedStackTrace *ptrace =
diff --git a/lib/tsan/rtl/tsan_rtl_thread.cc b/lib/tsan/rtl/tsan_rtl_thread.cc
index dcae255..ab8f3c3 100644
--- a/lib/tsan/rtl/tsan_rtl_thread.cc
+++ b/lib/tsan/rtl/tsan_rtl_thread.cc
@@ -42,7 +42,7 @@
 void ThreadContext::OnJoined(void *arg) {
   ThreadState *caller_thr = static_cast<ThreadState *>(arg);
   AcquireImpl(caller_thr, 0, &sync);
-  sync.Reset(&caller_thr->clock_cache);
+  sync.Reset(&caller_thr->proc()->clock_cache);
 }
 
 struct OnCreatedArgs {
@@ -74,7 +74,7 @@
 
 void ThreadContext::OnDetached(void *arg) {
   ThreadState *thr1 = static_cast<ThreadState*>(arg);
-  sync.Reset(&thr1->clock_cache);
+  sync.Reset(&thr1->proc()->clock_cache);
 }
 
 struct OnStartedArgs {
@@ -106,13 +106,8 @@
   thr->shadow_stack_pos = thr->shadow_stack;
   thr->shadow_stack_end = thr->shadow_stack + kInitStackSize;
 #endif
-#ifndef SANITIZER_GO
-  AllocatorThreadStart(thr);
-#endif
-  if (common_flags()->detect_deadlocks) {
-    thr->dd_pt = ctx->dd->CreatePhysicalThread();
+  if (common_flags()->detect_deadlocks)
     thr->dd_lt = ctx->dd->CreateLogicalThread(unique_id);
-  }
   thr->fast_state.SetHistorySize(flags()->history_size);
   // Commit switch to the new part of the trace.
   // TraceAddEvent will reset stack0/mset0 in the new part for us.
@@ -121,7 +116,7 @@
   thr->fast_synch_epoch = epoch0;
   AcquireImpl(thr, 0, &sync);
   StatInc(thr, StatSyncAcquire);
-  sync.Reset(&thr->clock_cache);
+  sync.Reset(&thr->proc()->clock_cache);
   thr->is_inited = true;
   DPrintf("#%d: ThreadStart epoch=%zu stk_addr=%zx stk_size=%zx "
           "tls_addr=%zx tls_size=%zx\n",
@@ -138,15 +133,8 @@
   }
   epoch1 = thr->fast_state.epoch();
 
-  if (common_flags()->detect_deadlocks) {
-    ctx->dd->DestroyPhysicalThread(thr->dd_pt);
+  if (common_flags()->detect_deadlocks)
     ctx->dd->DestroyLogicalThread(thr->dd_lt);
-  }
-  ctx->clock_alloc.FlushCache(&thr->clock_cache);
-  ctx->metamap.OnThreadIdle(thr);
-#ifndef SANITIZER_GO
-  AllocatorThreadFinish(thr);
-#endif
   thr->~ThreadState();
 #if TSAN_COLLECT_STATS
   StatAggregate(ctx->stat, thr->stat);
diff --git a/lib/tsan/rtl/tsan_stat.cc b/lib/tsan/rtl/tsan_stat.cc
index a5cca96..d1d6ed2 100644
--- a/lib/tsan/rtl/tsan_stat.cc
+++ b/lib/tsan/rtl/tsan_stat.cc
@@ -168,6 +168,7 @@
   name[StatMtxFired]                     = "  FiredSuppressions               ";
   name[StatMtxRacy]                      = "  RacyStacks                      ";
   name[StatMtxFD]                        = "  FD                              ";
+  name[StatMtxGlobalProc]                = "  GlobalProc                      ";
 
   Printf("Statistics:\n");
   for (int i = 0; i < StatCnt; i++)
diff --git a/lib/tsan/rtl/tsan_stat.h b/lib/tsan/rtl/tsan_stat.h
index 8ea3204..8447dd8 100644
--- a/lib/tsan/rtl/tsan_stat.h
+++ b/lib/tsan/rtl/tsan_stat.h
@@ -173,6 +173,7 @@
   StatMtxFired,
   StatMtxRacy,
   StatMtxFD,
+  StatMtxGlobalProc,
 
   // This must be the last.
   StatCnt
diff --git a/lib/tsan/rtl/tsan_suppressions.cc b/lib/tsan/rtl/tsan_suppressions.cc
index 8754b61..aea3cb9 100644
--- a/lib/tsan/rtl/tsan_suppressions.cc
+++ b/lib/tsan/rtl/tsan_suppressions.cc
@@ -80,6 +80,8 @@
     return kSuppressionMutex;
   else if (typ == ReportTypeMutexDoubleLock)
     return kSuppressionMutex;
+  else if (typ == ReportTypeMutexInvalidAccess)
+    return kSuppressionMutex;
   else if (typ == ReportTypeMutexBadUnlock)
     return kSuppressionMutex;
   else if (typ == ReportTypeMutexBadReadLock)
@@ -92,7 +94,7 @@
     return kSuppressionNone;
   else if (typ == ReportTypeDeadlock)
     return kSuppressionDeadlock;
-  Printf("ThreadSanitizer: unknown report type %d\n", typ),
+  Printf("ThreadSanitizer: unknown report type %d\n", typ);
   Die();
 }
 
@@ -159,8 +161,8 @@
   Printf("ThreadSanitizer: Matched %d suppressions (pid=%d):\n", hit_count,
          (int)internal_getpid());
   for (uptr i = 0; i < matched.size(); i++) {
-    Printf("%d %s:%s\n", matched[i]->hit_count, matched[i]->type,
-           matched[i]->templ);
+    Printf("%d %s:%s\n", atomic_load_relaxed(&matched[i]->hit_count),
+           matched[i]->type, matched[i]->templ);
   }
 }
 }  // namespace __tsan
diff --git a/lib/tsan/rtl/tsan_sync.cc b/lib/tsan/rtl/tsan_sync.cc
index 4202d30..58b2680 100644
--- a/lib/tsan/rtl/tsan_sync.cc
+++ b/lib/tsan/rtl/tsan_sync.cc
@@ -36,7 +36,7 @@
     DDMutexInit(thr, pc, this);
 }
 
-void SyncVar::Reset(ThreadState *thr) {
+void SyncVar::Reset(Processor *proc) {
   uid = 0;
   creation_stack_id = 0;
   owner_tid = kInvalidTid;
@@ -47,12 +47,12 @@
   is_broken = 0;
   is_linker_init = 0;
 
-  if (thr == 0) {
+  if (proc == 0) {
     CHECK_EQ(clock.size(), 0);
     CHECK_EQ(read_clock.size(), 0);
   } else {
-    clock.Reset(&thr->clock_cache);
-    read_clock.Reset(&thr->clock_cache);
+    clock.Reset(&proc->clock_cache);
+    read_clock.Reset(&proc->clock_cache);
   }
 }
 
@@ -61,7 +61,7 @@
 }
 
 void MetaMap::AllocBlock(ThreadState *thr, uptr pc, uptr p, uptr sz) {
-  u32 idx = block_alloc_.Alloc(&thr->block_cache);
+  u32 idx = block_alloc_.Alloc(&thr->proc()->block_cache);
   MBlock *b = block_alloc_.Map(idx);
   b->siz = sz;
   b->tid = thr->tid;
@@ -71,16 +71,16 @@
   *meta = idx | kFlagBlock;
 }
 
-uptr MetaMap::FreeBlock(ThreadState *thr, uptr pc, uptr p) {
+uptr MetaMap::FreeBlock(Processor *proc, uptr p) {
   MBlock* b = GetBlock(p);
   if (b == 0)
     return 0;
   uptr sz = RoundUpTo(b->siz, kMetaShadowCell);
-  FreeRange(thr, pc, p, sz);
+  FreeRange(proc, p, sz);
   return sz;
 }
 
-bool MetaMap::FreeRange(ThreadState *thr, uptr pc, uptr p, uptr sz) {
+bool MetaMap::FreeRange(Processor *proc, uptr p, uptr sz) {
   bool has_something = false;
   u32 *meta = MemToMeta(p);
   u32 *end = MemToMeta(p + sz);
@@ -96,14 +96,14 @@
     has_something = true;
     while (idx != 0) {
       if (idx & kFlagBlock) {
-        block_alloc_.Free(&thr->block_cache, idx & ~kFlagMask);
+        block_alloc_.Free(&proc->block_cache, idx & ~kFlagMask);
         break;
       } else if (idx & kFlagSync) {
         DCHECK(idx & kFlagSync);
         SyncVar *s = sync_alloc_.Map(idx & ~kFlagMask);
         u32 next = s->next;
-        s->Reset(thr);
-        sync_alloc_.Free(&thr->sync_cache, idx & ~kFlagMask);
+        s->Reset(proc);
+        sync_alloc_.Free(&proc->sync_cache, idx & ~kFlagMask);
         idx = next;
       } else {
         CHECK(0);
@@ -119,24 +119,30 @@
 // which can be huge. The function probes pages one-by-one until it finds a page
 // without meta objects, at this point it stops freeing meta objects. Because
 // thread stacks grow top-down, we do the same starting from end as well.
-void MetaMap::ResetRange(ThreadState *thr, uptr pc, uptr p, uptr sz) {
+void MetaMap::ResetRange(Processor *proc, uptr p, uptr sz) {
+  if (kGoMode) {
+    // UnmapOrDie/MmapFixedNoReserve does not work on Windows,
+    // so we do the optimization only for C/C++.
+    FreeRange(proc, p, sz);
+    return;
+  }
   const uptr kMetaRatio = kMetaShadowCell / kMetaShadowSize;
   const uptr kPageSize = GetPageSizeCached() * kMetaRatio;
   if (sz <= 4 * kPageSize) {
     // If the range is small, just do the normal free procedure.
-    FreeRange(thr, pc, p, sz);
+    FreeRange(proc, p, sz);
     return;
   }
   // First, round both ends of the range to page size.
   uptr diff = RoundUp(p, kPageSize) - p;
   if (diff != 0) {
-    FreeRange(thr, pc, p, diff);
+    FreeRange(proc, p, diff);
     p += diff;
     sz -= diff;
   }
   diff = p + sz - RoundDown(p + sz, kPageSize);
   if (diff != 0) {
-    FreeRange(thr, pc, p + sz - diff, diff);
+    FreeRange(proc, p + sz - diff, diff);
     sz -= diff;
   }
   // Now we must have a non-empty page-aligned range.
@@ -146,18 +152,21 @@
   const uptr p0 = p;
   const uptr sz0 = sz;
   // Probe start of the range.
-  while (sz > 0) {
-    bool has_something = FreeRange(thr, pc, p, kPageSize);
+  for (uptr checked = 0; sz > 0; checked += kPageSize) {
+    bool has_something = FreeRange(proc, p, kPageSize);
     p += kPageSize;
     sz -= kPageSize;
-    if (!has_something)
+    if (!has_something && checked > (128 << 10))
       break;
   }
   // Probe end of the range.
-  while (sz > 0) {
-    bool has_something = FreeRange(thr, pc, p - kPageSize, kPageSize);
+  for (uptr checked = 0; sz > 0; checked += kPageSize) {
+    bool has_something = FreeRange(proc, p + sz - kPageSize, kPageSize);
     sz -= kPageSize;
-    if (!has_something)
+    // Stacks grow down, so sync objects are most likely at the end of the region
+    // (if it is a stack). The very end of the stack is TLS and tsan increases
+    // TLS by at least 256K, so check at least 512K.
+    if (!has_something && checked > (512 << 10))
       break;
   }
   // Finally, page out the whole range (including the parts that we've just
@@ -189,8 +198,8 @@
   return GetAndLock(thr, pc, addr, write_lock, true);
 }
 
-SyncVar* MetaMap::GetIfExistsAndLock(uptr addr) {
-  return GetAndLock(0, 0, addr, true, false);
+SyncVar* MetaMap::GetIfExistsAndLock(uptr addr, bool write_lock) {
+  return GetAndLock(0, 0, addr, write_lock, false);
 }
 
 SyncVar* MetaMap::GetAndLock(ThreadState *thr, uptr pc,
@@ -210,8 +219,8 @@
       SyncVar * s = sync_alloc_.Map(idx & ~kFlagMask);
       if (s->addr == addr) {
         if (myidx != 0) {
-          mys->Reset(thr);
-          sync_alloc_.Free(&thr->sync_cache, myidx);
+          mys->Reset(thr->proc());
+          sync_alloc_.Free(&thr->proc()->sync_cache, myidx);
         }
         if (write_lock)
           s->mtx.Lock();
@@ -230,7 +239,7 @@
 
     if (myidx == 0) {
       const u64 uid = atomic_fetch_add(&uid_gen_, 1, memory_order_relaxed);
-      myidx = sync_alloc_.Alloc(&thr->sync_cache);
+      myidx = sync_alloc_.Alloc(&thr->proc()->sync_cache);
       mys = sync_alloc_.Map(myidx);
       mys->Init(thr, pc, addr, uid);
     }
@@ -279,9 +288,9 @@
   }
 }
 
-void MetaMap::OnThreadIdle(ThreadState *thr) {
-  block_alloc_.FlushCache(&thr->block_cache);
-  sync_alloc_.FlushCache(&thr->sync_cache);
+void MetaMap::OnProcIdle(Processor *proc) {
+  block_alloc_.FlushCache(&proc->block_cache);
+  sync_alloc_.FlushCache(&proc->sync_cache);
 }
 
 }  // namespace __tsan
diff --git a/lib/tsan/rtl/tsan_sync.h b/lib/tsan/rtl/tsan_sync.h
index f07ea3b..2bc2f41 100644
--- a/lib/tsan/rtl/tsan_sync.h
+++ b/lib/tsan/rtl/tsan_sync.h
@@ -47,7 +47,7 @@
   SyncClock clock;
 
   void Init(ThreadState *thr, uptr pc, uptr addr, u64 uid);
-  void Reset(ThreadState *thr);
+  void Reset(Processor *proc);
 
   u64 GetId() const {
     // 47 lsb is addr, then 14 bits is low part of uid, then 3 zero bits.
@@ -72,18 +72,18 @@
   MetaMap();
 
   void AllocBlock(ThreadState *thr, uptr pc, uptr p, uptr sz);
-  uptr FreeBlock(ThreadState *thr, uptr pc, uptr p);
-  bool FreeRange(ThreadState *thr, uptr pc, uptr p, uptr sz);
-  void ResetRange(ThreadState *thr, uptr pc, uptr p, uptr sz);
+  uptr FreeBlock(Processor *proc, uptr p);
+  bool FreeRange(Processor *proc, uptr p, uptr sz);
+  void ResetRange(Processor *proc, uptr p, uptr sz);
   MBlock* GetBlock(uptr p);
 
   SyncVar* GetOrCreateAndLock(ThreadState *thr, uptr pc,
                               uptr addr, bool write_lock);
-  SyncVar* GetIfExistsAndLock(uptr addr);
+  SyncVar* GetIfExistsAndLock(uptr addr, bool write_lock);
 
   void MoveMemory(uptr src, uptr dst, uptr sz);
 
-  void OnThreadIdle(ThreadState *thr);
+  void OnProcIdle(Processor *proc);
 
  private:
   static const u32 kFlagMask  = 3u << 30;
diff --git a/lib/tsan/tests/CMakeLists.txt b/lib/tsan/tests/CMakeLists.txt
index 51181ba..d1b1e96 100644
--- a/lib/tsan/tests/CMakeLists.txt
+++ b/lib/tsan/tests/CMakeLists.txt
@@ -6,7 +6,7 @@
 
 set(TSAN_UNITTEST_CFLAGS
   ${TSAN_CFLAGS}
-  ${COMPILER_RT_TEST_CFLAGS}
+  ${COMPILER_RT_UNITTEST_CFLAGS}
   ${COMPILER_RT_GTEST_CFLAGS}
   -I${COMPILER_RT_SOURCE_DIR}/lib
   -I${COMPILER_RT_SOURCE_DIR}/lib/tsan/rtl
diff --git a/lib/tsan/tests/rtl/tsan_test.cc b/lib/tsan/tests/rtl/tsan_test.cc
index edfede0..842b417 100644
--- a/lib/tsan/tests/rtl/tsan_test.cc
+++ b/lib/tsan/tests/rtl/tsan_test.cc
@@ -54,6 +54,12 @@
 }
 #endif
 
+namespace __sanitizer {
+bool ReexecDisabled() {
+  return true;
+}
+}
+
 int main(int argc, char **argv) {
   argv0 = argv[0];
   return run_tests(argc, argv);
diff --git a/lib/tsan/tests/unit/tsan_sync_test.cc b/lib/tsan/tests/unit/tsan_sync_test.cc
index d3616a1..8016654 100644
--- a/lib/tsan/tests/unit/tsan_sync_test.cc
+++ b/lib/tsan/tests/unit/tsan_sync_test.cc
@@ -25,7 +25,7 @@
   EXPECT_NE(mb, (MBlock*)0);
   EXPECT_EQ(mb->siz, 1 * sizeof(u64));
   EXPECT_EQ(mb->tid, thr->tid);
-  uptr sz = m->FreeBlock(thr, 0, (uptr)&block[0]);
+  uptr sz = m->FreeBlock(thr->proc(), (uptr)&block[0]);
   EXPECT_EQ(sz, 1 * sizeof(u64));
   mb = m->GetBlock((uptr)&block[0]);
   EXPECT_EQ(mb, (MBlock*)0);
@@ -41,7 +41,7 @@
   EXPECT_EQ(mb1->siz, 1 * sizeof(u64));
   MBlock *mb2 = m->GetBlock((uptr)&block[1]);
   EXPECT_EQ(mb2->siz, 3 * sizeof(u64));
-  m->FreeRange(thr, 0, (uptr)&block[0], 4 * sizeof(u64));
+  m->FreeRange(thr->proc(), (uptr)&block[0], 4 * sizeof(u64));
   mb1 = m->GetBlock((uptr)&block[0]);
   EXPECT_EQ(mb1, (MBlock*)0);
   mb2 = m->GetBlock((uptr)&block[1]);
@@ -53,7 +53,7 @@
   MetaMap *m = &ctx->metamap;
   u64 block[4] = {};  // fake malloc block
   m->AllocBlock(thr, 0, (uptr)&block[0], 4 * sizeof(u64));
-  SyncVar *s1 = m->GetIfExistsAndLock((uptr)&block[0]);
+  SyncVar *s1 = m->GetIfExistsAndLock((uptr)&block[0], true);
   EXPECT_EQ(s1, (SyncVar*)0);
   s1 = m->GetOrCreateAndLock(thr, 0, (uptr)&block[0], true);
   EXPECT_NE(s1, (SyncVar*)0);
@@ -63,12 +63,12 @@
   EXPECT_NE(s2, (SyncVar*)0);
   EXPECT_EQ(s2->addr, (uptr)&block[1]);
   s2->mtx.ReadUnlock();
-  m->FreeBlock(thr, 0, (uptr)&block[0]);
-  s1 = m->GetIfExistsAndLock((uptr)&block[0]);
+  m->FreeBlock(thr->proc(), (uptr)&block[0]);
+  s1 = m->GetIfExistsAndLock((uptr)&block[0], true);
   EXPECT_EQ(s1, (SyncVar*)0);
-  s2 = m->GetIfExistsAndLock((uptr)&block[1]);
+  s2 = m->GetIfExistsAndLock((uptr)&block[1], true);
   EXPECT_EQ(s2, (SyncVar*)0);
-  m->OnThreadIdle(thr);
+  m->OnProcIdle(thr->proc());
 }
 
 TEST(MetaMap, MoveMemory) {
@@ -93,19 +93,19 @@
   mb2 = m->GetBlock((uptr)&block2[3]);
   EXPECT_NE(mb2, (MBlock*)0);
   EXPECT_EQ(mb2->siz, 1 * sizeof(u64));
-  s1 = m->GetIfExistsAndLock((uptr)&block1[0]);
+  s1 = m->GetIfExistsAndLock((uptr)&block1[0], true);
   EXPECT_EQ(s1, (SyncVar*)0);
-  s2 = m->GetIfExistsAndLock((uptr)&block1[1]);
+  s2 = m->GetIfExistsAndLock((uptr)&block1[1], true);
   EXPECT_EQ(s2, (SyncVar*)0);
-  s1 = m->GetIfExistsAndLock((uptr)&block2[0]);
+  s1 = m->GetIfExistsAndLock((uptr)&block2[0], true);
   EXPECT_NE(s1, (SyncVar*)0);
   EXPECT_EQ(s1->addr, (uptr)&block2[0]);
   s1->mtx.Unlock();
-  s2 = m->GetIfExistsAndLock((uptr)&block2[1]);
+  s2 = m->GetIfExistsAndLock((uptr)&block2[1], true);
   EXPECT_NE(s2, (SyncVar*)0);
   EXPECT_EQ(s2->addr, (uptr)&block2[1]);
   s2->mtx.Unlock();
-  m->FreeRange(thr, 0, (uptr)&block2[0], 4 * sizeof(u64));
+  m->FreeRange(thr->proc(), (uptr)&block2[0], 4 * sizeof(u64));
 }
 
 TEST(MetaMap, ResetSync) {
@@ -114,9 +114,9 @@
   u64 block[1] = {};  // fake malloc block
   m->AllocBlock(thr, 0, (uptr)&block[0], 1 * sizeof(u64));
   SyncVar *s = m->GetOrCreateAndLock(thr, 0, (uptr)&block[0], true);
-  s->Reset(thr);
+  s->Reset(thr->proc());
   s->mtx.Unlock();
-  uptr sz = m->FreeBlock(thr, 0, (uptr)&block[0]);
+  uptr sz = m->FreeBlock(thr->proc(), (uptr)&block[0]);
   EXPECT_EQ(sz, 1 * sizeof(u64));
 }
 
diff --git a/lib/tsan/tests/unit/tsan_unit_test_main.cc b/lib/tsan/tests/unit/tsan_unit_test_main.cc
index 84d94dd..2d55747 100644
--- a/lib/tsan/tests/unit/tsan_unit_test_main.cc
+++ b/lib/tsan/tests/unit/tsan_unit_test_main.cc
@@ -12,6 +12,12 @@
 //===----------------------------------------------------------------------===//
 #include "gtest/gtest.h"
 
+namespace __sanitizer {
+bool ReexecDisabled() {
+  return true;
+}
+}
+
 int main(int argc, char **argv) {
   testing::GTEST_FLAG(death_test_style) = "threadsafe";
   testing::InitGoogleTest(&argc, argv);
diff --git a/lib/ubsan/Android.bp b/lib/ubsan/Android.bp
index b8834fb..537f83a 100644
--- a/lib/ubsan/Android.bp
+++ b/lib/ubsan/Android.bp
@@ -38,6 +38,7 @@
     "-Werror",
     "-Wno-unused-parameter",
     "-Wno-non-virtual-dtor",
+    "-DUBSAN_CAN_USE_CXXABI",
 ]
 
 ubsan_rtl_c_includes = ["external/compiler-rt/lib"]
@@ -45,7 +46,6 @@
 cc_library_static {
     name: "libubsan",
     host_supported: true,
-    defaults: ["asan_arch_defaults"],
 
     include_dirs: ubsan_rtl_c_includes,
     cppflags: ubsan_rtl_cppflags,
@@ -63,6 +63,109 @@
     },
 }
 
+cc_library_static {
+    name: "libubsan_cxx",
+    host_supported: true,
+
+    include_dirs: ubsan_rtl_c_includes,
+    cppflags: ubsan_rtl_cppflags,
+    rtti: true,
+    srcs: ubsan_cxx_rtl_files,
+    sdk_version: "19",
+    sanitize: {
+        never: true,
+    },
+    compile_multilib: "both",
+}
+
+cc_defaults {
+    name: "libclang_rt_ubsan_defaults",
+
+    include_dirs: [
+        "external/compiler-rt/lib",
+        "external/compiler-rt/include",
+    ],
+    static_libs: [
+        "libsan",
+    ],
+    whole_static_libs: [
+        "libubsan",
+        "libubsan_cxx",
+    ],
+    shared_libs: [
+        "liblog",
+        "libdl",
+    ],
+    clang: true,
+    sanitize: {
+        never: true,
+    },
+    // _cxx bits (vptr-sanitizer and cfi) need dynamic_cast<>
+    stl: "c++_static",
+    sdk_version: "19",
+    enabled: false,
+}
+
+cc_library_shared {
+    name: "libclang_rt.ubsan_standalone-arm-android",
+    defaults: ["libclang_rt_ubsan_defaults"],
+    arch: {
+        arm: {
+            enabled: true,
+        },
+    },
+}
+
+cc_library_shared {
+    name: "libclang_rt.ubsan_standalone-aarch64-android",
+    defaults: ["libclang_rt_ubsan_defaults"],
+    arch: {
+        arm64: {
+            enabled: true,
+        },
+    },
+}
+
+cc_library_shared {
+    name: "libclang_rt.ubsan_standalone-i686-android",
+    defaults: ["libclang_rt_ubsan_defaults"],
+    arch: {
+        x86: {
+            enabled: true,
+        },
+    },
+}
+
+cc_library_shared {
+    name: "libclang_rt.ubsan_standalone-x86_64-android",
+    defaults: ["libclang_rt_ubsan_defaults"],
+    arch: {
+        x86_64: {
+            enabled: true,
+        },
+    },
+}
+
+cc_library_shared {
+    name: "libclang_rt.ubsan_standalone-mips-android",
+    defaults: ["libclang_rt_ubsan_defaults"],
+    arch: {
+        mips: {
+            enabled: true,
+        },
+    },
+}
+
+cc_library_shared {
+    name: "libclang_rt.ubsan_standalone-mips64-android",
+    defaults: ["libclang_rt_ubsan_defaults"],
+    arch: {
+        mips64: {
+            enabled: true,
+        },
+    },
+}
+
 //###############################################################################
 // Host modules
 
@@ -86,23 +189,6 @@
 }
 
 cc_library_host_static {
-    name: "libubsan_cxx",
-
-    include_dirs: ubsan_rtl_c_includes,
-    cppflags: ubsan_rtl_cppflags,
-    srcs: ubsan_cxx_rtl_files,
-    sanitize: {
-        never: true,
-    },
-    compile_multilib: "both",
-    target: {
-        darwin: {
-            enabled: false,
-        },
-    },
-}
-
-cc_library_host_static {
     name: "libubsan_standalone_cxx",
 
     include_dirs: ubsan_rtl_c_includes,
diff --git a/lib/ubsan/CMakeLists.txt b/lib/ubsan/CMakeLists.txt
index 5ece9a6..901fef2 100644
--- a/lib/ubsan/CMakeLists.txt
+++ b/lib/ubsan/CMakeLists.txt
@@ -22,12 +22,19 @@
 include_directories(..)
 
 set(UBSAN_CFLAGS ${SANITIZER_COMMON_CFLAGS})
-append_no_rtti_flag(UBSAN_CFLAGS)
+append_rtti_flag(OFF UBSAN_CFLAGS)
+append_list_if(SANITIZER_CAN_USE_CXXABI -DUBSAN_CAN_USE_CXXABI UBSAN_CFLAGS)
+
 set(UBSAN_STANDALONE_CFLAGS ${SANITIZER_COMMON_CFLAGS})
-append_no_rtti_flag(UBSAN_STANDALONE_CFLAGS)
+append_rtti_flag(OFF UBSAN_STANDALONE_CFLAGS)
+append_list_if(SANITIZER_CAN_USE_CXXABI -DUBSAN_CAN_USE_CXXABI UBSAN_STANDALONE_CFLAGS)
+
 set(UBSAN_CXXFLAGS ${SANITIZER_COMMON_CFLAGS})
+append_rtti_flag(ON UBSAN_CXXFLAGS)
+append_list_if(SANITIZER_CAN_USE_CXXABI -DUBSAN_CAN_USE_CXXABI UBSAN_CXXFLAGS)
 
 add_custom_target(ubsan)
+set_target_properties(ubsan PROPERTIES FOLDER "Compiler-RT Misc")
 
 if(APPLE)
   set(UBSAN_COMMON_SOURCES ${UBSAN_SOURCES})
diff --git a/lib/ubsan/Makefile.mk b/lib/ubsan/Makefile.mk
deleted file mode 100644
index ec3f5c5..0000000
--- a/lib/ubsan/Makefile.mk
+++ /dev/null
@@ -1,28 +0,0 @@
-#===- lib/ubsan/Makefile.mk ---------------------------------*- Makefile -*--===#
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-#===------------------------------------------------------------------------===#
-
-ModuleName := ubsan
-SubDirs :=
-
-Sources := $(foreach file,$(wildcard $(Dir)/*.cc),$(notdir $(file)))
-StandaloneSources := ubsan_init_standalone.cc
-CXXSources := ubsan_type_hash.cc ubsan_handlers_cxx.cc
-CSources := $(filter-out $(StandaloneSources),$(filter-out $(CXXSources),$(Sources)))
-ObjNames := $(Sources:%.cc=%.o)
-
-Implementation := Generic
-
-# FIXME: use automatic dependencies?
-Dependencies := $(wildcard $(Dir)/*.h)
-Dependencies += $(wildcard $(Dir)/../sanitizer_common/*.h)
-
-# Define a convenience variable for all the ubsan functions.
-UbsanFunctions := $(CSources:%.cc=%)
-UbsanCXXFunctions := $(CXXSources:%.cc=%)
-UbsanStandaloneFunctions := $(StandaloneSources:%.cc=%)
diff --git a/lib/ubsan/ubsan_diag.cc b/lib/ubsan/ubsan_diag.cc
index 2476947..d842694 100644
--- a/lib/ubsan/ubsan_diag.cc
+++ b/lib/ubsan/ubsan_diag.cc
@@ -124,108 +124,98 @@
 }
 
 /// Hexadecimal printing for numbers too large for Printf to handle directly.
-static void PrintHex(UIntMax Val) {
+static void RenderHex(InternalScopedString *Buffer, UIntMax Val) {
 #if HAVE_INT128_T
-  Printf("0x%08x%08x%08x%08x",
-          (unsigned int)(Val >> 96),
-          (unsigned int)(Val >> 64),
-          (unsigned int)(Val >> 32),
-          (unsigned int)(Val));
+  Buffer->append("0x%08x%08x%08x%08x", (unsigned int)(Val >> 96),
+                 (unsigned int)(Val >> 64), (unsigned int)(Val >> 32),
+                 (unsigned int)(Val));
 #else
   UNREACHABLE("long long smaller than 64 bits?");
 #endif
 }
 
-static void renderLocation(Location Loc) {
-  InternalScopedString LocBuffer(1024);
+static void RenderLocation(InternalScopedString *Buffer, Location Loc) {
   switch (Loc.getKind()) {
   case Location::LK_Source: {
     SourceLocation SLoc = Loc.getSourceLocation();
     if (SLoc.isInvalid())
-      LocBuffer.append("<unknown>");
+      Buffer->append("<unknown>");
     else
-      RenderSourceLocation(&LocBuffer, SLoc.getFilename(), SLoc.getLine(),
+      RenderSourceLocation(Buffer, SLoc.getFilename(), SLoc.getLine(),
                            SLoc.getColumn(), common_flags()->symbolize_vs_style,
                            common_flags()->strip_path_prefix);
-    break;
+    return;
   }
   case Location::LK_Memory:
-    LocBuffer.append("%p", Loc.getMemoryLocation());
-    break;
+    Buffer->append("%p", Loc.getMemoryLocation());
+    return;
   case Location::LK_Symbolized: {
     const AddressInfo &Info = Loc.getSymbolizedStack()->info;
-    if (Info.file) {
-      RenderSourceLocation(&LocBuffer, Info.file, Info.line, Info.column,
+    if (Info.file)
+      RenderSourceLocation(Buffer, Info.file, Info.line, Info.column,
                            common_flags()->symbolize_vs_style,
                            common_flags()->strip_path_prefix);
-    } else if (Info.module) {
-      RenderModuleLocation(&LocBuffer, Info.module, Info.module_offset,
+    else if (Info.module)
+      RenderModuleLocation(Buffer, Info.module, Info.module_offset,
                            common_flags()->strip_path_prefix);
-    } else {
-      LocBuffer.append("%p", Info.address);
-    }
-    break;
+    else
+      Buffer->append("%p", Info.address);
+    return;
   }
   case Location::LK_Null:
-    LocBuffer.append("<unknown>");
-    break;
+    Buffer->append("<unknown>");
+    return;
   }
-  Printf("%s:", LocBuffer.data());
 }
 
-static void renderText(const char *Message, const Diag::Arg *Args) {
+static void RenderText(InternalScopedString *Buffer, const char *Message,
+                       const Diag::Arg *Args) {
   for (const char *Msg = Message; *Msg; ++Msg) {
     if (*Msg != '%') {
-      char Buffer[64];
-      unsigned I;
-      for (I = 0; Msg[I] && Msg[I] != '%' && I != 63; ++I)
-        Buffer[I] = Msg[I];
-      Buffer[I] = '\0';
-      Printf(Buffer);
-      Msg += I - 1;
-    } else {
-      const Diag::Arg &A = Args[*++Msg - '0'];
-      switch (A.Kind) {
-      case Diag::AK_String:
-        Printf("%s", A.String);
-        break;
-      case Diag::AK_TypeName: {
-        if (SANITIZER_WINDOWS)
-          // The Windows implementation demangles names early.
-          Printf("'%s'", A.String);
-        else
-          Printf("'%s'", Symbolizer::GetOrInit()->Demangle(A.String));
-        break;
-      }
-      case Diag::AK_SInt:
-        // 'long long' is guaranteed to be at least 64 bits wide.
-        if (A.SInt >= INT64_MIN && A.SInt <= INT64_MAX)
-          Printf("%lld", (long long)A.SInt);
-        else
-          PrintHex(A.SInt);
-        break;
-      case Diag::AK_UInt:
-        if (A.UInt <= UINT64_MAX)
-          Printf("%llu", (unsigned long long)A.UInt);
-        else
-          PrintHex(A.UInt);
-        break;
-      case Diag::AK_Float: {
-        // FIXME: Support floating-point formatting in sanitizer_common's
-        //        printf, and stop using snprintf here.
-        char Buffer[32];
+      Buffer->append("%c", *Msg);
+      continue;
+    }
+    const Diag::Arg &A = Args[*++Msg - '0'];
+    switch (A.Kind) {
+    case Diag::AK_String:
+      Buffer->append("%s", A.String);
+      break;
+    case Diag::AK_TypeName: {
+      if (SANITIZER_WINDOWS)
+        // The Windows implementation demangles names early.
+        Buffer->append("'%s'", A.String);
+      else
+        Buffer->append("'%s'", Symbolizer::GetOrInit()->Demangle(A.String));
+      break;
+    }
+    case Diag::AK_SInt:
+      // 'long long' is guaranteed to be at least 64 bits wide.
+      if (A.SInt >= INT64_MIN && A.SInt <= INT64_MAX)
+        Buffer->append("%lld", (long long)A.SInt);
+      else
+        RenderHex(Buffer, A.SInt);
+      break;
+    case Diag::AK_UInt:
+      if (A.UInt <= UINT64_MAX)
+        Buffer->append("%llu", (unsigned long long)A.UInt);
+      else
+        RenderHex(Buffer, A.UInt);
+      break;
+    case Diag::AK_Float: {
+      // FIXME: Support floating-point formatting in sanitizer_common's
+      //        printf, and stop using snprintf here.
+      char FloatBuffer[32];
 #if SANITIZER_WINDOWS
-        sprintf_s(Buffer, sizeof(Buffer), "%Lg", (long double)A.Float);
+      sprintf_s(FloatBuffer, sizeof(FloatBuffer), "%Lg", (long double)A.Float);
 #else
-        snprintf(Buffer, sizeof(Buffer), "%Lg", (long double)A.Float);
+      snprintf(FloatBuffer, sizeof(FloatBuffer), "%Lg", (long double)A.Float);
 #endif
-        Printf("%s", Buffer);
-        break;
-      }
-      case Diag::AK_Pointer:
-        Printf("%p", A.Pointer);
-        break;
-      }
+      Buffer->append("%s", FloatBuffer);
+      break;
+    }
+    case Diag::AK_Pointer:
+      Buffer->append("%p", A.Pointer);
+      break;
     }
   }
 }
@@ -253,9 +243,9 @@
 }
 
 /// Render a snippet of the address space near a location.
-static void renderMemorySnippet(const Decorator &Decor, MemoryLocation Loc,
-                                Range *Ranges, unsigned NumRanges,
-                                const Diag::Arg *Args) {
+static void PrintMemorySnippet(const Decorator &Decor, MemoryLocation Loc,
+                               Range *Ranges, unsigned NumRanges,
+                               const Diag::Arg *Args) {
   // Show at least the 8 bytes surrounding Loc.
   const unsigned MinBytesNearLoc = 4;
   MemoryLocation Min = subtractNoOverflow(Loc, MinBytesNearLoc);
@@ -278,14 +268,15 @@
   }
 
   // Emit data.
+  InternalScopedString Buffer(1024);
   for (uptr P = Min; P != Max; ++P) {
     unsigned char C = *reinterpret_cast<const unsigned char*>(P);
-    Printf("%s%02x", (P % 8 == 0) ? "  " : " ", C);
+    Buffer.append("%s%02x", (P % 8 == 0) ? "  " : " ", C);
   }
-  Printf("\n");
+  Buffer.append("\n");
 
   // Emit highlights.
-  Printf(Decor.Highlight());
+  Buffer.append(Decor.Highlight());
   Range *InRange = upperBound(Min, Ranges, NumRanges);
   for (uptr P = Min; P != Max; ++P) {
     char Pad = ' ', Byte = ' ';
@@ -297,10 +288,13 @@
       Pad = '~';
     if (InRange && InRange->getStart().getMemoryLocation() <= P)
       Byte = '~';
-    char Buffer[] = { Pad, Pad, P == Loc ? '^' : Byte, Byte, 0 };
-    Printf((P % 8 == 0) ? Buffer : &Buffer[1]);
+    if (P % 8 == 0)
+      Buffer.append("%c", Pad);
+    Buffer.append("%c", Pad);
+    Buffer.append("%c", P == Loc ? '^' : Byte);
+    Buffer.append("%c", Byte);
   }
-  Printf("%s\n", Decor.EndHighlight());
+  Buffer.append("%s\n", Decor.EndHighlight());
 
   // Go over the line again, and print names for the ranges.
   InRange = 0;
@@ -315,9 +309,9 @@
 
     if (InRange && InRange->getStart().getMemoryLocation() == P) {
       while (Spaces--)
-        Printf(" ");
-      renderText(InRange->getText(), Args);
-      Printf("\n");
+        Buffer.append(" ");
+      RenderText(&Buffer, InRange->getText(), Args);
+      Buffer.append("\n");
       // FIXME: We only support naming one range for now!
       break;
     }
@@ -325,6 +319,7 @@
     Spaces += 2;
   }
 
+  Printf("%s", Buffer.data());
   // FIXME: Print names for anything we can identify within the line:
   //
   //  * If we can identify the memory itself as belonging to a particular
@@ -341,28 +336,30 @@
   // All diagnostics should be printed under report mutex.
   CommonSanitizerReportMutex.CheckLocked();
   Decorator Decor;
-  Printf(Decor.Bold());
+  InternalScopedString Buffer(1024);
 
-  renderLocation(Loc);
+  Buffer.append(Decor.Bold());
+  RenderLocation(&Buffer, Loc);
+  Buffer.append(":");
 
   switch (Level) {
   case DL_Error:
-    Printf("%s runtime error: %s%s",
-           Decor.Warning(), Decor.EndWarning(), Decor.Bold());
+    Buffer.append("%s runtime error: %s%s", Decor.Warning(), Decor.EndWarning(),
+                  Decor.Bold());
     break;
 
   case DL_Note:
-    Printf("%s note: %s", Decor.Note(), Decor.EndNote());
+    Buffer.append("%s note: %s", Decor.Note(), Decor.EndNote());
     break;
   }
 
-  renderText(Message, Args);
+  RenderText(&Buffer, Message, Args);
 
-  Printf("%s\n", Decor.Default());
+  Buffer.append("%s\n", Decor.Default());
+  Printf("%s", Buffer.data());
 
   if (Loc.isMemoryLocation())
-    renderMemorySnippet(Decor, Loc.getMemoryLocation(), Ranges,
-                        NumRanges, Args);
+    PrintMemorySnippet(Decor, Loc.getMemoryLocation(), Ranges, NumRanges, Args);
 }
 
 ScopedReport::ScopedReport(ReportOptions Opts, Location SummaryLoc,
diff --git a/lib/ubsan/ubsan_flags.cc b/lib/ubsan/ubsan_flags.cc
index 20087b9..e77ba55 100644
--- a/lib/ubsan/ubsan_flags.cc
+++ b/lib/ubsan/ubsan_flags.cc
@@ -59,7 +59,7 @@
   parser.ParseString(MaybeCallUbsanDefaultOptions());
   // Override from environment variable.
   parser.ParseString(GetEnv("UBSAN_OPTIONS"));
-  SetVerbosity(common_flags()->verbosity);
+  InitializeCommonFlags();
   if (Verbosity()) ReportUnrecognizedFlags();
 
   if (common_flags()->help) parser.PrintFlagDescriptions();
diff --git a/lib/ubsan/ubsan_handlers.cc b/lib/ubsan/ubsan_handlers.cc
index 5d82e9a..4ede388 100644
--- a/lib/ubsan/ubsan_handlers.cc
+++ b/lib/ubsan/ubsan_handlers.cc
@@ -523,8 +523,11 @@
   Die();
 }
 
-static void handleCFIBadIcall(CFIBadIcallData *Data, ValueHandle Function,
+static void handleCFIBadIcall(CFICheckFailData *Data, ValueHandle Function,
                               ReportOptions Opts) {
+  if (Data->CheckKind != CFITCK_ICall)
+    Die();
+
   SourceLocation Loc = Data->Loc.acquire();
   ErrorType ET = ErrorType::CFIBadType;
 
@@ -544,16 +547,37 @@
   Diag(FLoc, DL_Note, "%0 defined here") << FName;
 }
 
-void __ubsan::__ubsan_handle_cfi_bad_icall(CFIBadIcallData *Data,
-                                           ValueHandle Function) {
+namespace __ubsan {
+#ifdef UBSAN_CAN_USE_CXXABI
+SANITIZER_WEAK_ATTRIBUTE
+void HandleCFIBadType(CFICheckFailData *Data, ValueHandle Vtable,
+                      bool ValidVtable, ReportOptions Opts);
+#else
+static void HandleCFIBadType(CFICheckFailData *Data, ValueHandle Vtable,
+                             bool ValidVtable, ReportOptions Opts) {
+  Die();
+}
+#endif
+}  // namespace __ubsan
+
+void __ubsan::__ubsan_handle_cfi_check_fail(CFICheckFailData *Data,
+                                            ValueHandle Value,
+                                            uptr ValidVtable) {
   GET_REPORT_OPTIONS(false);
-  handleCFIBadIcall(Data, Function, Opts);
+  if (Data->CheckKind == CFITCK_ICall)
+    handleCFIBadIcall(Data, Value, Opts);
+  else
+    HandleCFIBadType(Data, Value, ValidVtable, Opts);
 }
 
-void __ubsan::__ubsan_handle_cfi_bad_icall_abort(CFIBadIcallData *Data,
-                                                 ValueHandle Function) {
+void __ubsan::__ubsan_handle_cfi_check_fail_abort(CFICheckFailData *Data,
+                                                  ValueHandle Value,
+                                                  uptr ValidVtable) {
   GET_REPORT_OPTIONS(true);
-  handleCFIBadIcall(Data, Function, Opts);
+  if (Data->CheckKind == CFITCK_ICall)
+    handleCFIBadIcall(Data, Value, Opts);
+  else
+    HandleCFIBadType(Data, Value, ValidVtable, Opts);
   Die();
 }
 
diff --git a/lib/ubsan/ubsan_handlers.h b/lib/ubsan/ubsan_handlers.h
index 6f309cf..e0cfd5b 100644
--- a/lib/ubsan/ubsan_handlers.h
+++ b/lib/ubsan/ubsan_handlers.h
@@ -148,14 +148,25 @@
 /// \brief Handle passing null pointer to function with nonnull attribute.
 RECOVERABLE(nonnull_arg, NonNullArgData *Data)
 
-struct CFIBadIcallData {
+/// \brief Known CFI check kinds.
+/// Keep in sync with the enum of the same name in CodeGenFunction.h
+enum CFITypeCheckKind : unsigned char {
+  CFITCK_VCall,
+  CFITCK_NVCall,
+  CFITCK_DerivedCast,
+  CFITCK_UnrelatedCast,
+  CFITCK_ICall,
+};
+
+struct CFICheckFailData {
+  CFITypeCheckKind CheckKind;
   SourceLocation Loc;
   const TypeDescriptor &Type;
 };
 
-/// \brief Handle control flow integrity failure for indirect function calls.
-RECOVERABLE(cfi_bad_icall, CFIBadIcallData *Data, ValueHandle Function)
-
+/// \brief Handle control flow integrity failures.
+RECOVERABLE(cfi_check_fail, CFICheckFailData *Data, ValueHandle Function,
+            uptr VtableIsValid)
 }
 
 #endif // UBSAN_HANDLERS_H
diff --git a/lib/ubsan/ubsan_handlers_cxx.cc b/lib/ubsan/ubsan_handlers_cxx.cc
index 3e81be6..d97ec48 100644
--- a/lib/ubsan/ubsan_handlers_cxx.cc
+++ b/lib/ubsan/ubsan_handlers_cxx.cc
@@ -15,6 +15,7 @@
 
 #include "ubsan_platform.h"
 #if CAN_SANITIZE_UB
+#include "ubsan_handlers.h"
 #include "ubsan_handlers_cxx.h"
 #include "ubsan_diag.h"
 #include "ubsan_type_hash.h"
@@ -54,11 +55,17 @@
     << TypeCheckKinds[Data->TypeCheckKind] << (void*)Pointer << Data->Type;
 
   // If possible, say what type it actually points to.
-  if (!DTI.isValid())
-    Diag(Pointer, DL_Note, "object has invalid vptr")
-        << TypeName(DTI.getMostDerivedTypeName())
-        << Range(Pointer, Pointer + sizeof(uptr), "invalid vptr");
-  else if (!DTI.getOffset())
+  if (!DTI.isValid()) {
+    if (DTI.getOffset() < -VptrMaxOffsetToTop || DTI.getOffset() > VptrMaxOffsetToTop) {
+      Diag(Pointer, DL_Note, "object has a possibly invalid vptr: abs(offset to top) too big")
+          << TypeName(DTI.getMostDerivedTypeName())
+          << Range(Pointer, Pointer + sizeof(uptr), "possibly invalid vptr");
+    } else {
+      Diag(Pointer, DL_Note, "object has invalid vptr")
+          << TypeName(DTI.getMostDerivedTypeName())
+          << Range(Pointer, Pointer + sizeof(uptr), "invalid vptr");
+    }
+  } else if (!DTI.getOffset())
     Diag(Pointer, DL_Note, "object is of type %0")
         << TypeName(DTI.getMostDerivedTypeName())
         << Range(Pointer, Pointer + sizeof(uptr), "vptr for %0");
@@ -87,8 +94,9 @@
     Die();
 }
 
-static void HandleCFIBadType(CFIBadTypeData *Data, ValueHandle Vtable,
-                             ReportOptions Opts) {
+namespace __ubsan {
+void HandleCFIBadType(CFICheckFailData *Data, ValueHandle Vtable,
+                      bool ValidVtable, ReportOptions Opts) {
   SourceLocation Loc = Data->Loc.acquire();
   ErrorType ET = ErrorType::CFIBadType;
 
@@ -96,38 +104,44 @@
     return;
 
   ScopedReport R(Opts, Loc, ET);
-  DynamicTypeInfo DTI = getDynamicTypeInfoFromVtable((void*)Vtable);
+  DynamicTypeInfo DTI = ValidVtable
+                            ? getDynamicTypeInfoFromVtable((void *)Vtable)
+                            : DynamicTypeInfo(0, 0, 0);
 
-  static const char *TypeCheckKinds[] = {
-    "virtual call",
-    "non-virtual call",
-    "base-to-derived cast",
-    "cast to unrelated type",
-  };
+  const char *CheckKindStr;
+  switch (Data->CheckKind) {
+  case CFITCK_VCall:
+    CheckKindStr = "virtual call";
+    break;
+  case CFITCK_NVCall:
+    CheckKindStr = "non-virtual call";
+    break;
+  case CFITCK_DerivedCast:
+    CheckKindStr = "base-to-derived cast";
+    break;
+  case CFITCK_UnrelatedCast:
+    CheckKindStr = "cast to unrelated type";
+    break;
+  case CFITCK_ICall:
+    Die();
+  }
 
   Diag(Loc, DL_Error, "control flow integrity check for type %0 failed during "
                       "%1 (vtable address %2)")
-      << Data->Type << TypeCheckKinds[Data->TypeCheckKind] << (void *)Vtable;
+      << Data->Type << CheckKindStr << (void *)Vtable;
 
   // If possible, say what type it actually points to.
-  if (!DTI.isValid())
-    Diag(Vtable, DL_Note, "invalid vtable");
-  else
+  if (!DTI.isValid()) {
+    const char *module = Symbolizer::GetOrInit()->GetModuleNameForPc(Vtable);
+    if (module)
+      Diag(Vtable, DL_Note, "invalid vtable in module %0") << module;
+    else
+      Diag(Vtable, DL_Note, "invalid vtable");
+  } else {
     Diag(Vtable, DL_Note, "vtable is of type %0")
         << TypeName(DTI.getMostDerivedTypeName());
+  }
 }
+}  // namespace __ubsan
 
-void __ubsan::__ubsan_handle_cfi_bad_type(CFIBadTypeData *Data,
-                                          ValueHandle Vtable) {
-  GET_REPORT_OPTIONS(false);
-  HandleCFIBadType(Data, Vtable, Opts);
-}
-
-void __ubsan::__ubsan_handle_cfi_bad_type_abort(CFIBadTypeData *Data,
-                                                ValueHandle Vtable) {
-  GET_REPORT_OPTIONS(true);
-  HandleCFIBadType(Data, Vtable, Opts);
-  Die();
-}
-
-#endif  // CAN_SANITIZE_UB
+#endif // CAN_SANITIZE_UB
diff --git a/lib/ubsan/ubsan_handlers_cxx.h b/lib/ubsan/ubsan_handlers_cxx.h
index 92050d9..2ff014e 100644
--- a/lib/ubsan/ubsan_handlers_cxx.h
+++ b/lib/ubsan/ubsan_handlers_cxx.h
@@ -25,12 +25,6 @@
   unsigned char TypeCheckKind;
 };
 
-struct CFIBadTypeData {
-  SourceLocation Loc;
-  const TypeDescriptor &Type;
-  unsigned char TypeCheckKind;
-};
-
 /// \brief Handle a runtime type check failure, caused by an incorrect vptr.
 /// When this handler is called, all we know is that the type was not in the
 /// cache; this does not necessarily imply the existence of a bug.
@@ -40,14 +34,6 @@
 extern "C" SANITIZER_INTERFACE_ATTRIBUTE
 void __ubsan_handle_dynamic_type_cache_miss_abort(
   DynamicTypeCacheMissData *Data, ValueHandle Pointer, ValueHandle Hash);
-
-/// \brief Handle a control flow integrity check failure by printing a
-/// diagnostic.
-extern "C" SANITIZER_INTERFACE_ATTRIBUTE void
-__ubsan_handle_cfi_bad_type(CFIBadTypeData *Data, ValueHandle Vtable);
-extern "C" SANITIZER_INTERFACE_ATTRIBUTE void
-__ubsan_handle_cfi_bad_type_abort(CFIBadTypeData *Data, ValueHandle Vtable);
-
 }
 
 #endif // UBSAN_HANDLERS_H
diff --git a/lib/ubsan/ubsan_platform.h b/lib/ubsan/ubsan_platform.h
index 002ecf3..1a3bfd6 100644
--- a/lib/ubsan/ubsan_platform.h
+++ b/lib/ubsan/ubsan_platform.h
@@ -16,7 +16,8 @@
 // Other platforms should be easy to add, and probably work as-is.
 #if (defined(__linux__) || defined(__FreeBSD__) || defined(__APPLE__)) && \
     (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || \
-     defined(__aarch64__) || defined(__mips__) || defined(__powerpc64__))
+     defined(__aarch64__) || defined(__mips__) || defined(__powerpc64__) || \
+     defined(__s390__))
 # define CAN_SANITIZE_UB 1
 #elif defined(_WIN32)
 # define CAN_SANITIZE_UB 1
diff --git a/lib/ubsan/ubsan_type_hash.h b/lib/ubsan/ubsan_type_hash.h
index 695fed9..aa63871 100644
--- a/lib/ubsan/ubsan_type_hash.h
+++ b/lib/ubsan/ubsan_type_hash.h
@@ -53,6 +53,10 @@
 
 const unsigned VptrTypeCacheSize = 128;
 
+/// A sanity check for Vtable. Offsets to top must be reasonably small
+/// numbers (by absolute value). It's a weak check for Vtable corruption.
+const int VptrMaxOffsetToTop = 1<<20;
+
 /// \brief A cache of the results of checkDynamicType. \c checkDynamicType would
 /// return \c true (modulo hash collisions) if
 /// \code
diff --git a/lib/ubsan/ubsan_type_hash_itanium.cc b/lib/ubsan/ubsan_type_hash_itanium.cc
index b84e88d..26272e3 100644
--- a/lib/ubsan/ubsan_type_hash_itanium.cc
+++ b/lib/ubsan/ubsan_type_hash_itanium.cc
@@ -115,7 +115,9 @@
 static bool isDerivedFromAtOffset(const abi::__class_type_info *Derived,
                                   const abi::__class_type_info *Base,
                                   sptr Offset) {
-  if (Derived->__type_name == Base->__type_name)
+  if (Derived->__type_name == Base->__type_name ||
+      (SANITIZER_NON_UNIQUE_TYPEINFO &&
+       !internal_strcmp(Derived->__type_name, Base->__type_name)))
     return Offset == 0;
 
   if (const abi::__si_class_type_info *SI =
@@ -219,6 +221,10 @@
   VtablePrefix *Vtable = getVtablePrefix(VtablePtr);
   if (!Vtable)
     return false;
+  if (Vtable->Offset < -VptrMaxOffsetToTop || Vtable->Offset > VptrMaxOffsetToTop) {
+    // Too large or too small offset are signs of Vtable corruption.
+    return false;
+  }
 
   // Check that this is actually a type_info object for a class type.
   abi::__class_type_info *Derived =
@@ -241,6 +247,8 @@
   VtablePrefix *Vtable = getVtablePrefix(VtablePtr);
   if (!Vtable)
     return DynamicTypeInfo(0, 0, 0);
+  if (Vtable->Offset < -VptrMaxOffsetToTop || Vtable->Offset > VptrMaxOffsetToTop)
+    return DynamicTypeInfo(0, Vtable->Offset, 0);
   const abi::__class_type_info *ObjectType = findBaseAtOffset(
     static_cast<const abi::__class_type_info*>(Vtable->TypeInfo),
     -Vtable->Offset);
diff --git a/lib/ubsan/ubsan_value.cc b/lib/ubsan/ubsan_value.cc
index 79dc4c8..466834c 100644
--- a/lib/ubsan/ubsan_value.cc
+++ b/lib/ubsan/ubsan_value.cc
@@ -83,12 +83,12 @@
 #endif
       case 32: {
         float Value;
-#if defined(__BIG_ENDIAN__)
+#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
        // For big endian the float value is in the last 4 bytes.
        // On some targets we may only have 4 bytes so we count backwards from
        // the end of Val to account for both the 32-bit and 64-bit cases.
        internal_memcpy(&Value, ((const char*)(&Val + 1)) - 4, 4);
-#else 
+#else
        internal_memcpy(&Value, &Val, 4);
 #endif
         return Value;