Update compiler-rt for rebase to r212749.

Includes a cherry-pick of:
r213309 - fixes umodsi3

Change-Id: Ic7367e3586b6af7ef74bee6a8cf437d5f28d975a
diff --git a/lib/tsan/CMakeLists.txt b/lib/tsan/CMakeLists.txt
index 3a71e9a..19efb54 100644
--- a/lib/tsan/CMakeLists.txt
+++ b/lib/tsan/CMakeLists.txt
@@ -38,6 +38,7 @@
   rtl/tsan_rtl_mutex.cc
   rtl/tsan_rtl_report.cc
   rtl/tsan_rtl_thread.cc
+  rtl/tsan_stack_trace.cc
   rtl/tsan_stat.cc
   rtl/tsan_suppressions.cc
   rtl/tsan_symbolize.cc
@@ -54,6 +55,7 @@
 set(TSAN_HEADERS
   rtl/tsan_clock.h
   rtl/tsan_defs.h
+  rtl/tsan_dense_alloc.h
   rtl/tsan_fd.h
   rtl/tsan_flags.h
   rtl/tsan_ignoreset.h
@@ -67,6 +69,7 @@
   rtl/tsan_platform.h
   rtl/tsan_report.h
   rtl/tsan_rtl.h
+  rtl/tsan_stack_trace.h
   rtl/tsan_stat.h
   rtl/tsan_suppressions.h
   rtl/tsan_symbolize.h
@@ -101,7 +104,7 @@
 
 # Build libcxx instrumented with TSan.
 if(COMPILER_RT_HAS_LIBCXX_SOURCES AND
-   COMPILER_RT_TEST_COMPILER STREQUAL "Clang")
+   COMPILER_RT_TEST_COMPILER_ID STREQUAL "Clang")
   set(LIBCXX_PREFIX ${CMAKE_CURRENT_BINARY_DIR}/libcxx_tsan)
   add_custom_libcxx(libcxx_tsan ${LIBCXX_PREFIX}
     DEPS ${TSAN_RUNTIME_LIBRARIES}
diff --git a/lib/tsan/check_analyze.sh b/lib/tsan/check_analyze.sh
index 39d570b..08bfc7a 100755
--- a/lib/tsan/check_analyze.sh
+++ b/lib/tsan/check_analyze.sh
@@ -8,11 +8,11 @@
 
 PrintRes
 
-mops="write1 \
+wmops="write1 \
       write2 \
       write4 \
-      write8 \
-      read1 \
+      write8"
+rmops="read1 \
       read2 \
       read4 \
       read8"
@@ -27,10 +27,16 @@
   fi
 }
 
-for f in $mops; do
-  check $f rsp 1   # To read caller pc.
-  check $f push 0
-  check $f pop 0
+for f in $wmops; do
+  check $f rsp 3
+  check $f push 1
+  check $f pop 5
+done
+
+for f in $rmops; do
+  check $f rsp 3
+  check $f push 1
+  check $f pop 4
 done
 
 for f in $func; do
diff --git a/lib/tsan/check_memcpy.sh b/lib/tsan/check_memcpy.sh
index fe3e49e..101df11 100755
--- a/lib/tsan/check_memcpy.sh
+++ b/lib/tsan/check_memcpy.sh
@@ -17,7 +17,14 @@
 $CXX $SRC $CFLAGS -c -o $OBJ
 $CXX $OBJ $LDFLAGS -o $EXE
 
-NCALL=$(objdump -d $EXE | egrep "callq .*__interceptor_mem(cpy|set)" | wc -l)
+NCALL=$(objdump -d $EXE | egrep "callq .*<__interceptor_mem(cpy|set)>" | wc -l)
+if [ "$NCALL" != "0" ]; then
+  echo FAIL: found $NCALL memcpy/memset calls
+  exit 1
+fi
+
+# tail calls
+NCALL=$(objdump -d $EXE | egrep "jmpq .*<__interceptor_mem(cpy|set)>" | wc -l)
 if [ "$NCALL" != "0" ]; then
   echo FAIL: found $NCALL memcpy/memset calls
   exit 1
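
For context: when a function's last action is to return the result of memcpy
or memset, optimizing compilers commonly lower the call to a "jmpq" tail call
rather than a "callq", which the original pattern missed. A minimal C++ sketch
of the situation the new grep catches (illustrative only, not part of the
patch):

    // With optimization enabled, this typically compiles to
    // "jmpq <__interceptor_memcpy>" instead of "callq ...",
    // so the checker must grep for both instruction forms.
    void *forward_copy(void *dst, const void *src, unsigned long n) {
      return __builtin_memcpy(dst, src, n);
    }
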
diff --git a/lib/tsan/dd/CMakeLists.txt b/lib/tsan/dd/CMakeLists.txt
index a21e2dd..9328721 100644
--- a/lib/tsan/dd/CMakeLists.txt
+++ b/lib/tsan/dd/CMakeLists.txt
@@ -34,18 +34,15 @@
     CFLAGS ${DD_CFLAGS}
     DEFS ${DD_COMMON_DEFINITIONS})
 
-  add_library(RTDD OBJECT ${DD_SOURCES})
-  set_target_compile_flags(RTDD ${DD_CFLAGS})
-  set_property(TARGET RTDD APPEND PROPERTY
-    COMPILE_DEFINITIONS ${DD_COMMON_DEFINITIONS})
-  set_property(TARGET RTDD APPEND PROPERTY
-    COMPILE_DEFINITIONS ${DD_DYNAMIC_DEFINITIONS})
+  add_compiler_rt_object_library(RTDD ${arch}
+    SOURCES ${DD_SOURCES} CFLAGS ${DD_CFLAGS}
+    DEFS ${DD_COMMON_DEFINITIONS} ${DD_DYNAMIC_DEFINITIONS})
 
-  add_library(clang_rt.dyndd-${arch} SHARED
-    $<TARGET_OBJECTS:RTDD>
-    $<TARGET_OBJECTS:RTInterception.${arch}>
-    $<TARGET_OBJECTS:RTSanitizerCommon.${arch}>
-    $<TARGET_OBJECTS:RTSanitizerCommonLibc.${arch}>)
+  add_compiler_rt_runtime(clang_rt.dyndd-${arch} ${arch} SHARED
+    SOURCES $<TARGET_OBJECTS:RTDD.${arch}>
+            $<TARGET_OBJECTS:RTInterception.${arch}>
+            $<TARGET_OBJECTS:RTSanitizerCommon.${arch}>
+            $<TARGET_OBJECTS:RTSanitizerCommonLibc.${arch}>)
   target_link_libraries(clang_rt.dyndd-${arch} pthread dl)
 endif()
 
diff --git a/lib/tsan/go/build.bat b/lib/tsan/go/build.bat
index bc56784..4480e66 100644
--- a/lib/tsan/go/build.bat
+++ b/lib/tsan/go/build.bat
@@ -1,4 +1,4 @@
-type tsan_go.cc ..\rtl\tsan_clock.cc ..\rtl\tsan_flags.cc ..\rtl\tsan_md5.cc ..\rtl\tsan_mutex.cc ..\rtl\tsan_report.cc ..\rtl\tsan_rtl.cc ..\rtl\tsan_rtl_mutex.cc ..\rtl\tsan_rtl_report.cc ..\rtl\tsan_rtl_thread.cc ..\rtl\tsan_stat.cc ..\rtl\tsan_suppressions.cc ..\rtl\tsan_sync.cc ..\..\sanitizer_common\sanitizer_allocator.cc ..\..\sanitizer_common\sanitizer_common.cc ..\..\sanitizer_common\sanitizer_flags.cc ..\..\sanitizer_common\sanitizer_stacktrace.cc ..\..\sanitizer_common\sanitizer_libc.cc ..\..\sanitizer_common\sanitizer_printf.cc ..\..\sanitizer_common\sanitizer_suppressions.cc ..\..\sanitizer_common\sanitizer_thread_registry.cc ..\rtl\tsan_platform_windows.cc ..\..\sanitizer_common\sanitizer_win.cc ..\..\sanitizer_common\sanitizer_deadlock_detector1.cc > gotsan.cc
+type tsan_go.cc ..\rtl\tsan_clock.cc ..\rtl\tsan_flags.cc ..\rtl\tsan_md5.cc ..\rtl\tsan_mutex.cc ..\rtl\tsan_report.cc ..\rtl\tsan_rtl.cc ..\rtl\tsan_rtl_mutex.cc ..\rtl\tsan_rtl_report.cc ..\rtl\tsan_rtl_thread.cc ..\rtl\tsan_stat.cc ..\rtl\tsan_suppressions.cc ..\rtl\tsan_sync.cc ..\rtl\tsan_stack_trace.cc ..\..\sanitizer_common\sanitizer_allocator.cc ..\..\sanitizer_common\sanitizer_common.cc ..\..\sanitizer_common\sanitizer_flags.cc ..\..\sanitizer_common\sanitizer_stacktrace.cc ..\..\sanitizer_common\sanitizer_libc.cc ..\..\sanitizer_common\sanitizer_printf.cc ..\..\sanitizer_common\sanitizer_suppressions.cc ..\..\sanitizer_common\sanitizer_thread_registry.cc ..\rtl\tsan_platform_windows.cc ..\..\sanitizer_common\sanitizer_win.cc ..\..\sanitizer_common\sanitizer_deadlock_detector1.cc ..\..\sanitizer_common\sanitizer_stackdepot.cc ..\..\sanitizer_common\sanitizer_persistent_allocator.cc > gotsan.cc
 
 gcc -c -o race_windows_amd64.syso gotsan.cc -I..\rtl -I..\.. -I..\..\sanitizer_common -I..\..\..\include -m64 -Wall -fno-exceptions -fno-rtti -DTSAN_GO -DSANITIZER_GO -DTSAN_SHADOW_COUNT=4 -Wno-error=attributes -Wno-attributes -Wno-format -DTSAN_DEBUG=0 -O3 -fomit-frame-pointer
 
diff --git a/lib/tsan/go/buildgo.sh b/lib/tsan/go/buildgo.sh
index f9db35f..f8eb081 100755
--- a/lib/tsan/go/buildgo.sh
+++ b/lib/tsan/go/buildgo.sh
@@ -1,4 +1,3 @@
-#!/bin/bash
 set -e
 
 SRCS="
@@ -12,6 +11,7 @@
 	../rtl/tsan_rtl_mutex.cc
 	../rtl/tsan_rtl_report.cc
 	../rtl/tsan_rtl_thread.cc
+	../rtl/tsan_stack_trace.cc
 	../rtl/tsan_stat.cc
 	../rtl/tsan_suppressions.cc
 	../rtl/tsan_sync.cc
@@ -29,7 +29,7 @@
 
 if [ "`uname -a | grep Linux`" != "" ]; then
 	SUFFIX="linux_amd64"
-	OSCFLAGS="-fPIC -ffreestanding -Wno-maybe-uninitialized -Werror"
+	OSCFLAGS="-fPIC -ffreestanding -Wno-maybe-uninitialized -Wno-unused-const-variable -Werror -Wno-unknown-warning-option"
 	OSLDFLAGS="-lpthread -fPIC -fpie"
 	SRCS+="
 		../rtl/tsan_platform_linux.cc
@@ -39,9 +39,21 @@
 		../../sanitizer_common/sanitizer_linux.cc
 		../../sanitizer_common/sanitizer_stoptheworld_linux_libcdep.cc
 	"
+elif [ "`uname -a | grep FreeBSD`" != "" ]; then
+	SUFFIX="freebsd_amd64"
+	OSCFLAGS="-fno-strict-aliasing -fPIC -Werror"
+	OSLDFLAGS="-lpthread -fPIC -fpie"
+	SRCS+="
+		../rtl/tsan_platform_linux.cc
+		../../sanitizer_common/sanitizer_posix.cc
+		../../sanitizer_common/sanitizer_posix_libcdep.cc
+		../../sanitizer_common/sanitizer_procmaps_linux.cc
+		../../sanitizer_common/sanitizer_linux.cc
+		../../sanitizer_common/sanitizer_stoptheworld_linux_libcdep.cc
+	"
 elif [ "`uname -a | grep Darwin`" != "" ]; then
 	SUFFIX="darwin_amd64"
-	OSCFLAGS="-fPIC"
+	OSCFLAGS="-fPIC -Wno-unused-const-variable -Wno-unknown-warning-option"
 	OSLDFLAGS="-lpthread -fPIC -fpie"
 	SRCS+="
 		../rtl/tsan_platform_mac.cc
@@ -52,7 +64,7 @@
 	"
 elif [ "`uname -a | grep MINGW`" != "" ]; then
 	SUFFIX="windows_amd64"
-	OSCFLAGS="-Wno-error=attributes -Wno-attributes"
+	OSCFLAGS="-Wno-error=attributes -Wno-attributes -Wno-unused-const-variable -Wno-unknown-warning-option"
 	OSLDFLAGS=""
 	SRCS+="
 		../rtl/tsan_platform_windows.cc
@@ -72,13 +84,15 @@
 
 FLAGS=" -I../rtl -I../.. -I../../sanitizer_common -I../../../include -m64 -Wall -fno-exceptions -fno-rtti -DTSAN_GO -DSANITIZER_GO -DTSAN_SHADOW_COUNT=4 -DSANITIZER_DEADLOCK_DETECTOR_VERSION=2 $OSCFLAGS"
 if [ "$DEBUG" == "" ]; then
-	FLAGS+=" -DTSAN_DEBUG=0 -O3 -fomit-frame-pointer"
+	FLAGS+=" -DTSAN_DEBUG=0 -O3 -msse3 -fomit-frame-pointer"
 else
 	FLAGS+=" -DTSAN_DEBUG=1 -g"
 fi
 
-echo gcc gotsan.cc -S -o tmp.s $FLAGS $CFLAGS
-gcc gotsan.cc -c -o race_$SUFFIX.syso $FLAGS $CFLAGS
+CC=${CC:-gcc}
 
-gcc test.c race_$SUFFIX.syso -m64 -o test $OSLDFLAGS
+echo $CC gotsan.cc -c -o race_$SUFFIX.syso $FLAGS $CFLAGS
+$CC gotsan.cc -c -o race_$SUFFIX.syso $FLAGS $CFLAGS
+
+$CC test.c race_$SUFFIX.syso -m64 -o test $OSLDFLAGS
 GORACE="exitcode=0 atexit_sleep_ms=0" ./test
diff --git a/lib/tsan/go/tsan_go.cc b/lib/tsan/go/tsan_go.cc
index e7761fe..5e22092 100644
--- a/lib/tsan/go/tsan_go.cc
+++ b/lib/tsan/go/tsan_go.cc
@@ -191,17 +191,17 @@
   AcquireGlobal(thr, 0);
 }
 
-void __tsan_mutex_before_lock(ThreadState *thr, uptr addr, bool write) {
+void __tsan_mutex_before_lock(ThreadState *thr, uptr addr, uptr write) {
 }
 
-void __tsan_mutex_after_lock(ThreadState *thr, uptr addr, bool write) {
+void __tsan_mutex_after_lock(ThreadState *thr, uptr addr, uptr write) {
   if (write)
     MutexLock(thr, 0, addr);
   else
     MutexReadLock(thr, 0, addr);
 }
 
-void __tsan_mutex_before_unlock(ThreadState *thr, uptr addr, bool write) {
+void __tsan_mutex_before_unlock(ThreadState *thr, uptr addr, uptr write) {
   if (write)
     MutexUnlock(thr, 0, addr);
   else
diff --git a/lib/tsan/rtl/Makefile.old b/lib/tsan/rtl/Makefile.old
index 2a71869..79c761c 100644
--- a/lib/tsan/rtl/Makefile.old
+++ b/lib/tsan/rtl/Makefile.old
@@ -1,4 +1,4 @@
-CXXFLAGS = -std=c++11 -fPIE -g -Wall -Werror -fno-builtin -DTSAN_DEBUG=$(DEBUG) -DSANITIZER_DEBUG=$(DEBUG)
+CXXFLAGS = -std=c++11 -fPIE -g -Wall -Werror -fno-builtin -msse3 -DTSAN_DEBUG=$(DEBUG) -DSANITIZER_DEBUG=$(DEBUG)
 CLANG=clang
 ifeq ($(DEBUG), 0)
   CXXFLAGS += -O3
diff --git a/lib/tsan/rtl/tsan_clock.cc b/lib/tsan/rtl/tsan_clock.cc
index d40f40f..e140a3c 100644
--- a/lib/tsan/rtl/tsan_clock.cc
+++ b/lib/tsan/rtl/tsan_clock.cc
@@ -330,6 +330,11 @@
 
 void SyncClock::Reset() {
   clk_.Reset();
+  Zero();
+}
+
+void SyncClock::Zero() {
+  clk_.Resize(0);
   release_store_tid_ = kInvalidTid;
   release_store_reused_ = 0;
   for (uptr i = 0; i < kDirtyTids; i++)
diff --git a/lib/tsan/rtl/tsan_clock.h b/lib/tsan/rtl/tsan_clock.h
index 931fde8..f7ab69a 100644
--- a/lib/tsan/rtl/tsan_clock.h
+++ b/lib/tsan/rtl/tsan_clock.h
@@ -38,6 +38,7 @@
   }
 
   void Reset();
+  void Zero();
 
   void DebugDump(int(*printf)(const char *s, ...));
 
diff --git a/lib/tsan/rtl/tsan_defs.h b/lib/tsan/rtl/tsan_defs.h
index 0ee19e9..969d09f 100644
--- a/lib/tsan/rtl/tsan_defs.h
+++ b/lib/tsan/rtl/tsan_defs.h
@@ -54,6 +54,7 @@
 # endif
 #else
 // Count of shadow values in a shadow cell.
+#define TSAN_SHADOW_COUNT 4
 const uptr kShadowCnt = 4;
 #endif
 
@@ -66,6 +67,13 @@
 // Shadow memory is kShadowMultiplier times larger than user memory.
 const uptr kShadowMultiplier = kShadowSize * kShadowCnt / kShadowCell;
 
+// This many user bytes are mapped onto a single meta shadow cell.
+// Must be less than or equal to the minimal memory allocator alignment.
+const uptr kMetaShadowCell = 8;
+
+// Size of a single meta shadow value (u32).
+const uptr kMetaShadowSize = 4;
+
 #if defined(TSAN_NO_HISTORY) && TSAN_NO_HISTORY
 const bool kCollectHistory = false;
 #else
@@ -167,7 +175,15 @@
 class ReportDesc;
 class RegionAlloc;
 class StackTrace;
-struct MBlock;
+
+// Descriptor of user's memory block.
+struct MBlock {
+  u64  siz;
+  u32  stk;
+  u16  tid;
+};
+
+COMPILER_CHECK(sizeof(MBlock) == 16);
 
 }  // namespace __tsan
 
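To make the new meta shadow constants concrete: with kMetaShadowCell = 8 and
kMetaShadowSize = 4, every 8 user bytes are described by one u32 meta cell,
so the meta shadow needs half the address space of the application range it
covers. A minimal sketch of the arithmetic (illustrative only, not part of
the patch):

    // Meta shadow consumption for a user range: one 4-byte meta value
    // (kMetaShadowSize) per 8-byte user cell (kMetaShadowCell),
    // i.e. a 2:1 user-to-meta ratio.
    unsigned long MetaBytesFor(unsigned long user_bytes) {
      return user_bytes / 8 * 4;  // rounds down to whole cells
    }
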
diff --git a/lib/tsan/rtl/tsan_dense_alloc.h b/lib/tsan/rtl/tsan_dense_alloc.h
new file mode 100644
index 0000000..2c2e75e
--- /dev/null
+++ b/lib/tsan/rtl/tsan_dense_alloc.h
@@ -0,0 +1,136 @@
+//===-- tsan_dense_alloc.h --------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of ThreadSanitizer (TSan), a race detector.
+//
+// A DenseSlabAlloc is a freelist-based allocator of fixed-size objects.
+// DenseSlabAllocCache is a thread-local cache for DenseSlabAlloc.
+// The only difference from traditional slab allocators is that DenseSlabAlloc
+// allocates/frees indices of objects and provides functionality to map
+// the index onto the real pointer. The index is u32, that is, 2 times smaller
+// than uptr (hence the Dense prefix).
+//===----------------------------------------------------------------------===//
+#ifndef TSAN_DENSE_ALLOC_H
+#define TSAN_DENSE_ALLOC_H
+
+#include "sanitizer_common/sanitizer_common.h"
+#include "tsan_defs.h"
+#include "tsan_mutex.h"
+
+namespace __tsan {
+
+class DenseSlabAllocCache {
+  static const uptr kSize = 128;
+  typedef u32 IndexT;
+  uptr pos;
+  IndexT cache[kSize];
+  template<typename T, uptr kL1Size, uptr kL2Size> friend class DenseSlabAlloc;
+};
+
+template<typename T, uptr kL1Size, uptr kL2Size>
+class DenseSlabAlloc {
+ public:
+  typedef DenseSlabAllocCache Cache;
+  typedef typename Cache::IndexT IndexT;
+
+  DenseSlabAlloc() {
+    // Check that kL1Size and kL2Size are sane.
+    CHECK_EQ(kL1Size & (kL1Size - 1), 0);
+    CHECK_EQ(kL2Size & (kL2Size - 1), 0);
+    CHECK_GE(1ull << (sizeof(IndexT) * 8), kL1Size * kL2Size);
+    // Check that it makes sense to use the dense alloc.
+    CHECK_GE(sizeof(T), sizeof(IndexT));
+    internal_memset(map_, 0, sizeof(map_));
+    freelist_ = 0;
+    fillpos_ = 0;
+  }
+
+  ~DenseSlabAlloc() {
+    for (uptr i = 0; i < kL1Size; i++) {
+      if (map_[i] != 0)
+        UnmapOrDie(map_[i], kL2Size * sizeof(T));
+    }
+  }
+
+  IndexT Alloc(Cache *c) {
+    if (c->pos == 0)
+      Refill(c);
+    return c->cache[--c->pos];
+  }
+
+  void Free(Cache *c, IndexT idx) {
+    if (c->pos == Cache::kSize)
+      Drain(c);
+    c->cache[c->pos++] = idx;
+  }
+
+  T *Map(IndexT idx) {
+    DCHECK_NE(idx, 0);
+    DCHECK_LE(idx, kL1Size * kL2Size);
+    return &map_[idx / kL2Size][idx % kL2Size];
+  }
+
+  void FlushCache(Cache *c) {
+    SpinMutexLock lock(&mtx_);
+    while (c->pos) {
+      IndexT idx = c->cache[--c->pos];
+      *(IndexT*)Map(idx) = freelist_;
+      freelist_ = idx;
+    }
+  }
+
+  void InitCache(Cache *c) {
+    c->pos = 0;
+    internal_memset(c->cache, 0, sizeof(c->cache));
+  }
+
+ private:
+  T *map_[kL1Size];
+  SpinMutex mtx_;
+  IndexT freelist_;
+  uptr fillpos_;
+
+  void Refill(Cache *c) {
+    SpinMutexLock lock(&mtx_);
+    if (freelist_ == 0) {
+      if (fillpos_ == kL1Size) {
+        Printf("ThreadSanitizer: DenseSlabAllocator overflow. Dying.\n");
+        Die();
+      }
+      T *batch = (T*)MmapOrDie(kL2Size * sizeof(T), "DenseSlabAllocator");
+      // Reserve 0 as invalid index.
+      IndexT start = fillpos_ == 0 ? 1 : 0;
+      for (IndexT i = start; i < kL2Size; i++) {
+        new(batch + i) T();
+        *(IndexT*)(batch + i) = i + 1 + fillpos_ * kL2Size;
+      }
+      *(IndexT*)(batch + kL2Size - 1) = 0;
+      freelist_ = fillpos_ * kL2Size + start;
+      map_[fillpos_++] = batch;
+    }
+    for (uptr i = 0; i < Cache::kSize / 2 && freelist_ != 0; i++) {
+      IndexT idx = freelist_;
+      c->cache[c->pos++] = idx;
+      freelist_ = *(IndexT*)Map(idx);
+    }
+  }
+
+  void Drain(Cache *c) {
+    SpinMutexLock lock(&mtx_);
+    for (uptr i = 0; i < Cache::kSize / 2; i++) {
+      IndexT idx = c->cache[--c->pos];
+      *(IndexT*)Map(idx) = freelist_;
+      freelist_ = idx;
+    }
+  }
+};
+
+}  // namespace __tsan
+
+#endif  // TSAN_DENSE_ALLOC_H
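
A usage sketch for the new allocator (illustrative only; the template
parameters and the Example function are hypothetical, not taken from this
patch):

    // Objects are addressed by compact u32 indices; Map() translates an
    // index back into a pointer. Index 0 is reserved as invalid.
    typedef DenseSlabAlloc<SyncVar, 1 << 10, 1 << 16> ExampleAlloc;
    void Example(ExampleAlloc *alloc, DenseSlabAllocCache *cache) {
      alloc->InitCache(cache);
      u32 idx = alloc->Alloc(cache);  // refills the cache under mtx_ if empty
      SyncVar *s = alloc->Map(idx);   // index -> pointer
      (void)s;                        // ... use the object ...
      alloc->Free(cache, idx);        // drains the cache under mtx_ if full
      alloc->FlushCache(cache);       // return cached indices to the freelist
    }
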
diff --git a/lib/tsan/rtl/tsan_fd.cc b/lib/tsan/rtl/tsan_fd.cc
index 6c7fc17..68242e0 100644
--- a/lib/tsan/rtl/tsan_fd.cc
+++ b/lib/tsan/rtl/tsan_fd.cc
@@ -47,8 +47,8 @@
   return fd < 0 || fd >= kTableSize;
 }
 
-static FdSync *allocsync() {
-  FdSync *s = (FdSync*)internal_alloc(MBlockFD, sizeof(FdSync));
+static FdSync *allocsync(ThreadState *thr, uptr pc) {
+  FdSync *s = (FdSync*)user_alloc(thr, pc, sizeof(FdSync));
   atomic_store(&s->rc, 1, memory_order_relaxed);
   return s;
 }
@@ -65,10 +65,7 @@
       CHECK_NE(s, &fdctx.globsync);
       CHECK_NE(s, &fdctx.filesync);
       CHECK_NE(s, &fdctx.socksync);
-      SyncVar *v = ctx->synctab.GetAndRemove(thr, pc, (uptr)s);
-      if (v)
-        DestroyAndFree(v);
-      internal_free(s);
+      user_free(thr, pc, s);
     }
   }
 }
@@ -219,7 +216,7 @@
 
 void FdPipeCreate(ThreadState *thr, uptr pc, int rfd, int wfd) {
   DPrintf("#%d: FdCreatePipe(%d, %d)\n", thr->tid, rfd, wfd);
-  FdSync *s = allocsync();
+  FdSync *s = allocsync(thr, pc);
   init(thr, pc, rfd, ref(s));
   init(thr, pc, wfd, ref(s));
   unref(thr, pc, s);
@@ -229,7 +226,7 @@
   DPrintf("#%d: FdEventCreate(%d)\n", thr->tid, fd);
   if (bogusfd(fd))
     return;
-  init(thr, pc, fd, allocsync());
+  init(thr, pc, fd, allocsync(thr, pc));
 }
 
 void FdSignalCreate(ThreadState *thr, uptr pc, int fd) {
@@ -250,7 +247,7 @@
   DPrintf("#%d: FdPollCreate(%d)\n", thr->tid, fd);
   if (bogusfd(fd))
     return;
-  init(thr, pc, fd, allocsync());
+  init(thr, pc, fd, allocsync(thr, pc));
 }
 
 void FdSocketCreate(ThreadState *thr, uptr pc, int fd) {
diff --git a/lib/tsan/rtl/tsan_flags.cc b/lib/tsan/rtl/tsan_flags.cc
index 1431200..123df49 100644
--- a/lib/tsan/rtl/tsan_flags.cc
+++ b/lib/tsan/rtl/tsan_flags.cc
@@ -102,6 +102,7 @@
   SetCommonFlagsDefaults(f);
   // Override some common flags defaults.
   f->allow_addr2line = true;
+  f->detect_deadlocks = true;
 
   // Let a frontend override.
   ParseFlags(f, __tsan_default_options());
diff --git a/lib/tsan/rtl/tsan_interceptors.cc b/lib/tsan/rtl/tsan_interceptors.cc
index 82c4da8..100834e 100644
--- a/lib/tsan/rtl/tsan_interceptors.cc
+++ b/lib/tsan/rtl/tsan_interceptors.cc
@@ -47,7 +47,7 @@
 extern "C" int pthread_attr_setstacksize(void *attr, uptr stacksize);
 extern "C" int pthread_key_create(unsigned *key, void (*destructor)(void* v));
 extern "C" int pthread_setspecific(unsigned key, const void *v);
-extern "C" int pthread_mutexattr_gettype(void *a, int *type);
+DECLARE_REAL(int, pthread_mutexattr_gettype, void *, void *)
 extern "C" int pthread_yield();
 extern "C" int pthread_sigmask(int how, const __sanitizer_sigset_t *set,
                                __sanitizer_sigset_t *oldset);
@@ -191,6 +191,7 @@
   if (!thr_->ignore_interceptors) {
     ProcessPendingSignals(thr_);
     FuncExit(thr_);
+    CheckNoLocks(thr_);
   }
 }
 
@@ -519,7 +520,7 @@
 
 TSAN_INTERCEPTOR(uptr, malloc_usable_size, void *p) {
   SCOPED_INTERCEPTOR_RAW(malloc_usable_size, p);
-  return user_alloc_usable_size(thr, pc, p);
+  return user_alloc_usable_size(p);
 }
 
 #define OPERATOR_NEW_BODY(mangled_name) \
@@ -737,6 +738,11 @@
   return user_alloc(thr, pc, sz, align);
 }
 
+TSAN_INTERCEPTOR(void*, aligned_alloc, uptr align, uptr sz) {
+  SCOPED_INTERCEPTOR_RAW(memalign, align, sz);
+  return user_alloc(thr, pc, sz, align);
+}
+
 TSAN_INTERCEPTOR(void*, valloc, uptr sz) {
   SCOPED_INTERCEPTOR_RAW(valloc, sz);
   return user_alloc(thr, pc, sz, GetPageSizeCached());
@@ -1028,7 +1034,7 @@
     bool recursive = false;
     if (a) {
       int type = 0;
-      if (pthread_mutexattr_gettype(a, &type) == 0)
+      if (REAL(pthread_mutexattr_gettype)(a, &type) == 0)
         recursive = (type == PTHREAD_MUTEX_RECURSIVE
             || type == PTHREAD_MUTEX_RECURSIVE_NP);
     }
@@ -1139,7 +1145,7 @@
   SCOPED_TSAN_INTERCEPTOR(pthread_rwlock_tryrdlock, m);
   int res = REAL(pthread_rwlock_tryrdlock)(m);
   if (res == 0) {
-    MutexLock(thr, pc, (uptr)m, /*rec=*/1, /*try_lock=*/true);
+    MutexReadLock(thr, pc, (uptr)m, /*try_lock=*/true);
   }
   return res;
 }
@@ -1704,8 +1710,8 @@
     ThreadRegistryLock l(ctx->thread_registry);
     ScopedReport rep(ReportTypeErrnoInSignal);
     if (!IsFiredSuppression(ctx, rep, stack)) {
-      rep.AddStack(&stack);
-      OutputReport(ctx, rep, rep.GetReport()->stacks[0]);
+      rep.AddStack(&stack, true);
+      OutputReport(thr, rep);
     }
   }
   errno = saved_errno;
diff --git a/lib/tsan/rtl/tsan_interface_ann.cc b/lib/tsan/rtl/tsan_interface_ann.cc
index 5632323..a1725cb 100644
--- a/lib/tsan/rtl/tsan_interface_ann.cc
+++ b/lib/tsan/rtl/tsan_interface_ann.cc
@@ -40,6 +40,7 @@
 
   ~ScopedAnnotation() {
     FuncExit(thr_);
+    CheckNoLocks(thr_);
   }
  private:
   ThreadState *const thr_;
diff --git a/lib/tsan/rtl/tsan_interface_atomic.cc b/lib/tsan/rtl/tsan_interface_atomic.cc
index 2de0c4f..7fbc9c6 100644
--- a/lib/tsan/rtl/tsan_interface_atomic.cc
+++ b/lib/tsan/rtl/tsan_interface_atomic.cc
@@ -291,7 +291,7 @@
     MemoryReadAtomic(thr, pc, (uptr)a, SizeLog<T>());
     return NoTsanAtomicLoad(a, mo);
   }
-  SyncVar *s = ctx->synctab.GetOrCreateAndLock(thr, pc, (uptr)a, false);
+  SyncVar *s = ctx->metamap.GetOrCreateAndLock(thr, pc, (uptr)a, false);
   AcquireImpl(thr, pc, &s->clock);
   T v = NoTsanAtomicLoad(a, mo);
   s->mtx.ReadUnlock();
@@ -325,7 +325,7 @@
     return;
   }
   __sync_synchronize();
-  SyncVar *s = ctx->synctab.GetOrCreateAndLock(thr, pc, (uptr)a, true);
+  SyncVar *s = ctx->metamap.GetOrCreateAndLock(thr, pc, (uptr)a, true);
   thr->fast_state.IncrementEpoch();
   // Can't increment epoch w/o writing to the trace as well.
   TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0);
@@ -339,7 +339,7 @@
   MemoryWriteAtomic(thr, pc, (uptr)a, SizeLog<T>());
   SyncVar *s = 0;
   if (mo != mo_relaxed) {
-    s = ctx->synctab.GetOrCreateAndLock(thr, pc, (uptr)a, true);
+    s = ctx->metamap.GetOrCreateAndLock(thr, pc, (uptr)a, true);
     thr->fast_state.IncrementEpoch();
     // Can't increment epoch w/o writing to the trace as well.
     TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0);
@@ -463,7 +463,7 @@
   SyncVar *s = 0;
   bool write_lock = mo != mo_acquire && mo != mo_consume;
   if (mo != mo_relaxed) {
-    s = ctx->synctab.GetOrCreateAndLock(thr, pc, (uptr)a, write_lock);
+    s = ctx->metamap.GetOrCreateAndLock(thr, pc, (uptr)a, write_lock);
     thr->fast_state.IncrementEpoch();
     // Can't increment epoch w/o writing to the trace as well.
     TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0);
diff --git a/lib/tsan/rtl/tsan_interface_java.cc b/lib/tsan/rtl/tsan_interface_java.cc
index d0c003e..5dfb476 100644
--- a/lib/tsan/rtl/tsan_interface_java.cc
+++ b/lib/tsan/rtl/tsan_interface_java.cc
@@ -22,54 +22,17 @@
 
 using namespace __tsan;  // NOLINT
 
+const jptr kHeapAlignment = 8;
+
 namespace __tsan {
 
-const uptr kHeapShadow = 0x300000000000ull;
-const uptr kHeapAlignment = 8;
-
-struct BlockDesc {
-  bool begin;
-  Mutex mtx;
-  SyncVar *head;
-
-  BlockDesc()
-      : mtx(MutexTypeJavaMBlock, StatMtxJavaMBlock)
-      , head() {
-    CHECK_EQ(begin, false);
-    begin = true;
-  }
-
-  ~BlockDesc() {
-    CHECK_EQ(begin, true);
-    begin = false;
-    ThreadState *thr = cur_thread();
-    SyncVar *s = head;
-    while (s) {
-      SyncVar *s1 = s->next;
-      StatInc(thr, StatSyncDestroyed);
-      s->mtx.Lock();
-      s->mtx.Unlock();
-      thr->mset.Remove(s->GetId());
-      DestroyAndFree(s);
-      s = s1;
-    }
-  }
-};
-
 struct JavaContext {
   const uptr heap_begin;
   const uptr heap_size;
-  BlockDesc *heap_shadow;
 
   JavaContext(jptr heap_begin, jptr heap_size)
       : heap_begin(heap_begin)
       , heap_size(heap_size) {
-    uptr size = heap_size / kHeapAlignment * sizeof(BlockDesc);
-    heap_shadow = (BlockDesc*)MmapFixedNoReserve(kHeapShadow, size);
-    if ((uptr)heap_shadow != kHeapShadow) {
-      Printf("ThreadSanitizer: failed to mmap Java heap shadow\n");
-      Die();
-    }
   }
 };
 
@@ -93,63 +56,6 @@
 static u64 jctx_buf[sizeof(JavaContext) / sizeof(u64) + 1];
 static JavaContext *jctx;
 
-static BlockDesc *getblock(uptr addr) {
-  uptr i = (addr - jctx->heap_begin) / kHeapAlignment;
-  return &jctx->heap_shadow[i];
-}
-
-static uptr USED getmem(BlockDesc *b) {
-  uptr i = b - jctx->heap_shadow;
-  uptr p = jctx->heap_begin + i * kHeapAlignment;
-  CHECK_GE(p, jctx->heap_begin);
-  CHECK_LT(p, jctx->heap_begin + jctx->heap_size);
-  return p;
-}
-
-static BlockDesc *getblockbegin(uptr addr) {
-  for (BlockDesc *b = getblock(addr);; b--) {
-    CHECK_GE(b, jctx->heap_shadow);
-    if (b->begin)
-      return b;
-  }
-  return 0;
-}
-
-SyncVar* GetJavaSync(ThreadState *thr, uptr pc, uptr addr,
-                     bool write_lock, bool create) {
-  if (jctx == 0 || addr < jctx->heap_begin
-      || addr >= jctx->heap_begin + jctx->heap_size)
-    return 0;
-  BlockDesc *b = getblockbegin(addr);
-  DPrintf("#%d: GetJavaSync %p->%p\n", thr->tid, addr, b);
-  Lock l(&b->mtx);
-  SyncVar *s = b->head;
-  for (; s; s = s->next) {
-    if (s->addr == addr) {
-      DPrintf("#%d: found existing sync for %p\n", thr->tid, addr);
-      break;
-    }
-  }
-  if (s == 0 && create) {
-    DPrintf("#%d: creating new sync for %p\n", thr->tid, addr);
-    s = ctx->synctab.Create(thr, pc, addr);
-    s->next = b->head;
-    b->head = s;
-  }
-  if (s) {
-    if (write_lock)
-      s->mtx.Lock();
-    else
-      s->mtx.ReadLock();
-  }
-  return s;
-}
-
-SyncVar* GetAndRemoveJavaSync(ThreadState *thr, uptr pc, uptr addr) {
-  // We do not destroy Java mutexes other than in __tsan_java_free().
-  return 0;
-}
-
 }  // namespace __tsan
 
 #define SCOPED_JAVA_FUNC(func) \
@@ -192,8 +98,7 @@
   CHECK_GE(ptr, jctx->heap_begin);
   CHECK_LE(ptr + size, jctx->heap_begin + jctx->heap_size);
 
-  BlockDesc *b = getblock(ptr);
-  new(b) BlockDesc();
+  OnUserAlloc(thr, pc, ptr, size, false);
 }
 
 void __tsan_java_free(jptr ptr, jptr size) {
@@ -206,12 +111,7 @@
   CHECK_GE(ptr, jctx->heap_begin);
   CHECK_LE(ptr + size, jctx->heap_begin + jctx->heap_size);
 
-  BlockDesc *beg = getblock(ptr);
-  BlockDesc *end = getblock(ptr + size);
-  for (BlockDesc *b = beg; b != end; b++) {
-    if (b->begin)
-      b->~BlockDesc();
-  }
+  ctx->metamap.FreeRange(thr, pc, ptr, size);
 }
 
 void __tsan_java_move(jptr src, jptr dst, jptr size) {
@@ -226,40 +126,34 @@
   CHECK_LE(src + size, jctx->heap_begin + jctx->heap_size);
   CHECK_GE(dst, jctx->heap_begin);
   CHECK_LE(dst + size, jctx->heap_begin + jctx->heap_size);
-  CHECK(dst >= src + size || src >= dst + size);
+  CHECK_NE(dst, src);
+  CHECK_NE(size, 0);
 
   // Assuming it's not running concurrently with threads that do
   // memory accesses and mutex operations (stop-the-world phase).
-  {  // NOLINT
-    BlockDesc *s = getblock(src);
-    BlockDesc *d = getblock(dst);
-    BlockDesc *send = getblock(src + size);
-    for (; s != send; s++, d++) {
-      CHECK_EQ(d->begin, false);
-      if (s->begin) {
-        DPrintf("#%d: moving block %p->%p\n", thr->tid, getmem(s), getmem(d));
-        new(d) BlockDesc;
-        d->head = s->head;
-        for (SyncVar *sync = d->head; sync; sync = sync->next) {
-          uptr newaddr = sync->addr - src + dst;
-          DPrintf("#%d: moving sync %p->%p\n", thr->tid, sync->addr, newaddr);
-          sync->addr = newaddr;
-        }
-        s->head = 0;
-        s->~BlockDesc();
-      }
-    }
-  }
+  ctx->metamap.MoveMemory(src, dst, size);
 
-  {  // NOLINT
-    u64 *s = (u64*)MemToShadow(src);
-    u64 *d = (u64*)MemToShadow(dst);
-    u64 *send = (u64*)MemToShadow(src + size);
-    for (; s != send; s++, d++) {
-      *d = *s;
-      *s = 0;
-    }
+  // Move shadow.
+  u64 *s = (u64*)MemToShadow(src);
+  u64 *d = (u64*)MemToShadow(dst);
+  u64 *send = (u64*)MemToShadow(src + size);
+  uptr inc = 1;
+  if (dst > src) {
+    s = (u64*)MemToShadow(src + size) - 1;
+    d = (u64*)MemToShadow(dst + size) - 1;
+    send = (u64*)MemToShadow(src) - 1;
+    inc = -1;
   }
+  for (; s != send; s += inc, d += inc) {
+    *d = *s;
+    *s = 0;
+  }
+}
+
+void __tsan_java_finalize() {
+  SCOPED_JAVA_FUNC(__tsan_java_finalize);
+  DPrintf("#%d: java_mutex_finalize()\n", thr->tid);
+  AcquireGlobal(thr, 0);
 }
 
 void __tsan_java_mutex_lock(jptr addr) {
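
The shadow move above now picks its copy direction like memmove: because the
interface header below allows src/dst ranges to overlap, an upward move
(dst > src) must run back-to-front. A minimal sketch of the same pattern
(illustrative only, not part of the patch):

    // Overlap-safe cell move: copy away from the overlapping end and
    // zero the source, mirroring the shadow-move loop above.
    void MoveCells(unsigned long long *src, unsigned long long *dst,
                   unsigned long n) {
      if (dst < src) {
        for (unsigned long i = 0; i < n; i++) {
          dst[i] = src[i];
          src[i] = 0;
        }
      } else {
        for (unsigned long i = n; i > 0; i--) {
          dst[i - 1] = src[i - 1];
          src[i - 1] = 0;
        }
      }
    }
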
diff --git a/lib/tsan/rtl/tsan_interface_java.h b/lib/tsan/rtl/tsan_interface_java.h
index 9ac78e0..1f793df 100644
--- a/lib/tsan/rtl/tsan_interface_java.h
+++ b/lib/tsan/rtl/tsan_interface_java.h
@@ -50,8 +50,13 @@
 void __tsan_java_free(jptr ptr, jptr size) INTERFACE_ATTRIBUTE;
 // Callback for memory move by GC.
 // Can be aggregated for several objects (preferably).
-// The ranges must not overlap.
+// The ranges can overlap.
 void __tsan_java_move(jptr src, jptr dst, jptr size) INTERFACE_ATTRIBUTE;
+// This function must be called on the finalizer thread
+// before executing a batch of finalizers.
+// It ensures necessary synchronization between
+// java object creation and finalization.
+void __tsan_java_finalize() INTERFACE_ATTRIBUTE;
 
 // Mutex lock.
 // Addr is any unique address associated with the mutex.
diff --git a/lib/tsan/rtl/tsan_mman.cc b/lib/tsan/rtl/tsan_mman.cc
index 8941eb1..8542a8f 100644
--- a/lib/tsan/rtl/tsan_mman.cc
+++ b/lib/tsan/rtl/tsan_mman.cc
@@ -10,6 +10,7 @@
 // This file is a part of ThreadSanitizer (TSan), a race detector.
 //
 //===----------------------------------------------------------------------===//
+#include "sanitizer_common/sanitizer_allocator_interface.h"
 #include "sanitizer_common/sanitizer_common.h"
 #include "sanitizer_common/sanitizer_placement_new.h"
 #include "tsan_mman.h"
@@ -22,39 +23,20 @@
   (void)ptr;
   (void)size;
 }
+extern "C" void WEAK __sanitizer_malloc_hook(void *ptr, uptr size) {
+  (void)ptr;
+  (void)size;
+}
 
 extern "C" void WEAK __tsan_free_hook(void *ptr) {
   (void)ptr;
 }
+extern "C" void WEAK __sanitizer_free_hook(void *ptr) {
+  (void)ptr;
+}
 
 namespace __tsan {
 
-COMPILER_CHECK(sizeof(MBlock) == 16);
-
-void MBlock::Lock() {
-  atomic_uintptr_t *a = reinterpret_cast<atomic_uintptr_t*>(this);
-  uptr v = atomic_load(a, memory_order_relaxed);
-  for (int iter = 0;; iter++) {
-    if (v & 1) {
-      if (iter < 10)
-        proc_yield(20);
-      else
-        internal_sched_yield();
-      v = atomic_load(a, memory_order_relaxed);
-      continue;
-    }
-    if (atomic_compare_exchange_weak(a, &v, v | 1, memory_order_acquire))
-      break;
-  }
-}
-
-void MBlock::Unlock() {
-  atomic_uintptr_t *a = reinterpret_cast<atomic_uintptr_t*>(this);
-  uptr v = atomic_load(a, memory_order_relaxed);
-  DCHECK(v & 1);
-  atomic_store(a, v & ~1, memory_order_relaxed);
-}
-
 struct MapUnmapCallback {
   void OnMap(uptr p, uptr size) const { }
   void OnUnmap(uptr p, uptr size) const {
@@ -95,8 +77,8 @@
   ThreadRegistryLock l(ctx->thread_registry);
   ScopedReport rep(ReportTypeSignalUnsafe);
   if (!IsFiredSuppression(ctx, rep, stack)) {
-    rep.AddStack(&stack);
-    OutputReport(ctx, rep, rep.GetReport()->stacks[0]);
+    rep.AddStack(&stack, true);
+    OutputReport(thr, rep);
   }
 }
 
@@ -106,43 +88,36 @@
   void *p = allocator()->Allocate(&thr->alloc_cache, sz, align);
   if (p == 0)
     return 0;
-  MBlock *b = new(allocator()->GetMetaData(p)) MBlock;
-  b->Init(sz, thr->tid, CurrentStackId(thr, pc));
-  if (ctx && ctx->initialized) {
-    if (thr->ignore_reads_and_writes == 0)
-      MemoryRangeImitateWrite(thr, pc, (uptr)p, sz);
-    else
-      MemoryResetRange(thr, pc, (uptr)p, sz);
-  }
-  DPrintf("#%d: alloc(%zu) = %p\n", thr->tid, sz, p);
+  if (ctx && ctx->initialized)
+    OnUserAlloc(thr, pc, (uptr)p, sz, true);
   SignalUnsafeCall(thr, pc);
   return p;
 }
 
 void user_free(ThreadState *thr, uptr pc, void *p) {
-  CHECK_NE(p, (void*)0);
-  DPrintf("#%d: free(%p)\n", thr->tid, p);
-  MBlock *b = (MBlock*)allocator()->GetMetaData(p);
-  if (b->ListHead()) {
-    MBlock::ScopedLock l(b);
-    for (SyncVar *s = b->ListHead(); s;) {
-      SyncVar *res = s;
-      s = s->next;
-      StatInc(thr, StatSyncDestroyed);
-      res->mtx.Lock();
-      res->mtx.Unlock();
-      DestroyAndFree(res);
-    }
-    b->ListReset();
-  }
-  if (ctx && ctx->initialized) {
-    if (thr->ignore_reads_and_writes == 0)
-      MemoryRangeFreed(thr, pc, (uptr)p, b->Size());
-  }
+  if (ctx && ctx->initialized)
+    OnUserFree(thr, pc, (uptr)p, true);
   allocator()->Deallocate(&thr->alloc_cache, p);
   SignalUnsafeCall(thr, pc);
 }
 
+void OnUserAlloc(ThreadState *thr, uptr pc, uptr p, uptr sz, bool write) {
+  DPrintf("#%d: alloc(%zu) = %p\n", thr->tid, sz, p);
+  ctx->metamap.AllocBlock(thr, pc, p, sz);
+  if (write && thr->ignore_reads_and_writes == 0)
+    MemoryRangeImitateWrite(thr, pc, (uptr)p, sz);
+  else
+    MemoryResetRange(thr, pc, (uptr)p, sz);
+}
+
+void OnUserFree(ThreadState *thr, uptr pc, uptr p, bool write) {
+  CHECK_NE(p, (void*)0);
+  uptr sz = ctx->metamap.FreeBlock(thr, pc, p);
+  DPrintf("#%d: free(%p, %zu)\n", thr->tid, p, sz);
+  if (write && thr->ignore_reads_and_writes == 0)
+    MemoryRangeFreed(thr, pc, (uptr)p, sz);
+}
+
 void *user_realloc(ThreadState *thr, uptr pc, void *p, uptr sz) {
   void *p2 = 0;
   // FIXME: Handle "shrinking" more efficiently,
@@ -152,9 +127,8 @@
     if (p2 == 0)
       return 0;
     if (p) {
-      MBlock *b = user_mblock(thr, p);
-      CHECK_NE(b, 0);
-      internal_memcpy(p2, p, min(b->Size(), sz));
+      uptr oldsz = user_alloc_usable_size(p);
+      internal_memcpy(p2, p, min(oldsz, sz));
     }
   }
   if (p)
@@ -162,20 +136,11 @@
   return p2;
 }
 
-uptr user_alloc_usable_size(ThreadState *thr, uptr pc, void *p) {
+uptr user_alloc_usable_size(const void *p) {
   if (p == 0)
     return 0;
-  MBlock *b = (MBlock*)allocator()->GetMetaData(p);
-  return b ? b->Size() : 0;
-}
-
-MBlock *user_mblock(ThreadState *thr, void *p) {
-  CHECK_NE(p, 0);
-  Allocator *a = allocator();
-  void *b = a->GetBlockBegin(p);
-  if (b == 0)
-    return 0;
-  return (MBlock*)a->GetMetaData(b);
+  MBlock *b = ctx->metamap.GetBlock((uptr)p);
+  return b ? b->siz : 0;
 }
 
 void invoke_malloc_hook(void *ptr, uptr size) {
@@ -183,6 +148,7 @@
   if (ctx == 0 || !ctx->initialized || thr->ignore_interceptors)
     return;
   __tsan_malloc_hook(ptr, size);
+  __sanitizer_malloc_hook(ptr, size);
 }
 
 void invoke_free_hook(void *ptr) {
@@ -190,6 +156,7 @@
   if (ctx == 0 || !ctx->initialized || thr->ignore_interceptors)
     return;
   __tsan_free_hook(ptr);
+  __sanitizer_free_hook(ptr);
 }
 
 void *internal_alloc(MBlockType typ, uptr sz) {
@@ -216,51 +183,63 @@
 using namespace __tsan;
 
 extern "C" {
+uptr __sanitizer_get_current_allocated_bytes() {
+  uptr stats[AllocatorStatCount];
+  allocator()->GetStats(stats);
+  return stats[AllocatorStatAllocated];
+}
 uptr __tsan_get_current_allocated_bytes() {
-  u64 stats[AllocatorStatCount];
-  allocator()->GetStats(stats);
-  u64 m = stats[AllocatorStatMalloced];
-  u64 f = stats[AllocatorStatFreed];
-  return m >= f ? m - f : 1;
+  return __sanitizer_get_current_allocated_bytes();
 }
 
+uptr __sanitizer_get_heap_size() {
+  uptr stats[AllocatorStatCount];
+  allocator()->GetStats(stats);
+  return stats[AllocatorStatMapped];
+}
 uptr __tsan_get_heap_size() {
-  u64 stats[AllocatorStatCount];
-  allocator()->GetStats(stats);
-  u64 m = stats[AllocatorStatMmapped];
-  u64 f = stats[AllocatorStatUnmapped];
-  return m >= f ? m - f : 1;
+  return __sanitizer_get_heap_size();
 }
 
+uptr __sanitizer_get_free_bytes() {
+  return 1;
+}
 uptr __tsan_get_free_bytes() {
-  return 1;
+  return __sanitizer_get_free_bytes();
 }
 
+uptr __sanitizer_get_unmapped_bytes() {
+  return 1;
+}
 uptr __tsan_get_unmapped_bytes() {
-  return 1;
+  return __sanitizer_get_unmapped_bytes();
 }
 
-uptr __tsan_get_estimated_allocated_size(uptr size) {
+uptr __sanitizer_get_estimated_allocated_size(uptr size) {
   return size;
 }
-
-bool __tsan_get_ownership(void *p) {
-  return allocator()->GetBlockBegin(p) != 0;
+uptr __tsan_get_estimated_allocated_size(uptr size) {
+  return __sanitizer_get_estimated_allocated_size(size);
 }
 
-uptr __tsan_get_allocated_size(void *p) {
-  if (p == 0)
-    return 0;
-  p = allocator()->GetBlockBegin(p);
-  if (p == 0)
-    return 0;
-  MBlock *b = (MBlock*)allocator()->GetMetaData(p);
-  return b->Size();
+int __sanitizer_get_ownership(const void *p) {
+  return allocator()->GetBlockBegin(p) != 0;
+}
+int __tsan_get_ownership(const void *p) {
+  return __sanitizer_get_ownership(p);
+}
+
+uptr __sanitizer_get_allocated_size(const void *p) {
+  return user_alloc_usable_size(p);
+}
+uptr __tsan_get_allocated_size(const void *p) {
+  return __sanitizer_get_allocated_size(p);
 }
 
 void __tsan_on_thread_idle() {
   ThreadState *thr = cur_thread();
   allocator()->SwallowCache(&thr->alloc_cache);
   internal_allocator()->SwallowCache(&thr->internal_alloc_cache);
+  ctx->metamap.OnThreadIdle(thr);
 }
 }  // extern "C"
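
The new __sanitizer_* hooks follow the common sanitizer allocator interface:
the weak definitions above are defaults that user code may override with
strong definitions. A minimal user-side sketch (illustrative only, not part
of the patch; uptr is spelled out as unsigned long to keep it self-contained):

    // Strong definitions in user code take precedence over the weak
    // defaults above and run on every tracked allocation/deallocation.
    extern "C" void __sanitizer_malloc_hook(void *ptr, unsigned long size) {
      // e.g. record (ptr, size) for allocation diagnostics
    }
    extern "C" void __sanitizer_free_hook(void *ptr) {
      // e.g. drop ptr from the record
    }
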
diff --git a/lib/tsan/rtl/tsan_mman.h b/lib/tsan/rtl/tsan_mman.h
index 19d5554..4f87ad6 100644
--- a/lib/tsan/rtl/tsan_mman.h
+++ b/lib/tsan/rtl/tsan_mman.h
@@ -31,10 +31,7 @@
 void user_free(ThreadState *thr, uptr pc, void *p);
 void *user_realloc(ThreadState *thr, uptr pc, void *p, uptr sz);
 void *user_alloc_aligned(ThreadState *thr, uptr pc, uptr sz, uptr align);
-uptr user_alloc_usable_size(ThreadState *thr, uptr pc, void *p);
-// Given the pointer p into a valid allocated block,
-// returns the descriptor of the block.
-MBlock *user_mblock(ThreadState *thr, void *p);
+uptr user_alloc_usable_size(const void *p);
 
 // Invoking malloc/free hooks that may be installed by the user.
 void invoke_malloc_hook(void *ptr, uptr size);
@@ -62,7 +59,6 @@
   MBlockSuppression,
   MBlockExpectRace,
   MBlockSignal,
-  MBlockFD,
   MBlockJmpBuf,
 
   // This must be the last.
diff --git a/lib/tsan/rtl/tsan_mutex.cc b/lib/tsan/rtl/tsan_mutex.cc
index 2c16208..55d6e18 100644
--- a/lib/tsan/rtl/tsan_mutex.cc
+++ b/lib/tsan/rtl/tsan_mutex.cc
@@ -31,13 +31,13 @@
   /*0  MutexTypeInvalid*/     {},
   /*1  MutexTypeTrace*/       {MutexTypeLeaf},
   /*2  MutexTypeThreads*/     {MutexTypeReport},
-  /*3  MutexTypeReport*/      {MutexTypeSyncTab, MutexTypeSyncVar,
+  /*3  MutexTypeReport*/      {MutexTypeSyncVar,
                                MutexTypeMBlock, MutexTypeJavaMBlock},
   /*4  MutexTypeSyncVar*/     {MutexTypeDDetector},
-  /*5  MutexTypeSyncTab*/     {MutexTypeSyncVar},
+  /*5  MutexTypeSyncTab*/     {},  // unused
   /*6  MutexTypeSlab*/        {MutexTypeLeaf},
   /*7  MutexTypeAnnotations*/ {},
-  /*8  MutexTypeAtExit*/      {MutexTypeSyncTab},
+  /*8  MutexTypeAtExit*/      {MutexTypeSyncVar},
   /*9  MutexTypeMBlock*/      {MutexTypeSyncVar},
   /*10 MutexTypeJavaMBlock*/  {MutexTypeSyncVar},
   /*11 MutexTypeDDetector*/   {},
@@ -161,8 +161,20 @@
   CHECK(locked_[t]);
   locked_[t] = 0;
 }
+
+void InternalDeadlockDetector::CheckNoLocks() {
+  for (int i = 0; i != MutexTypeCount; i++) {
+    CHECK_EQ(locked_[i], 0);
+  }
+}
 #endif
 
+void CheckNoLocks(ThreadState *thr) {
+#if TSAN_DEBUG && !TSAN_GO
+  thr->internal_deadlock_detector.CheckNoLocks();
+#endif
+}
+
 const uptr kUnlocked = 0;
 const uptr kWriteLock = 1;
 const uptr kReadLock = 2;
@@ -222,7 +234,7 @@
       cmp = kUnlocked;
       if (atomic_compare_exchange_weak(&state_, &cmp, kWriteLock,
                                        memory_order_acquire)) {
-#if TSAN_COLLECT_STATS
+#if TSAN_COLLECT_STATS && !TSAN_GO
         StatInc(cur_thread(), stat_type_, backoff.Contention());
 #endif
         return;
@@ -250,7 +262,7 @@
   for (Backoff backoff; backoff.Do();) {
     prev = atomic_load(&state_, memory_order_acquire);
     if ((prev & kWriteLock) == 0) {
-#if TSAN_COLLECT_STATS
+#if TSAN_COLLECT_STATS && !TSAN_GO
       StatInc(cur_thread(), stat_type_, backoff.Contention());
 #endif
       return;
diff --git a/lib/tsan/rtl/tsan_mutex.h b/lib/tsan/rtl/tsan_mutex.h
index 12580fa..7bb1c48 100644
--- a/lib/tsan/rtl/tsan_mutex.h
+++ b/lib/tsan/rtl/tsan_mutex.h
@@ -71,6 +71,7 @@
   InternalDeadlockDetector();
   void Lock(MutexType t);
   void Unlock(MutexType t);
+  void CheckNoLocks();
  private:
   u64 seq_;
   u64 locked_[MutexTypeCount];
@@ -78,6 +79,10 @@
 
 void InitializeMutex();
 
+// Checks that the current thread does not hold any runtime locks
+// (e.g. when returning from an interceptor).
+void CheckNoLocks(ThreadState *thr);
+
 }  // namespace __tsan
 
 #endif  // TSAN_MUTEX_H
diff --git a/lib/tsan/rtl/tsan_platform.h b/lib/tsan/rtl/tsan_platform.h
index 7abe5f0..7d8d977 100644
--- a/lib/tsan/rtl/tsan_platform.h
+++ b/lib/tsan/rtl/tsan_platform.h
@@ -16,7 +16,9 @@
 C++ linux memory layout:
 0000 0000 0000 - 03c0 0000 0000: protected
 03c0 0000 0000 - 1000 0000 0000: shadow
-1000 0000 0000 - 6000 0000 0000: protected
+1000 0000 0000 - 3000 0000 0000: protected
+3000 0000 0000 - 4000 0000 0000: metainfo (memory blocks and sync objects)
+4000 0000 0000 - 6000 0000 0000: protected
 6000 0000 0000 - 6200 0000 0000: traces
 6200 0000 0000 - 7d00 0000 0000: -
 7d00 0000 0000 - 7e00 0000 0000: heap
@@ -27,7 +29,9 @@
 0400 0000 0000 - 1000 0000 0000: shadow
 1000 0000 0000 - 2900 0000 0000: protected
 2900 0000 0000 - 2c00 0000 0000: modules
-2c00 0000 0000 - 6000 0000 0000: -
+2c00 0000 0000 - 3000 0000 0000: -
+3000 0000 0000 - 4000 0000 0000: metainfo (memory blocks and sync objects)
+4000 0000 0000 - 6000 0000 0000: -
 6000 0000 0000 - 6200 0000 0000: traces
 6200 0000 0000 - 7d00 0000 0000: -
 7d00 0000 0000 - 7e00 0000 0000: heap
@@ -40,7 +44,9 @@
 00c0 0000 0000 - 00e0 0000 0000: heap
 00e0 0000 0000 - 1000 0000 0000: -
 1000 0000 0000 - 1380 0000 0000: shadow
-1460 0000 0000 - 6000 0000 0000: -
+1460 0000 0000 - 2000 0000 0000: -
+3000 0000 0000 - 4000 0000 0000: metainfo (memory blocks and sync objects)
+4000 0000 0000 - 6000 0000 0000: -
 6000 0000 0000 - 6200 0000 0000: traces
 6200 0000 0000 - 7fff ffff ffff: -
 
@@ -51,7 +57,8 @@
 00e0 0000 0000 - 0100 0000 0000: -
 0100 0000 0000 - 0560 0000 0000: shadow
 0560 0000 0000 - 0760 0000 0000: traces
-0760 0000 0000 - 07ff ffff ffff: -
+0760 0000 0000 - 07d0 0000 0000: metainfo (memory blocks and sync objects)
+07d0 0000 0000 - 07ff ffff ffff: -
 */
 
 #ifndef TSAN_PLATFORM_H
@@ -68,20 +75,28 @@
 static const uptr kLinuxAppMemEnd = 0x04dfffffffffULL;
 # if SANITIZER_WINDOWS
 static const uptr kLinuxShadowMsk = 0x010000000000ULL;
-# else
+static const uptr kMetaShadow     = 0x076000000000ULL;
+static const uptr kMetaSize       = 0x007000000000ULL;
+# else  // if SANITIZER_WINDOWS
 static const uptr kLinuxShadowMsk = 0x200000000000ULL;
-# endif
+static const uptr kMetaShadow     = 0x300000000000ULL;
+static const uptr kMetaSize       = 0x100000000000ULL;
+# endif  // if SANITIZER_WINDOWS
+#else  // defined(TSAN_GO)
+static const uptr kMetaShadow     = 0x300000000000ULL;
+static const uptr kMetaSize       = 0x100000000000ULL;
 // TSAN_COMPAT_SHADOW is intended for COMPAT virtual memory layout,
 // when memory addresses are of the 0x2axxxxxxxxxx form.
 // The option is enabled with 'setarch x86_64 -L'.
-#elif defined(TSAN_COMPAT_SHADOW) && TSAN_COMPAT_SHADOW
+# if defined(TSAN_COMPAT_SHADOW) && TSAN_COMPAT_SHADOW
 static const uptr kLinuxAppMemBeg = 0x290000000000ULL;
 static const uptr kLinuxAppMemEnd = 0x7fffffffffffULL;
 static const uptr kAppMemGapBeg   = 0x2c0000000000ULL;
 static const uptr kAppMemGapEnd   = 0x7d0000000000ULL;
-#else
+# else
 static const uptr kLinuxAppMemBeg = 0x7cf000000000ULL;
 static const uptr kLinuxAppMemEnd = 0x7fffffffffffULL;
+# endif
 #endif
 
 static const uptr kLinuxAppMemMsk = 0x7c0000000000ULL;
@@ -96,10 +111,16 @@
 // This has to be a macro to allow constant initialization of constants below.
 #ifndef TSAN_GO
 #define MemToShadow(addr) \
-    (((addr) & ~(kLinuxAppMemMsk | (kShadowCell - 1))) * kShadowCnt)
+    ((((uptr)addr) & ~(kLinuxAppMemMsk | (kShadowCell - 1))) * kShadowCnt)
+#define MemToMeta(addr) \
+    (u32*)(((((uptr)addr) & ~(kLinuxAppMemMsk | (kMetaShadowCell - 1))) \
+    / kMetaShadowCell * kMetaShadowSize) | kMetaShadow)
 #else
 #define MemToShadow(addr) \
-    ((((addr) & ~(kShadowCell - 1)) * kShadowCnt) | kLinuxShadowMsk)
+    (((((uptr)addr) & ~(kShadowCell - 1)) * kShadowCnt) | kLinuxShadowMsk)
+#define MemToMeta(addr) \
+    (u32*)(((((uptr)addr) & ~(kMetaShadowCell - 1)) \
+    / kMetaShadowCell * kMetaShadowSize) | kMetaShadow)
 #endif
 
 static const uptr kLinuxShadowBeg = MemToShadow(kLinuxAppMemBeg);
@@ -110,6 +131,8 @@
 #if defined(TSAN_COMPAT_SHADOW) && TSAN_COMPAT_SHADOW
   return (mem >= kLinuxAppMemBeg && mem < kAppMemGapBeg) ||
          (mem >= kAppMemGapEnd   && mem <= kLinuxAppMemEnd);
+#elif defined(TSAN_GO)
+  return mem <= kLinuxAppMemEnd;
 #else
   return mem >= kLinuxAppMemBeg && mem <= kLinuxAppMemEnd;
 #endif
@@ -140,7 +163,7 @@
 }
 
 void FlushShadowMemory();
-void WriteMemoryProfile(char *buf, uptr buf_size);
+void WriteMemoryProfile(char *buf, uptr buf_size, uptr nthread, uptr nlive);
 uptr GetRSS();
 
 const char *InitializePlatform();
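
A worked example of the new meta mapping (illustrative only; the address
0x7cf000001230 is hypothetical, constants are the non-Go Linux values above):

    // MemToMeta strips the app-memory mask and the low cell bits, scales
    // 8-byte user cells to 4-byte meta values, and ORs in the meta base:
    //   a      = 0x7cf000001230 & ~(0x7c0000000000 | 7) = 0x00f000001230
    //   scaled = a / 8 * 4                              = 0x007800000918
    //   meta   = scaled | 0x300000000000                = 0x307800000918
    // The result lands inside the 3000../4000.. metainfo range shown above.
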
diff --git a/lib/tsan/rtl/tsan_platform_linux.cc b/lib/tsan/rtl/tsan_platform_linux.cc
index 3c3a58b..53ecfc6 100644
--- a/lib/tsan/rtl/tsan_platform_linux.cc
+++ b/lib/tsan/rtl/tsan_platform_linux.cc
@@ -14,7 +14,7 @@
 
 
 #include "sanitizer_common/sanitizer_platform.h"
-#if SANITIZER_LINUX
+#if SANITIZER_LINUX || SANITIZER_FREEBSD
 
 #include "sanitizer_common/sanitizer_common.h"
 #include "sanitizer_common/sanitizer_libc.h"
@@ -32,7 +32,6 @@
 #include <string.h>
 #include <stdarg.h>
 #include <sys/mman.h>
-#include <sys/prctl.h>
 #include <sys/syscall.h>
 #include <sys/socket.h>
 #include <sys/time.h>
@@ -43,9 +42,10 @@
 #include <errno.h>
 #include <sched.h>
 #include <dlfcn.h>
+#if SANITIZER_LINUX
 #define __need_res_state
 #include <resolv.h>
-#include <malloc.h>
+#endif
 
 #ifdef sa_handler
 # undef sa_handler
@@ -55,44 +55,57 @@
 # undef sa_sigaction
 #endif
 
-extern "C" struct mallinfo __libc_mallinfo();
+#if SANITIZER_FREEBSD
+extern "C" void *__libc_stack_end;
+void *__libc_stack_end = 0;
+#endif
 
 namespace __tsan {
 
 const uptr kPageSize = 4096;
 
+enum {
+  MemTotal  = 0,
+  MemShadow = 1,
+  MemMeta   = 2,
+  MemFile   = 3,
+  MemMmap   = 4,
+  MemTrace  = 5,
+  MemHeap   = 6,
+  MemOther  = 7,
+  MemCount  = 8,
+};
+
 void FillProfileCallback(uptr start, uptr rss, bool file,
                          uptr *mem, uptr stats_size) {
-  CHECK_EQ(7, stats_size);
-  mem[6] += rss;  // total
+  mem[MemTotal] += rss;
   start >>= 40;
-  if (start < 0x10)  // shadow
-    mem[0] += rss;
-  else if (start >= 0x20 && start < 0x30)  // compat modules
-    mem[file ? 1 : 2] += rss;
-  else if (start >= 0x7e)  // modules
-    mem[file ? 1 : 2] += rss;
-  else if (start >= 0x60 && start < 0x62)  // traces
-    mem[3] += rss;
-  else if (start >= 0x7d && start < 0x7e)  // heap
-    mem[4] += rss;
-  else  // other
-    mem[5] += rss;
+  if (start < 0x10)
+    mem[MemShadow] += rss;
+  else if (start >= 0x20 && start < 0x30)
+    mem[file ? MemFile : MemMmap] += rss;
+  else if (start >= 0x30 && start < 0x40)
+    mem[MemMeta] += rss;
+  else if (start >= 0x7e)
+    mem[file ? MemFile : MemMmap] += rss;
+  else if (start >= 0x60 && start < 0x62)
+    mem[MemTrace] += rss;
+  else if (start >= 0x7d && start < 0x7e)
+    mem[MemHeap] += rss;
+  else
+    mem[MemOther] += rss;
 }
 
-void WriteMemoryProfile(char *buf, uptr buf_size) {
-  uptr mem[7] = {};
+void WriteMemoryProfile(char *buf, uptr buf_size, uptr nthread, uptr nlive) {
+  uptr mem[MemCount] = {};
   __sanitizer::GetMemoryProfile(FillProfileCallback, mem, 7);
-  char *buf_pos = buf;
-  char *buf_end = buf + buf_size;
-  buf_pos += internal_snprintf(buf_pos, buf_end - buf_pos,
-      "RSS %zd MB: shadow:%zd file:%zd mmap:%zd trace:%zd heap:%zd other:%zd\n",
-      mem[6] >> 20, mem[0] >> 20, mem[1] >> 20, mem[2] >> 20,
-      mem[3] >> 20, mem[4] >> 20, mem[5] >> 20);
-  struct mallinfo mi = __libc_mallinfo();
-  buf_pos += internal_snprintf(buf_pos, buf_end - buf_pos,
-      "mallinfo: arena=%d mmap=%d fordblks=%d keepcost=%d\n",
-      mi.arena >> 20, mi.hblkhd >> 20, mi.fordblks >> 20, mi.keepcost >> 20);
+  internal_snprintf(buf, buf_size,
+      "RSS %zd MB: shadow:%zd meta:%zd file:%zd mmap:%zd"
+      " trace:%zd heap:%zd other:%zd nthr=%zd/%zd\n",
+      mem[MemTotal] >> 20, mem[MemShadow] >> 20, mem[MemMeta] >> 20,
+      mem[MemFile] >> 20, mem[MemMmap] >> 20, mem[MemTrace] >> 20,
+      mem[MemHeap] >> 20, mem[MemOther] >> 20,
+      nlive, nthread);
 }
 
 uptr GetRSS() {
@@ -101,15 +114,18 @@
   return mem[6];
 }
 
-
+#if SANITIZER_LINUX
 void FlushShadowMemoryCallback(
     const SuspendedThreadsList &suspended_threads_list,
     void *argument) {
   FlushUnneededShadowMemory(kLinuxShadowBeg, kLinuxShadowEnd - kLinuxShadowBeg);
 }
+#endif
 
 void FlushShadowMemory() {
+#if SANITIZER_LINUX
   StopTheWorld(FlushShadowMemoryCallback, 0);
+#endif
 }
 
 #ifndef TSAN_GO
@@ -123,9 +139,7 @@
     Die();
   }
 }
-#endif
 
-#ifndef TSAN_GO
 // Mark shadow for .rodata sections with the special kShadowRodata marker.
 // Accesses to .rodata can't race, so this saves time, memory and trace space.
 static void MapRodata() {
@@ -184,6 +198,7 @@
 }
 
 void InitializeShadowMemory() {
+  // Map memory shadow.
   uptr shadow = (uptr)MmapFixedNoReserve(kLinuxShadowBeg,
     kLinuxShadowEnd - kLinuxShadowBeg);
   if (shadow != kLinuxShadowBeg) {
@@ -192,23 +207,48 @@
                "to link with -pie (%p, %p).\n", shadow, kLinuxShadowBeg);
     Die();
   }
+  DPrintf("memory shadow: %zx-%zx (%zuGB)\n",
+      kLinuxShadowBeg, kLinuxShadowEnd,
+      (kLinuxShadowEnd - kLinuxShadowBeg) >> 30);
+
+  // Map meta shadow.
+  if (MemToMeta(kLinuxAppMemBeg) < (u32*)kMetaShadow) {
+    Printf("ThreadSanitizer: bad meta shadow (%p -> %p < %p)\n",
+        kLinuxAppMemBeg, MemToMeta(kLinuxAppMemBeg), kMetaShadow);
+    Die();
+  }
+  if (MemToMeta(kLinuxAppMemEnd) >= (u32*)(kMetaShadow + kMetaSize)) {
+    Printf("ThreadSanitizer: bad meta shadow (%p -> %p >= %p)\n",
+        kLinuxAppMemEnd, MemToMeta(kLinuxAppMemEnd), kMetaShadow + kMetaSize);
+    Die();
+  }
+  uptr meta = (uptr)MmapFixedNoReserve(kMetaShadow, kMetaSize);
+  if (meta != kMetaShadow) {
+    Printf("FATAL: ThreadSanitizer can not mmap the shadow memory\n");
+    Printf("FATAL: Make sure to compile with -fPIE and "
+               "to link with -pie (%p, %p).\n", meta, kMetaShadow);
+    Die();
+  }
+  DPrintf("meta shadow: %zx-%zx (%zuGB)\n",
+      kMetaShadow, kMetaShadow + kMetaSize, kMetaSize >> 30);
+
+  // Protect gaps.
   const uptr kClosedLowBeg  = 0x200000;
   const uptr kClosedLowEnd  = kLinuxShadowBeg - 1;
   const uptr kClosedMidBeg = kLinuxShadowEnd + 1;
-  const uptr kClosedMidEnd = min(kLinuxAppMemBeg, kTraceMemBegin);
+  const uptr kClosedMidEnd = min(min(kLinuxAppMemBeg, kTraceMemBegin),
+      kMetaShadow);
+
   ProtectRange(kClosedLowBeg, kClosedLowEnd);
   ProtectRange(kClosedMidBeg, kClosedMidEnd);
-  DPrintf("kClosedLow   %zx-%zx (%zuGB)\n",
+  VPrintf(2, "kClosedLow   %zx-%zx (%zuGB)\n",
       kClosedLowBeg, kClosedLowEnd, (kClosedLowEnd - kClosedLowBeg) >> 30);
-  DPrintf("kLinuxShadow %zx-%zx (%zuGB)\n",
-      kLinuxShadowBeg, kLinuxShadowEnd,
-      (kLinuxShadowEnd - kLinuxShadowBeg) >> 30);
-  DPrintf("kClosedMid   %zx-%zx (%zuGB)\n",
+  VPrintf(2, "kClosedMid   %zx-%zx (%zuGB)\n",
       kClosedMidBeg, kClosedMidEnd, (kClosedMidEnd - kClosedMidBeg) >> 30);
-  DPrintf("kLinuxAppMem %zx-%zx (%zuGB)\n",
+  VPrintf(2, "app mem: %zx-%zx (%zuGB)\n",
       kLinuxAppMemBeg, kLinuxAppMemEnd,
       (kLinuxAppMemEnd - kLinuxAppMemBeg) >> 30);
-  DPrintf("stack        %zx\n", (uptr)&shadow);
+  VPrintf(2, "stack: %zx\n", (uptr)&shadow);
 
   MapRodata();
 }
@@ -330,6 +370,7 @@
 // This is required to properly "close" the fds, because we do not see internal
 // closes within glibc. The code is a pure hack.
 int ExtractResolvFDs(void *state, int *fds, int nfd) {
+#if SANITIZER_LINUX
   int cnt = 0;
   __res_state *statp = (__res_state*)state;
   for (int i = 0; i < MAXNS && cnt < nfd; i++) {
@@ -337,6 +378,9 @@
       fds[cnt++] = statp->_u._ext.nssocks[i];
   }
   return cnt;
+#else
+  return 0;
+#endif
 }
 
 // Extract file descriptors passed via UNIX domain sockets.
diff --git a/lib/tsan/rtl/tsan_platform_mac.cc b/lib/tsan/rtl/tsan_platform_mac.cc
index a545884..15d0688 100644
--- a/lib/tsan/rtl/tsan_platform_mac.cc
+++ b/lib/tsan/rtl/tsan_platform_mac.cc
@@ -47,7 +47,7 @@
 void FlushShadowMemory() {
 }
 
-void WriteMemoryProfile(char *buf, uptr buf_size) {
+void WriteMemoryProfile(char *buf, uptr buf_size, uptr nthread, uptr nlive) {
 }
 
 uptr GetRSS() {
diff --git a/lib/tsan/rtl/tsan_platform_windows.cc b/lib/tsan/rtl/tsan_platform_windows.cc
index efc5d78..8b9d20e 100644
--- a/lib/tsan/rtl/tsan_platform_windows.cc
+++ b/lib/tsan/rtl/tsan_platform_windows.cc
@@ -28,7 +28,7 @@
 void FlushShadowMemory() {
 }
 
-void WriteMemoryProfile(char *buf, uptr buf_size) {
+void WriteMemoryProfile(char *buf, uptr buf_size, uptr nthread, uptr nlive) {
 }
 
 uptr GetRSS() {
diff --git a/lib/tsan/rtl/tsan_report.cc b/lib/tsan/rtl/tsan_report.cc
index a835cee..e14d0b9 100644
--- a/lib/tsan/rtl/tsan_report.cc
+++ b/lib/tsan/rtl/tsan_report.cc
@@ -17,9 +17,9 @@
 
 namespace __tsan {
 
-class Decorator: private __sanitizer::AnsiColorDecorator {
+class Decorator: public __sanitizer::SanitizerCommonDecorator {
  public:
-  Decorator() : __sanitizer::AnsiColorDecorator(PrintsToTtyCached()) { }
+  Decorator() : SanitizerCommonDecorator() { }
   const char *Warning()    { return Red(); }
   const char *EndWarning() { return Default(); }
   const char *Access()     { return Blue(); }
diff --git a/lib/tsan/rtl/tsan_report.h b/lib/tsan/rtl/tsan_report.h
index 3817bcd..8ea9774 100644
--- a/lib/tsan/rtl/tsan_report.h
+++ b/lib/tsan/rtl/tsan_report.h
@@ -42,6 +42,7 @@
   char *file;
   int line;
   int col;
+  bool suppressable;
 };
 
 struct ReportMopMutex {
@@ -80,6 +81,7 @@
   char *name;
   char *file;
   int line;
+  bool suppressable;
   ReportStack *stack;
 };
 
diff --git a/lib/tsan/rtl/tsan_rtl.cc b/lib/tsan/rtl/tsan_rtl.cc
index 39e78c0..3e3e339 100644
--- a/lib/tsan/rtl/tsan_rtl.cc
+++ b/lib/tsan/rtl/tsan_rtl.cc
@@ -25,6 +25,16 @@
 #include "tsan_suppressions.h"
 #include "tsan_symbolize.h"
 
+#ifdef __SSE3__
+// <emmintrin.h> transitively includes <stdlib.h>,
+// and it's prohibited to include std headers into tsan runtime.
+// So we do this dirty trick.
+#define _MM_MALLOC_H_INCLUDED
+#define __MM_MALLOC_H
+#include <emmintrin.h>
+typedef __m128i m128;
+#endif
+
 volatile int __tsan_resumed = 0;
 
 extern "C" void __tsan_resume() {
@@ -112,10 +122,7 @@
   uptr n_running_threads;
   ctx->thread_registry->GetNumberOfThreads(&n_threads, &n_running_threads);
   InternalScopedBuffer<char> buf(4096);
-  internal_snprintf(buf.data(), buf.size(), "%d: nthr=%d nlive=%d\n",
-      i, n_threads, n_running_threads);
-  internal_write(fd, buf.data(), internal_strlen(buf.data()));
-  WriteMemoryProfile(buf.data(), buf.size());
+  WriteMemoryProfile(buf.data(), buf.size(), n_threads, n_running_threads);
   internal_write(fd, buf.data(), internal_strlen(buf.data()));
 }
 
@@ -131,19 +138,26 @@
 
   fd_t mprof_fd = kInvalidFd;
   if (flags()->profile_memory && flags()->profile_memory[0]) {
-    InternalScopedBuffer<char> filename(4096);
-    internal_snprintf(filename.data(), filename.size(), "%s.%d",
-        flags()->profile_memory, (int)internal_getpid());
-    uptr openrv = OpenFile(filename.data(), true);
-    if (internal_iserror(openrv)) {
-      Printf("ThreadSanitizer: failed to open memory profile file '%s'\n",
-          &filename[0]);
+    if (internal_strcmp(flags()->profile_memory, "stdout") == 0) {
+      mprof_fd = 1;
+    } else if (internal_strcmp(flags()->profile_memory, "stderr") == 0) {
+      mprof_fd = 2;
     } else {
-      mprof_fd = openrv;
+      InternalScopedBuffer<char> filename(4096);
+      internal_snprintf(filename.data(), filename.size(), "%s.%d",
+          flags()->profile_memory, (int)internal_getpid());
+      uptr openrv = OpenFile(filename.data(), true);
+      if (internal_iserror(openrv)) {
+        Printf("ThreadSanitizer: failed to open memory profile file '%s'\n",
+            &filename[0]);
+      } else {
+        mprof_fd = openrv;
+      }
     }
   }
 
   u64 last_flush = NanoTime();
+  u64 last_rss_check = NanoTime();
   uptr last_rss = 0;
   for (int i = 0;
       atomic_load(&ctx->stop_background_thread, memory_order_relaxed) == 0;
@@ -160,7 +174,9 @@
         last_flush = NanoTime();
       }
     }
-    if (flags()->memory_limit_mb > 0) {
+    // GetRSS can be expensive on huge programs, so don't do it every 100ms.
+    if (flags()->memory_limit_mb > 0 && last_rss_check + 1000 * kMs2Ns < now) {
+      last_rss_check = now;
       uptr rss = GetRSS();
       uptr limit = uptr(flags()->memory_limit_mb) << 20;
       if (flags()->verbosity > 0) {
@@ -222,6 +238,32 @@
   // so we can get away with unaligned mapping.
   // CHECK_EQ(addr, addr & ~((64 << 10) - 1));  // windows wants 64K alignment
   MmapFixedNoReserve(MemToShadow(addr), size * kShadowMultiplier);
+
+  // Meta shadow is 2:1, so tread carefully.
+  static bool data_mapped = false;
+  static uptr mapped_meta_end = 0;
+  uptr meta_begin = (uptr)MemToMeta(addr);
+  uptr meta_end = (uptr)MemToMeta(addr + size);
+  meta_begin = RoundDownTo(meta_begin, 64 << 10);
+  meta_end = RoundUpTo(meta_end, 64 << 10);
+  if (!data_mapped) {
+    // First call maps data+bss.
+    data_mapped = true;
+    MmapFixedNoReserve(meta_begin, meta_end - meta_begin);
+  } else {
+    // Mapping continuous heap.
+    // Windows wants 64K alignment.
+    meta_begin = RoundDownTo(meta_begin, 64 << 10);
+    meta_end = RoundUpTo(meta_end, 64 << 10);
+    if (meta_end <= mapped_meta_end)
+      return;
+    if (meta_begin < mapped_meta_end)
+      meta_begin = mapped_meta_end;
+    MmapFixedNoReserve(meta_begin, meta_end - meta_begin);
+    mapped_meta_end = meta_end;
+  }
+  VPrintf(2, "mapped meta shadow for (%p-%p) at (%p-%p)\n",
+      addr, addr+size, meta_begin, meta_end);
 }
 
 void MapThreadTrace(uptr addr, uptr size) {
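
The meta-shadow mapping above only ever extends a contiguous region:
begin/end are rounded out to 64K granularity and anything at or below the
mapped_meta_end high-water mark is skipped. A small sketch of the
rounding-plus-high-water-mark logic (RoundDownTo/RoundUpTo are reimplemented
here for illustration):

    #include <cstdint>
    #include <cstdio>

    static uint64_t RoundDownTo(uint64_t x, uint64_t b) { return x & ~(b - 1); }
    static uint64_t RoundUpTo(uint64_t x, uint64_t b) {
      return RoundDownTo(x + b - 1, b);
    }

    int main() {
      const uint64_t kGranularity = 64 << 10;  // 64K, the alignment Windows wants
      uint64_t mapped_meta_end = 0;            // high-water mark of mapped meta
      uint64_t ranges[][2] = {{0x10000123, 0x2000}, {0x10010000, 0x8000}};
      for (auto &r : ranges) {
        uint64_t begin = RoundDownTo(r[0], kGranularity);
        uint64_t end = RoundUpTo(r[0] + r[1], kGranularity);
        if (end <= mapped_meta_end) continue;  // already covered
        if (begin < mapped_meta_end) begin = mapped_meta_end;  // extend only
        printf("map [0x%llx, 0x%llx)\n", (unsigned long long)begin,
               (unsigned long long)end);
        mapped_meta_end = end;
      }
      return 0;
    }
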
@@ -378,16 +420,37 @@
 }
 #endif
 
+#ifdef TSAN_GO
+NOINLINE
+void GrowShadowStack(ThreadState *thr) {
+  const int sz = thr->shadow_stack_end - thr->shadow_stack;
+  const int newsz = 2 * sz;
+  uptr *newstack = (uptr*)internal_alloc(MBlockShadowStack,
+      newsz * sizeof(uptr));
+  internal_memcpy(newstack, thr->shadow_stack, sz * sizeof(uptr));
+  internal_free(thr->shadow_stack);
+  thr->shadow_stack = newstack;
+  thr->shadow_stack_pos = newstack + sz;
+  thr->shadow_stack_end = newstack + newsz;
+}
+#endif
+
 u32 CurrentStackId(ThreadState *thr, uptr pc) {
   if (thr->shadow_stack_pos == 0)  // May happen during bootstrap.
     return 0;
-  if (pc) {
+  if (pc != 0) {
+#ifndef TSAN_GO
+    DCHECK_LT(thr->shadow_stack_pos, thr->shadow_stack_end);
+#else
+    if (thr->shadow_stack_pos == thr->shadow_stack_end)
+      GrowShadowStack(thr);
+#endif
     thr->shadow_stack_pos[0] = pc;
     thr->shadow_stack_pos++;
   }
   u32 id = StackDepotPut(thr->shadow_stack,
                          thr->shadow_stack_pos - thr->shadow_stack);
-  if (pc)
+  if (pc != 0)
     thr->shadow_stack_pos--;
   return id;
 }
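
GrowShadowStack and the CurrentStackId change above share one idea: the Go
build grows the shadow stack geometrically on demand (the C++ build
pre-reserves it, hence the DCHECK), and CurrentStackId temporarily pushes the
current pc so the depot sees a complete stack, then pops it again. A sketch
of just the doubling step, with plain malloc/free standing in for
internal_alloc/internal_free and illustrative names:

    #include <cstdint>
    #include <cstdlib>
    #include <cstring>

    struct Stack {
      uintptr_t *begin, *pos, *end;
    };

    // Mirrors GrowShadowStack: double the buffer, preserve the contents,
    // keep the cursor at the same logical depth.
    static void Grow(Stack *s) {
      size_t sz = s->end - s->begin;
      size_t newsz = 2 * sz;
      uintptr_t *ns = (uintptr_t *)malloc(newsz * sizeof(uintptr_t));
      memcpy(ns, s->begin, sz * sizeof(uintptr_t));
      free(s->begin);
      s->begin = ns;
      s->pos = ns + sz;
      s->end = ns + newsz;
    }

    int main() {
      Stack s;
      s.begin = (uintptr_t *)malloc(4 * sizeof(uintptr_t));
      s.pos = s.end = s.begin + 4;  // full
      if (s.pos == s.end) Grow(&s);
      *s.pos++ = 0x42;  // room for the temporary top pc now
      free(s.begin);
      return 0;
    }
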
@@ -449,7 +512,8 @@
   *s = 0;
 }
 
-static inline void HandleRace(ThreadState *thr, u64 *shadow_mem,
+ALWAYS_INLINE
+void HandleRace(ThreadState *thr, u64 *shadow_mem,
                               Shadow cur, Shadow old) {
   thr->racy_state[0] = cur.raw();
   thr->racy_state[1] = old.raw();
@@ -461,16 +525,12 @@
 #endif
 }
 
-static inline bool OldIsInSameSynchEpoch(Shadow old, ThreadState *thr) {
-  return old.epoch() >= thr->fast_synch_epoch;
-}
-
 static inline bool HappensBefore(Shadow old, ThreadState *thr) {
   return thr->clock.get(old.TidWithIgnore()) >= old.epoch();
 }
 
-ALWAYS_INLINE USED
-void MemoryAccessImpl(ThreadState *thr, uptr addr,
+ALWAYS_INLINE
+void MemoryAccessImpl1(ThreadState *thr, uptr addr,
     int kAccessSizeLog, bool kAccessIsWrite, bool kIsAtomic,
     u64 *shadow_mem, Shadow cur) {
   StatInc(thr, StatMop);
@@ -564,6 +624,90 @@
   }
 }
 
+ALWAYS_INLINE
+bool ContainsSameAccessSlow(u64 *s, u64 a, u64 sync_epoch, bool is_write) {
+  Shadow cur(a);
+  for (uptr i = 0; i < kShadowCnt; i++) {
+    Shadow old(LoadShadow(&s[i]));
+    if (Shadow::Addr0AndSizeAreEqual(cur, old) &&
+        old.TidWithIgnore() == cur.TidWithIgnore() &&
+        old.epoch() > sync_epoch &&
+        old.IsAtomic() == cur.IsAtomic() &&
+        old.IsRead() <= cur.IsRead())
+      return true;
+  }
+  return false;
+}
+
+#if defined(__SSE3__) && TSAN_SHADOW_COUNT == 4
+#define SHUF(v0, v1, i0, i1, i2, i3) _mm_castps_si128(_mm_shuffle_ps( \
+    _mm_castsi128_ps(v0), _mm_castsi128_ps(v1), \
+    (i0)*1 + (i1)*4 + (i2)*16 + (i3)*64))
+ALWAYS_INLINE
+bool ContainsSameAccessFast(u64 *s, u64 a, u64 sync_epoch, bool is_write) {
+  // This is an optimized version of ContainsSameAccessSlow.
+  // load current access into access[0:63]
+  const m128 access     = _mm_cvtsi64_si128(a);
+  // duplicate high part of access in addr0:
+  // addr0[0:31]        = access[32:63]
+  // addr0[32:63]       = access[32:63]
+  // addr0[64:95]       = access[32:63]
+  // addr0[96:127]      = access[32:63]
+  const m128 addr0      = SHUF(access, access, 1, 1, 1, 1);
+  // load 4 shadow slots
+  const m128 shadow0    = _mm_load_si128((__m128i*)s);
+  const m128 shadow1    = _mm_load_si128((__m128i*)s + 1);
+  // load high parts of 4 shadow slots into addr_vect:
+  // addr_vect[0:31]    = shadow0[32:63]
+  // addr_vect[32:63]   = shadow0[96:127]
+  // addr_vect[64:95]   = shadow1[32:63]
+  // addr_vect[96:127]  = shadow1[96:127]
+  m128 addr_vect        = SHUF(shadow0, shadow1, 1, 3, 1, 3);
+  if (!is_write) {
+    // set IsRead bit in addr_vect
+    const m128 rw_mask1 = _mm_cvtsi64_si128(1<<15);
+    const m128 rw_mask  = SHUF(rw_mask1, rw_mask1, 0, 0, 0, 0);
+    addr_vect           = _mm_or_si128(addr_vect, rw_mask);
+  }
+  // addr0 == addr_vect?
+  const m128 addr_res   = _mm_cmpeq_epi32(addr0, addr_vect);
+  // epoch1[0:63]       = sync_epoch
+  const m128 epoch1     = _mm_cvtsi64_si128(sync_epoch);
+  // epoch[0:31]        = sync_epoch[0:31]
+  // epoch[32:63]       = sync_epoch[0:31]
+  // epoch[64:95]       = sync_epoch[0:31]
+  // epoch[96:127]      = sync_epoch[0:31]
+  const m128 epoch      = SHUF(epoch1, epoch1, 0, 0, 0, 0);
+  // load low parts of shadow cell epochs into epoch_vect:
+  // epoch_vect[0:31]   = shadow0[0:31]
+  // epoch_vect[32:63]  = shadow0[64:95]
+  // epoch_vect[64:95]  = shadow1[0:31]
+  // epoch_vect[96:127] = shadow1[64:95]
+  const m128 epoch_vect = SHUF(shadow0, shadow1, 0, 2, 0, 2);
+  // epoch_vect >= sync_epoch?
+  const m128 epoch_res  = _mm_cmpgt_epi32(epoch_vect, epoch);
+  // addr_res & epoch_res
+  const m128 res        = _mm_and_si128(addr_res, epoch_res);
+  // mask[0] = res[7]
+  // mask[1] = res[15]
+  // ...
+  // mask[15] = res[127]
+  const int mask        = _mm_movemask_epi8(res);
+  return mask != 0;
+}
+#endif
+
+ALWAYS_INLINE
+bool ContainsSameAccess(u64 *s, u64 a, u64 sync_epoch, bool is_write) {
+#if defined(__SSE3__) && TSAN_SHADOW_COUNT == 4
+  bool res = ContainsSameAccessFast(s, a, sync_epoch, is_write);
+  DCHECK_EQ(res, ContainsSameAccessSlow(s, a, sync_epoch, is_write));
+  return res;
+#else
+  return ContainsSameAccessSlow(s, a, sync_epoch, is_write);
+#endif
+}
+
 ALWAYS_INLINE USED
 void MemoryAccess(ThreadState *thr, uptr pc, uptr addr,
     int kAccessSizeLog, bool kAccessIsWrite, bool kIsAtomic) {
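
The SHUF macro above builds an _mm_shuffle_ps immediate out of four lane
indices; that is what lets ContainsSameAccessFast gather the four high (or
low) 32-bit halves of the two 128-bit shadow loads into one vector and
compare all kShadowCnt slots at once. A standalone sketch of just the
gathering step on synthetic slot values (SSE2, x86-64):

    #include <emmintrin.h>
    #include <cstdint>
    #include <cstdio>

    int main() {
      // Four 64-bit shadow slots; after the layout change the high 32 bits
      // of each hold the addr0/size/access bits.
      alignas(16) uint64_t s[4] = {
          0x11111111aaaaaaaaULL, 0x22222222bbbbbbbbULL,
          0x33333333ccccccccULL, 0x44444444ddddddddULL};
      __m128i shadow0 = _mm_load_si128((const __m128i *)s);
      __m128i shadow1 = _mm_load_si128((const __m128i *)s + 1);
      // SHUF(shadow0, shadow1, 1, 3, 1, 3): pick 32-bit lanes 1 and 3 from
      // each input, i.e. the high halves of all four 64-bit slots.
      __m128i high = _mm_castps_si128(_mm_shuffle_ps(
          _mm_castsi128_ps(shadow0), _mm_castsi128_ps(shadow1),
          1 * 1 + 3 * 4 + 1 * 16 + 3 * 64));
      alignas(16) uint32_t out[4];
      _mm_store_si128((__m128i *)out, high);
      for (int i = 0; i < 4; i++)
        printf("lane %d: 0x%08x\n", i, out[i]);  // 0x11111111 .. 0x44444444
      return 0;
    }
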
@@ -585,7 +729,7 @@
   }
 #endif
 
-  if (*shadow_mem == kShadowRodata) {
+  if (kCppMode && *shadow_mem == kShadowRodata) {
     // Access to .rodata section, no races here.
     // Measurements show that it can be 10-20% of all memory accesses.
     StatInc(thr, StatMop);
@@ -596,14 +740,12 @@
   }
 
   FastState fast_state = thr->fast_state;
-  if (fast_state.GetIgnoreBit())
+  if (fast_state.GetIgnoreBit()) {
+    StatInc(thr, StatMop);
+    StatInc(thr, kAccessIsWrite ? StatMopWrite : StatMopRead);
+    StatInc(thr, (StatType)(StatMop1 + kAccessSizeLog));
+    StatInc(thr, StatMopIgnored);
     return;
-  if (kCollectHistory) {
-    fast_state.IncrementEpoch();
-    thr->fast_state = fast_state;
-    // We must not store to the trace if we do not store to the shadow.
-    // That is, this call must be moved somewhere below.
-    TraceAddEvent(thr, fast_state, EventTypeMop, pc);
   }
 
   Shadow cur(fast_state);
@@ -611,7 +753,41 @@
   cur.SetWrite(kAccessIsWrite);
   cur.SetAtomic(kIsAtomic);
 
-  MemoryAccessImpl(thr, addr, kAccessSizeLog, kAccessIsWrite, kIsAtomic,
+  if (LIKELY(ContainsSameAccess(shadow_mem, cur.raw(),
+      thr->fast_synch_epoch, kAccessIsWrite))) {
+    StatInc(thr, StatMop);
+    StatInc(thr, kAccessIsWrite ? StatMopWrite : StatMopRead);
+    StatInc(thr, (StatType)(StatMop1 + kAccessSizeLog));
+    StatInc(thr, StatMopSame);
+    return;
+  }
+
+  if (kCollectHistory) {
+    fast_state.IncrementEpoch();
+    thr->fast_state = fast_state;
+    TraceAddEvent(thr, fast_state, EventTypeMop, pc);
+    cur.IncrementEpoch();
+  }
+
+  MemoryAccessImpl1(thr, addr, kAccessSizeLog, kAccessIsWrite, kIsAtomic,
+      shadow_mem, cur);
+}
+
+// Called by MemoryAccessRange in tsan_rtl_thread.cc
+ALWAYS_INLINE USED
+void MemoryAccessImpl(ThreadState *thr, uptr addr,
+    int kAccessSizeLog, bool kAccessIsWrite, bool kIsAtomic,
+    u64 *shadow_mem, Shadow cur) {
+  if (LIKELY(ContainsSameAccess(shadow_mem, cur.raw(),
+      thr->fast_synch_epoch, kAccessIsWrite))) {
+    StatInc(thr, StatMop);
+    StatInc(thr, kAccessIsWrite ? StatMopWrite : StatMopRead);
+    StatInc(thr, (StatType)(StatMop1 + kAccessSizeLog));
+    StatInc(thr, StatMopSame);
+    return;
+  }
+
+  MemoryAccessImpl1(thr, addr, kAccessSizeLog, kAccessIsWrite, kIsAtomic,
       shadow_mem, cur);
 }
 
@@ -729,17 +905,8 @@
 #ifndef TSAN_GO
   DCHECK_LT(thr->shadow_stack_pos, thr->shadow_stack_end);
 #else
-  if (thr->shadow_stack_pos == thr->shadow_stack_end) {
-    const int sz = thr->shadow_stack_end - thr->shadow_stack;
-    const int newsz = 2 * sz;
-    uptr *newstack = (uptr*)internal_alloc(MBlockShadowStack,
-        newsz * sizeof(uptr));
-    internal_memcpy(newstack, thr->shadow_stack, sz * sizeof(uptr));
-    internal_free(thr->shadow_stack);
-    thr->shadow_stack = newstack;
-    thr->shadow_stack_pos = newstack + sz;
-    thr->shadow_stack_end = newstack + newsz;
-  }
+  if (thr->shadow_stack_pos == thr->shadow_stack_end)
+    GrowShadowStack(thr);
 #endif
   thr->shadow_stack_pos[0] = pc;
   thr->shadow_stack_pos++;
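
The fast path that now guards both MemoryAccess and MemoryAccessImpl asks one
question before running the full state machine: does some shadow slot already
record an access to the same addr0/size by the same thread, after the last
synchronization, with the same atomicity and at least the same strength (a
write subsumes a later read)? If so, nothing new can be learned and only the
statistics are updated. A hedged scalar restatement of the predicate on an
unpacked struct; the field names are illustrative, the real code works on the
packed 64-bit shadow word:

    #include <cstdint>

    struct Access {
      uint64_t addr0, size_log, tid, epoch;
      bool is_atomic, is_read;
    };

    // Mirrors ContainsSameAccessSlow on unpacked fields.
    static bool MakesRedundant(const Access &old_a, const Access &cur,
                               uint64_t sync_epoch) {
      return old_a.addr0 == cur.addr0 && old_a.size_log == cur.size_log &&
             old_a.tid == cur.tid && old_a.epoch > sync_epoch &&
             old_a.is_atomic == cur.is_atomic &&
             old_a.is_read <= cur.is_read;  // a write covers a later read
    }

    int main() {
      Access old_a{0, 3, 1, 100, false, false};  // earlier 8-byte write, tid 1
      Access cur{0, 3, 1, 105, false, true};     // later read of the same cell
      return MakesRedundant(old_a, cur, 90) ? 0 : 1;  // redundant: exits 0
    }
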
diff --git a/lib/tsan/rtl/tsan_rtl.h b/lib/tsan/rtl/tsan_rtl.h
index 4364ef8..1b590b8 100644
--- a/lib/tsan/rtl/tsan_rtl.h
+++ b/lib/tsan/rtl/tsan_rtl.h
@@ -44,6 +44,7 @@
 #include "tsan_platform.h"
 #include "tsan_mutexset.h"
 #include "tsan_ignoreset.h"
+#include "tsan_stack_trace.h"
 
 #if SANITIZER_WORDSIZE != 64
 # error "ThreadSanitizer is supported only on 64-bit platforms"
@@ -51,77 +52,6 @@
 
 namespace __tsan {
 
-// Descriptor of user's memory block.
-struct MBlock {
-  /*
-  u64 mtx : 1;  // must be first
-  u64 lst : 44;
-  u64 stk : 31;  // on word boundary
-  u64 tid : kTidBits;
-  u64 siz : 128 - 1 - 31 - 44 - kTidBits;  // 39
-  */
-  u64 raw[2];
-
-  void Init(uptr siz, u32 tid, u32 stk) {
-    raw[0] = raw[1] = 0;
-    raw[1] |= (u64)siz << ((1 + 44 + 31 + kTidBits) % 64);
-    raw[1] |= (u64)tid << ((1 + 44 + 31) % 64);
-    raw[0] |= (u64)stk << (1 + 44);
-    raw[1] |= (u64)stk >> (64 - 44 - 1);
-    DCHECK_EQ(Size(), siz);
-    DCHECK_EQ(Tid(), tid);
-    DCHECK_EQ(StackId(), stk);
-  }
-
-  u32 Tid() const {
-    return GetLsb(raw[1] >> ((1 + 44 + 31) % 64), kTidBits);
-  }
-
-  uptr Size() const {
-    return raw[1] >> ((1 + 31 + 44 + kTidBits) % 64);
-  }
-
-  u32 StackId() const {
-    return (raw[0] >> (1 + 44)) | GetLsb(raw[1] << (64 - 44 - 1), 31);
-  }
-
-  SyncVar *ListHead() const {
-    return (SyncVar*)(GetLsb(raw[0] >> 1, 44) << 3);
-  }
-
-  void ListPush(SyncVar *v) {
-    SyncVar *lst = ListHead();
-    v->next = lst;
-    u64 x = (u64)v ^ (u64)lst;
-    x = (x >> 3) << 1;
-    raw[0] ^= x;
-    DCHECK_EQ(ListHead(), v);
-  }
-
-  SyncVar *ListPop() {
-    SyncVar *lst = ListHead();
-    SyncVar *nxt = lst->next;
-    lst->next = 0;
-    u64 x = (u64)lst ^ (u64)nxt;
-    x = (x >> 3) << 1;
-    raw[0] ^= x;
-    DCHECK_EQ(ListHead(), nxt);
-    return lst;
-  }
-
-  void ListReset() {
-    SyncVar *lst = ListHead();
-    u64 x = (u64)lst;
-    x = (x >> 3) << 1;
-    raw[0] ^= x;
-    DCHECK_EQ(ListHead(), 0);
-  }
-
-  void Lock();
-  void Unlock();
-  typedef GenericScopedLock<MBlock> ScopedLock;
-};
-
 #ifndef TSAN_GO
 #if defined(TSAN_COMPAT_SHADOW) && TSAN_COMPAT_SHADOW
 const uptr kAllocatorSpace = 0x7d0000000000ULL;
@@ -131,7 +61,7 @@
 const uptr kAllocatorSize  =  0x10000000000ULL;  // 1T.
 
 struct MapUnmapCallback;
-typedef SizeClassAllocator64<kAllocatorSpace, kAllocatorSize, sizeof(MBlock),
+typedef SizeClassAllocator64<kAllocatorSpace, kAllocatorSize, 0,
     DefaultSizeClassMap, MapUnmapCallback> PrimaryAllocator;
 typedef SizeClassAllocatorLocalCache<PrimaryAllocator> AllocatorCache;
 typedef LargeMmapAllocator<MapUnmapCallback> SecondaryAllocator;
@@ -148,14 +78,14 @@
 // FastState (from most significant bit):
 //   ignore          : 1
 //   tid             : kTidBits
-//   epoch           : kClkBits
 //   unused          : -
 //   history_size    : 3
+//   epoch           : kClkBits
 class FastState {
  public:
   FastState(u64 tid, u64 epoch) {
     x_ = tid << kTidShift;
-    x_ |= epoch << kClkShift;
+    x_ |= epoch;
     DCHECK_EQ(tid, this->tid());
     DCHECK_EQ(epoch, this->epoch());
     DCHECK_EQ(GetIgnoreBit(), false);
@@ -180,13 +110,13 @@
   }
 
   u64 epoch() const {
-    u64 res = (x_ << (kTidBits + 1)) >> (64 - kClkBits);
+    u64 res = x_ & ((1ull << kClkBits) - 1);
     return res;
   }
 
   void IncrementEpoch() {
     u64 old_epoch = epoch();
-    x_ += 1 << kClkShift;
+    x_ += 1;
     DCHECK_EQ(old_epoch + 1, epoch());
     (void)old_epoch;
   }
@@ -198,17 +128,19 @@
   void SetHistorySize(int hs) {
     CHECK_GE(hs, 0);
     CHECK_LE(hs, 7);
-    x_ = (x_ & ~7) | hs;
+    x_ = (x_ & ~(kHistoryMask << kHistoryShift)) | (u64(hs) << kHistoryShift);
   }
 
+  ALWAYS_INLINE
   int GetHistorySize() const {
-    return (int)(x_ & 7);
+    return (int)((x_ >> kHistoryShift) & kHistoryMask);
   }
 
   void ClearHistorySize() {
-    x_ &= ~7;
+    SetHistorySize(0);
   }
 
+  ALWAYS_INLINE
   u64 GetTracePos() const {
     const int hs = GetHistorySize();
     // When hs == 0, the trace consists of 2 parts.
@@ -219,20 +151,21 @@
  private:
   friend class Shadow;
   static const int kTidShift = 64 - kTidBits - 1;
-  static const int kClkShift = kTidShift - kClkBits;
   static const u64 kIgnoreBit = 1ull << 63;
   static const u64 kFreedBit = 1ull << 63;
+  static const u64 kHistoryShift = kClkBits;
+  static const u64 kHistoryMask = 7;
   u64 x_;
 };
 
 // Shadow (from most significant bit):
 //   freed           : 1
 //   tid             : kTidBits
-//   epoch           : kClkBits
 //   is_atomic       : 1
 //   is_read         : 1
 //   size_log        : 2
 //   addr0           : 3
+//   epoch           : kClkBits
 class Shadow : public FastState {
  public:
   explicit Shadow(u64 x)
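
The layout change moves the epoch into the least-significant bits of both
FastState and Shadow, so IncrementEpoch becomes a plain x_ += 1 and every
addr0/size/read/atomic accessor gains a >> kClkBits. A toy pack/unpack under
assumed widths; kTidBits = 13 and kClkBits = 42 here are assumptions for
illustration, not values quoted from the headers:

    #include <cstdint>
    #include <cstdio>

    const int kTidBits = 13;                  // assumed width
    const int kClkBits = 42;                  // assumed width
    const int kTidShift = 64 - kTidBits - 1;  // one ignore/freed bit on top

    static uint64_t Pack(uint64_t tid, uint64_t epoch) {
      return (tid << kTidShift) | epoch;  // epoch now lives in the low bits
    }
    static uint64_t Epoch(uint64_t x) { return x & ((1ull << kClkBits) - 1); }
    static uint64_t Tid(uint64_t x) { return (x << 1) >> (kTidShift + 1); }

    int main() {
      uint64_t x = Pack(5, 1000);
      x += 1;  // IncrementEpoch is now a single add
      printf("tid=%llu epoch=%llu\n",
             (unsigned long long)Tid(x), (unsigned long long)Epoch(x));
      return 0;  // prints tid=5 epoch=1001
    }
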
@@ -245,10 +178,10 @@
   }
 
   void SetAddr0AndSizeLog(u64 addr0, unsigned kAccessSizeLog) {
-    DCHECK_EQ(x_ & 31, 0);
+    DCHECK_EQ((x_ >> kClkBits) & 31, 0);
     DCHECK_LE(addr0, 7);
     DCHECK_LE(kAccessSizeLog, 3);
-    x_ |= (kAccessSizeLog << 3) | addr0;
+    x_ |= ((kAccessSizeLog << 3) | addr0) << kClkBits;
     DCHECK_EQ(kAccessSizeLog, size_log());
     DCHECK_EQ(addr0, this->addr0());
   }
@@ -281,47 +214,34 @@
     return shifted_xor == 0;
   }
 
-  static inline bool Addr0AndSizeAreEqual(const Shadow s1, const Shadow s2) {
-    u64 masked_xor = (s1.x_ ^ s2.x_) & 31;
+  static ALWAYS_INLINE
+  bool Addr0AndSizeAreEqual(const Shadow s1, const Shadow s2) {
+    u64 masked_xor = ((s1.x_ ^ s2.x_) >> kClkBits) & 31;
     return masked_xor == 0;
   }
 
-  static inline bool TwoRangesIntersect(Shadow s1, Shadow s2,
+  static ALWAYS_INLINE bool TwoRangesIntersect(Shadow s1, Shadow s2,
       unsigned kS2AccessSize) {
     bool res = false;
     u64 diff = s1.addr0() - s2.addr0();
     if ((s64)diff < 0) {  // s1.addr0 < s2.addr0  // NOLINT
       // if (s1.addr0() + size1) > s2.addr0()) return true;
-      if (s1.size() > -diff)  res = true;
+      if (s1.size() > -diff)
+        res = true;
     } else {
       // if (s2.addr0() + kS2AccessSize > s1.addr0()) return true;
-      if (kS2AccessSize > diff) res = true;
+      if (kS2AccessSize > diff)
+        res = true;
     }
-    DCHECK_EQ(res, TwoRangesIntersectSLOW(s1, s2));
-    DCHECK_EQ(res, TwoRangesIntersectSLOW(s2, s1));
+    DCHECK_EQ(res, TwoRangesIntersectSlow(s1, s2));
+    DCHECK_EQ(res, TwoRangesIntersectSlow(s2, s1));
     return res;
   }
 
-  // The idea behind the offset is as follows.
-  // Consider that we have 8 bool's contained within a single 8-byte block
-  // (mapped to a single shadow "cell"). Now consider that we write to the bools
-  // from a single thread (which we consider the common case).
-  // W/o offsetting each access will have to scan 4 shadow values at average
-  // to find the corresponding shadow value for the bool.
-  // With offsetting we start scanning shadow with the offset so that
-  // each access hits necessary shadow straight off (at least in an expected
-  // optimistic case).
-  // This logic works seamlessly for any layout of user data. For example,
-  // if user data is {int, short, char, char}, then accesses to the int are
-  // offsetted to 0, short - 4, 1st char - 6, 2nd char - 7. Hopefully, accesses
-  // from a single thread won't need to scan all 8 shadow values.
-  unsigned ComputeSearchOffset() {
-    return x_ & 7;
-  }
-  u64 addr0() const { return x_ & 7; }
-  u64 size() const { return 1ull << size_log(); }
-  bool IsWrite() const { return !IsRead(); }
-  bool IsRead() const { return x_ & kReadBit; }
+  u64 ALWAYS_INLINE addr0() const { return (x_ >> kClkBits) & 7; }
+  u64 ALWAYS_INLINE size() const { return 1ull << size_log(); }
+  bool ALWAYS_INLINE IsWrite() const { return !IsRead(); }
+  bool ALWAYS_INLINE IsRead() const { return x_ & kReadBit; }
 
   // The idea behind the freed bit is as follows.
   // When the memory is freed (or otherwise unaccessible) we write to the shadow
@@ -346,15 +266,14 @@
     return res;
   }
 
-  bool IsBothReadsOrAtomic(bool kIsWrite, bool kIsAtomic) const {
-    // analyzes 5-th bit (is_read) and 6-th bit (is_atomic)
-    bool v = x_ & u64(((kIsWrite ^ 1) << kReadShift)
-        | (kIsAtomic << kAtomicShift));
+  bool ALWAYS_INLINE IsBothReadsOrAtomic(bool kIsWrite, bool kIsAtomic) const {
+    bool v = x_ & ((u64(kIsWrite ^ 1) << kReadShift)
+        | (u64(kIsAtomic) << kAtomicShift));
     DCHECK_EQ(v, (!IsWrite() && !kIsWrite) || (IsAtomic() && kIsAtomic));
     return v;
   }
 
-  bool IsRWNotWeaker(bool kIsWrite, bool kIsAtomic) const {
+  bool ALWAYS_INLINE IsRWNotWeaker(bool kIsWrite, bool kIsAtomic) const {
     bool v = ((x_ >> kReadShift) & 3)
         <= u64((kIsWrite ^ 1) | (kIsAtomic << 1));
     DCHECK_EQ(v, (IsAtomic() < kIsAtomic) ||
@@ -362,7 +281,7 @@
     return v;
   }
 
-  bool IsRWWeakerOrEqual(bool kIsWrite, bool kIsAtomic) const {
+  bool ALWAYS_INLINE IsRWWeakerOrEqual(bool kIsWrite, bool kIsAtomic) const {
     bool v = ((x_ >> kReadShift) & 3)
         >= u64((kIsWrite ^ 1) | (kIsAtomic << 1));
     DCHECK_EQ(v, (IsAtomic() > kIsAtomic) ||
@@ -371,14 +290,14 @@
   }
 
  private:
-  static const u64 kReadShift   = 5;
+  static const u64 kReadShift   = 5 + kClkBits;
   static const u64 kReadBit     = 1ull << kReadShift;
-  static const u64 kAtomicShift = 6;
+  static const u64 kAtomicShift = 6 + kClkBits;
   static const u64 kAtomicBit   = 1ull << kAtomicShift;
 
-  u64 size_log() const { return (x_ >> 3) & 3; }
+  u64 size_log() const { return (x_ >> (3 + kClkBits)) & 3; }
 
-  static bool TwoRangesIntersectSLOW(const Shadow s1, const Shadow s2) {
+  static bool TwoRangesIntersectSlow(const Shadow s1, const Shadow s2) {
     if (s1.addr0() == s2.addr0()) return true;
     if (s1.addr0() < s2.addr0() && s1.addr0() + s1.size() > s2.addr0())
       return true;
@@ -457,6 +376,9 @@
   bool in_signal_handler;
   SignalContext *signal_ctx;
 
+  DenseSlabAllocCache block_cache;
+  DenseSlabAllocCache sync_cache;
+
 #ifndef TSAN_GO
   u32 last_sleep_stack_id;
   ThreadClock last_sleep_clock;
@@ -530,7 +452,7 @@
   bool initialized;
   bool after_multithreaded_fork;
 
-  SyncTab synctab;
+  MetaMap metamap;
 
   Mutex report_mtx;
   int nreported;
@@ -576,11 +498,11 @@
   explicit ScopedReport(ReportType typ);
   ~ScopedReport();
 
-  void AddStack(const StackTrace *stack);
   void AddMemoryAccess(uptr addr, Shadow s, const StackTrace *stack,
                        const MutexSet *mset);
-  void AddThread(const ThreadContext *tctx);
-  void AddThread(int unique_tid);
+  void AddStack(const StackTrace *stack, bool suppressable = false);
+  void AddThread(const ThreadContext *tctx, bool suppressable = false);
+  void AddThread(int unique_tid, bool suppressable = false);
   void AddUniqueTid(int unique_tid);
   void AddMutex(const SyncVar *s);
   u64 AddMutex(u64 id);
@@ -628,11 +550,7 @@
 void ForkChildAfter(ThreadState *thr, uptr pc);
 
 void ReportRace(ThreadState *thr);
-bool OutputReport(Context *ctx,
-                  const ScopedReport &srep,
-                  const ReportStack *suppress_stack1,
-                  const ReportStack *suppress_stack2 = 0,
-                  const ReportLocation *suppress_loc = 0);
+bool OutputReport(ThreadState *thr, const ScopedReport &srep);
 bool IsFiredSuppression(Context *ctx,
                         const ScopedReport &srep,
                         const StackTrace &trace);
@@ -661,9 +579,8 @@
 void Initialize(ThreadState *thr);
 int Finalize(ThreadState *thr);
 
-SyncVar* GetJavaSync(ThreadState *thr, uptr pc, uptr addr,
-                     bool write_lock, bool create);
-SyncVar* GetAndRemoveJavaSync(ThreadState *thr, uptr pc, uptr addr);
+void OnUserAlloc(ThreadState *thr, uptr pc, uptr p, uptr sz, bool write);
+void OnUserFree(ThreadState *thr, uptr pc, uptr p, bool write);
 
 void MemoryAccess(ThreadState *thr, uptr pc, uptr addr,
     int kAccessSizeLog, bool kAccessIsWrite, bool kIsAtomic);
diff --git a/lib/tsan/rtl/tsan_rtl_mutex.cc b/lib/tsan/rtl/tsan_rtl_mutex.cc
index 650cd62..4cf47ec 100644
--- a/lib/tsan/rtl/tsan_rtl_mutex.cc
+++ b/lib/tsan/rtl/tsan_rtl_mutex.cc
@@ -52,14 +52,18 @@
 
 static void ReportMutexMisuse(ThreadState *thr, uptr pc, ReportType typ,
     uptr addr, u64 mid) {
+  // In Go, these misuses are either impossible, detected by the standard
+  // library, or false positives (e.g. unlock in a different thread).
+  if (kGoMode)
+    return;
   ThreadRegistryLock l(ctx->thread_registry);
   ScopedReport rep(typ);
   rep.AddMutex(mid);
   StackTrace trace;
   trace.ObtainCurrent(thr, pc);
-  rep.AddStack(&trace);
+  rep.AddStack(&trace, true);
   rep.AddLocation(addr, 1);
-  OutputReport(ctx, rep, rep.GetReport()->stacks[0]);
+  OutputReport(thr, rep);
 }
 
 void MutexCreate(ThreadState *thr, uptr pc, uptr addr,
@@ -72,10 +76,12 @@
     MemoryWrite(thr, pc, addr, kSizeLog1);
     thr->is_freeing = false;
   }
-  SyncVar *s = ctx->synctab.GetOrCreateAndLock(thr, pc, addr, true);
+  SyncVar *s = ctx->metamap.GetOrCreateAndLock(thr, pc, addr, true);
   s->is_rw = rw;
   s->is_recursive = recursive;
   s->is_linker_init = linker_init;
+  if (kCppMode && s->creation_stack_id == 0)
+    s->creation_stack_id = CurrentStackId(thr, pc);
   s->mtx.Unlock();
 }
 
@@ -88,37 +94,54 @@
   if (IsGlobalVar(addr))
     return;
 #endif
-  SyncVar *s = ctx->synctab.GetAndRemove(thr, pc, addr);
-  if (s == 0)
-    return;
-  if (flags()->detect_deadlocks) {
-    Callback cb(thr, pc);
-    ctx->dd->MutexDestroy(&cb, &s->dd);
-  }
   if (IsAppMem(addr)) {
     CHECK(!thr->is_freeing);
     thr->is_freeing = true;
     MemoryWrite(thr, pc, addr, kSizeLog1);
     thr->is_freeing = false;
   }
+  SyncVar *s = ctx->metamap.GetIfExistsAndLock(addr);
+  if (s == 0)
+    return;
+  if (flags()->detect_deadlocks) {
+    Callback cb(thr, pc);
+    ctx->dd->MutexDestroy(&cb, &s->dd);
+    ctx->dd->MutexInit(&cb, &s->dd);
+  }
+  bool unlock_locked = false;
   if (flags()->report_destroy_locked
       && s->owner_tid != SyncVar::kInvalidTid
       && !s->is_broken) {
     s->is_broken = true;
+    unlock_locked = true;
+  }
+  u64 mid = s->GetId();
+  u32 last_lock = s->last_lock;
+  if (!unlock_locked)
+    s->Reset();  // must not reset it before the report is printed
+  s->mtx.Unlock();
+  if (unlock_locked) {
     ThreadRegistryLock l(ctx->thread_registry);
     ScopedReport rep(ReportTypeMutexDestroyLocked);
-    rep.AddMutex(s);
+    rep.AddMutex(mid);
     StackTrace trace;
     trace.ObtainCurrent(thr, pc);
     rep.AddStack(&trace);
-    FastState last(s->last_lock);
+    FastState last(last_lock);
     RestoreStack(last.tid(), last.epoch(), &trace, 0);
-    rep.AddStack(&trace);
-    rep.AddLocation(s->addr, 1);
-    OutputReport(ctx, rep, rep.GetReport()->stacks[0]);
+    rep.AddStack(&trace, true);
+    rep.AddLocation(addr, 1);
+    OutputReport(thr, rep);
   }
-  thr->mset.Remove(s->GetId());
-  DestroyAndFree(s);
+  if (unlock_locked) {
+    SyncVar *s = ctx->metamap.GetIfExistsAndLock(addr);
+    if (s != 0) {
+      s->Reset();
+      s->mtx.Unlock();
+    }
+  }
+  thr->mset.Remove(mid);
+  // s will be destroyed and freed in MetaMap::FreeBlock.
 }
 
 void MutexLock(ThreadState *thr, uptr pc, uptr addr, int rec, bool try_lock) {
@@ -126,7 +149,7 @@
   CHECK_GT(rec, 0);
   if (IsAppMem(addr))
     MemoryReadAtomic(thr, pc, addr, kSizeLog1);
-  SyncVar *s = ctx->synctab.GetOrCreateAndLock(thr, pc, addr, true);
+  SyncVar *s = ctx->metamap.GetOrCreateAndLock(thr, pc, addr, true);
   thr->fast_state.IncrementEpoch();
   TraceAddEvent(thr, thr->fast_state, EventTypeLock, s->GetId());
   bool report_double_lock = false;
@@ -149,7 +172,7 @@
   }
   s->recursion += rec;
   thr->mset.Add(s->GetId(), true, thr->fast_state.epoch());
-  if (flags()->detect_deadlocks && s->recursion == 1) {
+  if (flags()->detect_deadlocks && (s->recursion - rec) == 0) {
     Callback cb(thr, pc);
     if (!try_lock)
       ctx->dd->MutexBeforeLock(&cb, &s->dd, true);
@@ -170,12 +193,12 @@
   DPrintf("#%d: MutexUnlock %zx all=%d\n", thr->tid, addr, all);
   if (IsAppMem(addr))
     MemoryReadAtomic(thr, pc, addr, kSizeLog1);
-  SyncVar *s = ctx->synctab.GetOrCreateAndLock(thr, pc, addr, true);
+  SyncVar *s = ctx->metamap.GetOrCreateAndLock(thr, pc, addr, true);
   thr->fast_state.IncrementEpoch();
   TraceAddEvent(thr, thr->fast_state, EventTypeUnlock, s->GetId());
   int rec = 0;
   bool report_bad_unlock = false;
-  if (s->recursion == 0 || s->owner_tid != thr->tid) {
+  if (kCppMode && (s->recursion == 0 || s->owner_tid != thr->tid)) {
     if (flags()->report_mutex_bugs && !s->is_broken) {
       s->is_broken = true;
       report_bad_unlock = true;
@@ -192,7 +215,7 @@
     }
   }
   thr->mset.Del(s->GetId(), true);
-  if (flags()->detect_deadlocks && s->recursion == 0) {
+  if (flags()->detect_deadlocks && s->recursion == 0 && !report_bad_unlock) {
     Callback cb(thr, pc);
     ctx->dd->MutexBeforeUnlock(&cb, &s->dd, true);
   }
@@ -201,7 +224,7 @@
   // Can't touch s after this point.
   if (report_bad_unlock)
     ReportMutexMisuse(thr, pc, ReportTypeMutexBadUnlock, addr, mid);
-  if (flags()->detect_deadlocks) {
+  if (flags()->detect_deadlocks && !report_bad_unlock) {
     Callback cb(thr, pc);
     ReportDeadlock(thr, pc, ctx->dd->GetReport(&cb));
   }
@@ -213,7 +236,7 @@
   StatInc(thr, StatMutexReadLock);
   if (IsAppMem(addr))
     MemoryReadAtomic(thr, pc, addr, kSizeLog1);
-  SyncVar *s = ctx->synctab.GetOrCreateAndLock(thr, pc, addr, false);
+  SyncVar *s = ctx->metamap.GetOrCreateAndLock(thr, pc, addr, false);
   thr->fast_state.IncrementEpoch();
   TraceAddEvent(thr, thr->fast_state, EventTypeRLock, s->GetId());
   bool report_bad_lock = false;
@@ -248,7 +271,7 @@
   StatInc(thr, StatMutexReadUnlock);
   if (IsAppMem(addr))
     MemoryReadAtomic(thr, pc, addr, kSizeLog1);
-  SyncVar *s = ctx->synctab.GetOrCreateAndLock(thr, pc, addr, true);
+  SyncVar *s = ctx->metamap.GetOrCreateAndLock(thr, pc, addr, true);
   thr->fast_state.IncrementEpoch();
   TraceAddEvent(thr, thr->fast_state, EventTypeRUnlock, s->GetId());
   bool report_bad_unlock = false;
@@ -279,7 +302,7 @@
   DPrintf("#%d: MutexReadOrWriteUnlock %zx\n", thr->tid, addr);
   if (IsAppMem(addr))
     MemoryReadAtomic(thr, pc, addr, kSizeLog1);
-  SyncVar *s = ctx->synctab.GetOrCreateAndLock(thr, pc, addr, true);
+  SyncVar *s = ctx->metamap.GetOrCreateAndLock(thr, pc, addr, true);
   bool write = true;
   bool report_bad_unlock = false;
   if (s->owner_tid == SyncVar::kInvalidTid) {
@@ -324,7 +347,7 @@
 
 void MutexRepair(ThreadState *thr, uptr pc, uptr addr) {
   DPrintf("#%d: MutexRepair %zx\n", thr->tid, addr);
-  SyncVar *s = ctx->synctab.GetOrCreateAndLock(thr, pc, addr, true);
+  SyncVar *s = ctx->metamap.GetOrCreateAndLock(thr, pc, addr, true);
   s->owner_tid = SyncVar::kInvalidTid;
   s->recursion = 0;
   s->mtx.Unlock();
@@ -334,7 +357,7 @@
   DPrintf("#%d: Acquire %zx\n", thr->tid, addr);
   if (thr->ignore_sync)
     return;
-  SyncVar *s = ctx->synctab.GetOrCreateAndLock(thr, pc, addr, false);
+  SyncVar *s = ctx->metamap.GetOrCreateAndLock(thr, pc, addr, false);
   AcquireImpl(thr, pc, &s->clock);
   s->mtx.ReadUnlock();
 }
@@ -361,7 +384,7 @@
   DPrintf("#%d: Release %zx\n", thr->tid, addr);
   if (thr->ignore_sync)
     return;
-  SyncVar *s = ctx->synctab.GetOrCreateAndLock(thr, pc, addr, true);
+  SyncVar *s = ctx->metamap.GetOrCreateAndLock(thr, pc, addr, true);
   thr->fast_state.IncrementEpoch();
   // Can't increment epoch w/o writing to the trace as well.
   TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0);
@@ -373,7 +396,7 @@
   DPrintf("#%d: ReleaseStore %zx\n", thr->tid, addr);
   if (thr->ignore_sync)
     return;
-  SyncVar *s = ctx->synctab.GetOrCreateAndLock(thr, pc, addr, true);
+  SyncVar *s = ctx->metamap.GetOrCreateAndLock(thr, pc, addr, true);
   thr->fast_state.IncrementEpoch();
   // Can't increment epoch w/o writing to the trace as well.
   TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0);
@@ -448,7 +471,7 @@
     rep.AddUniqueTid((int)r->loop[i].thr_ctx);
     rep.AddThread((int)r->loop[i].thr_ctx);
   }
-  StackTrace stacks[2 * DDReport::kMaxLoopSize];
+  InternalScopedBuffer<StackTrace> stacks(2 * DDReport::kMaxLoopSize);
   uptr dummy_pc = 0x42;
   for (int i = 0; i < r->n; i++) {
     uptr size;
@@ -462,12 +485,10 @@
         // but we should still produce some stack trace in the report.
         stacks[i].Init(&dummy_pc, 1);
       }
-      rep.AddStack(&stacks[i]);
+      rep.AddStack(&stacks[i], true);
     }
   }
-  // FIXME: use all stacks for suppressions, not just the second stack of the
-  // first edge.
-  OutputReport(ctx, rep, rep.GetReport()->stacks[0]);
+  OutputReport(thr, rep);
 }
 
 }  // namespace __tsan
diff --git a/lib/tsan/rtl/tsan_rtl_report.cc b/lib/tsan/rtl/tsan_rtl_report.cc
index 52edb5a..b75c319 100644
--- a/lib/tsan/rtl/tsan_rtl_report.cc
+++ b/lib/tsan/rtl/tsan_rtl_report.cc
@@ -162,9 +162,10 @@
   DestroyAndFree(rep_);
 }
 
-void ScopedReport::AddStack(const StackTrace *stack) {
+void ScopedReport::AddStack(const StackTrace *stack, bool suppressable) {
   ReportStack **rs = rep_->stacks.PushBack();
   *rs = SymbolizeStack(*stack);
+  (*rs)->suppressable = suppressable;
 }
 
 void ScopedReport::AddMemoryAccess(uptr addr, Shadow s,
@@ -178,6 +179,8 @@
   mop->write = s.IsWrite();
   mop->atomic = s.IsAtomic();
   mop->stack = SymbolizeStack(*stack);
+  if (mop->stack)
+    mop->stack->suppressable = true;
   for (uptr i = 0; i < mset->Size(); i++) {
     MutexSet::Desc d = mset->Get(i);
     u64 mid = this->AddMutex(d.id);
@@ -190,7 +193,7 @@
   rep_->unique_tids.PushBack(unique_tid);
 }
 
-void ScopedReport::AddThread(const ThreadContext *tctx) {
+void ScopedReport::AddThread(const ThreadContext *tctx, bool suppressable) {
   for (uptr i = 0; i < rep_->threads.Size(); i++) {
     if ((u32)rep_->threads[i]->id == tctx->tid)
       return;
@@ -205,6 +208,8 @@
   rt->parent_tid = tctx->parent_tid;
   rt->stack = 0;
   rt->stack = SymbolizeStackId(tctx->creation_stack_id);
+  if (rt->stack)
+    rt->stack->suppressable = suppressable;
 }
 
 #ifndef TSAN_GO
@@ -251,9 +256,9 @@
 }
 #endif
 
-void ScopedReport::AddThread(int unique_tid) {
+void ScopedReport::AddThread(int unique_tid, bool suppressable) {
 #ifndef TSAN_GO
-  AddThread(FindThreadByUidLocked(unique_tid));
+  AddThread(FindThreadByUidLocked(unique_tid), suppressable);
 #endif
 }
 
@@ -275,7 +280,7 @@
   u64 uid = 0;
   u64 mid = id;
   uptr addr = SyncVar::SplitId(id, &uid);
-  SyncVar *s = ctx->synctab.GetIfExistsAndLock(addr, false);
+  SyncVar *s = ctx->metamap.GetIfExistsAndLock(addr);
   // Check that the mutex is still alive.
   // Another mutex can be created at the same address,
   // so check uid as well.
@@ -286,7 +291,7 @@
     AddDeadMutex(id);
   }
   if (s)
-    s->mtx.ReadUnlock();
+    s->mtx.Unlock();
   return mid;
 }
 
@@ -326,21 +331,26 @@
     return;
   }
   MBlock *b = 0;
-  if (allocator()->PointerIsMine((void*)addr)
-      && (b = user_mblock(0, (void*)addr))) {
-    ThreadContext *tctx = FindThreadByTidLocked(b->Tid());
+  Allocator *a = allocator();
+  if (a->PointerIsMine((void*)addr)) {
+    void *block_begin = a->GetBlockBegin((void*)addr);
+    if (block_begin)
+      b = ctx->metamap.GetBlock((uptr)block_begin);
+  }
+  if (b != 0) {
+    ThreadContext *tctx = FindThreadByTidLocked(b->tid);
     void *mem = internal_alloc(MBlockReportLoc, sizeof(ReportLocation));
     ReportLocation *loc = new(mem) ReportLocation();
     rep_->locs.PushBack(loc);
     loc->type = ReportLocationHeap;
     loc->addr = (uptr)allocator()->GetBlockBegin((void*)addr);
-    loc->size = b->Size();
-    loc->tid = tctx ? tctx->tid : b->Tid();
+    loc->size = b->siz;
+    loc->tid = tctx ? tctx->tid : b->tid;
     loc->name = 0;
     loc->file = 0;
     loc->line = 0;
     loc->stack = 0;
-    loc->stack = SymbolizeStackId(b->StackId());
+    loc->stack = SymbolizeStackId(b->stk);
     if (tctx)
       AddThread(tctx);
     return;
@@ -356,6 +366,7 @@
   }
   ReportLocation *loc = SymbolizeData(addr);
   if (loc) {
+    loc->suppressable = true;
     rep_->locs.PushBack(loc);
     return;
   }
@@ -495,25 +506,31 @@
   }
 }
 
-bool OutputReport(Context *ctx,
-                  const ScopedReport &srep,
-                  const ReportStack *suppress_stack1,
-                  const ReportStack *suppress_stack2,
-                  const ReportLocation *suppress_loc) {
+bool OutputReport(ThreadState *thr, const ScopedReport &srep) {
   atomic_store(&ctx->last_symbolize_time_ns, NanoTime(), memory_order_relaxed);
   const ReportDesc *rep = srep.GetReport();
   Suppression *supp = 0;
-  uptr suppress_pc = IsSuppressed(rep->typ, suppress_stack1, &supp);
-  if (suppress_pc == 0)
-    suppress_pc = IsSuppressed(rep->typ, suppress_stack2, &supp);
-  if (suppress_pc == 0)
-    suppress_pc = IsSuppressed(rep->typ, suppress_loc, &supp);
+  uptr suppress_pc = 0;
+  for (uptr i = 0; suppress_pc == 0 && i < rep->mops.Size(); i++)
+    suppress_pc = IsSuppressed(rep->typ, rep->mops[i]->stack, &supp);
+  for (uptr i = 0; suppress_pc == 0 && i < rep->stacks.Size(); i++)
+    suppress_pc = IsSuppressed(rep->typ, rep->stacks[i], &supp);
+  for (uptr i = 0; suppress_pc == 0 && i < rep->threads.Size(); i++)
+    suppress_pc = IsSuppressed(rep->typ, rep->threads[i]->stack, &supp);
+  for (uptr i = 0; suppress_pc == 0 && i < rep->locs.Size(); i++)
+    suppress_pc = IsSuppressed(rep->typ, rep->locs[i], &supp);
   if (suppress_pc != 0) {
     FiredSuppression s = {srep.GetReport()->typ, suppress_pc, supp};
     ctx->fired_suppressions.push_back(s);
   }
-  if (OnReport(rep, suppress_pc != 0))
-    return false;
+  {
+    bool old_is_freeing = thr->is_freeing;
+    thr->is_freeing = false;
+    bool suppressed = OnReport(rep, suppress_pc != 0);
+    thr->is_freeing = old_is_freeing;
+    if (suppressed)
+      return false;
+  }
   PrintReport(rep);
   ctx->nreported++;
   if (flags()->halt_on_error)
@@ -611,6 +628,8 @@
 }
 
 void ReportRace(ThreadState *thr) {
+  CheckNoLocks(thr);
+
   // Symbolizer makes lots of intercepted calls. If we try to process them,
   // at best it will cause deadlocks on internal mutexes.
   ScopedIgnoreInterceptors ignore;
@@ -695,11 +714,7 @@
   }
 #endif
 
-  ReportLocation *suppress_loc = rep.GetReport()->locs.Size() ?
-                                 rep.GetReport()->locs[0] : 0;
-  if (!OutputReport(ctx, rep, rep.GetReport()->mops[0]->stack,
-                              rep.GetReport()->mops[1]->stack,
-                              suppress_loc))
+  if (!OutputReport(thr, rep))
     return;
 
   AddRacyStacks(thr, traces, addr_min, addr_max);
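
OutputReport now derives suppressions from the report itself rather than from
up to three caller-chosen candidates: each loop keeps scanning only while
suppress_pc is still zero, so the first suppressable element wins and the
remaining loops fall through immediately. The scan shape in isolation, with a
stand-in IsSuppressed instead of the real overloads:

    #include <cstdint>
    #include <vector>

    // Stand-in: returns a nonzero "pc" if the element is suppressed.
    static uintptr_t IsSuppressed(int elem) { return elem < 0 ? 0x42 : 0; }

    int main() {
      std::vector<int> mops{1, 2}, stacks{3, -4}, threads{5};
      uintptr_t suppress_pc = 0;
      for (size_t i = 0; suppress_pc == 0 && i < mops.size(); i++)
        suppress_pc = IsSuppressed(mops[i]);
      for (size_t i = 0; suppress_pc == 0 && i < stacks.size(); i++)
        suppress_pc = IsSuppressed(stacks[i]);
      for (size_t i = 0; suppress_pc == 0 && i < threads.size(); i++)
        suppress_pc = IsSuppressed(threads[i]);
      return suppress_pc != 0 ? 0 : 1;  // suppressed via stacks[1]
    }
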
diff --git a/lib/tsan/rtl/tsan_rtl_thread.cc b/lib/tsan/rtl/tsan_rtl_thread.cc
index b2ac7bb..94bf754 100644
--- a/lib/tsan/rtl/tsan_rtl_thread.cc
+++ b/lib/tsan/rtl/tsan_rtl_thread.cc
@@ -134,6 +134,7 @@
     ctx->dd->DestroyPhysicalThread(thr->dd_pt);
     ctx->dd->DestroyLogicalThread(thr->dd_lt);
   }
+  ctx->metamap.OnThreadIdle(thr);
 #ifndef TSAN_GO
   AllocatorThreadFinish(thr);
 #endif
@@ -205,9 +206,9 @@
       MaybeReportThreadLeak, &leaks);
   for (uptr i = 0; i < leaks.Size(); i++) {
     ScopedReport rep(ReportTypeThreadLeak);
-    rep.AddThread(leaks[i].tctx);
+    rep.AddThread(leaks[i].tctx, true);
     rep.SetCount(leaks[i].count);
-    OutputReport(ctx, rep, rep.GetReport()->threads[0]->stack);
+    OutputReport(thr, rep);
   }
 #endif
 }
diff --git a/lib/tsan/rtl/tsan_stack_trace.cc b/lib/tsan/rtl/tsan_stack_trace.cc
new file mode 100644
index 0000000..a8374f4
--- /dev/null
+++ b/lib/tsan/rtl/tsan_stack_trace.cc
@@ -0,0 +1,111 @@
+//===-- tsan_stack_trace.cc -----------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of ThreadSanitizer (TSan), a race detector.
+//
+//===----------------------------------------------------------------------===//
+#include "tsan_stack_trace.h"
+#include "tsan_rtl.h"
+#include "tsan_mman.h"
+
+namespace __tsan {
+
+StackTrace::StackTrace()
+    : n_()
+    , s_()
+    , c_() {
+}
+
+StackTrace::StackTrace(uptr *buf, uptr cnt)
+    : n_()
+    , s_(buf)
+    , c_(cnt) {
+  CHECK_NE(buf, 0);
+  CHECK_NE(cnt, 0);
+}
+
+StackTrace::~StackTrace() {
+  Reset();
+}
+
+void StackTrace::Reset() {
+  if (s_ && !c_) {
+    CHECK_NE(n_, 0);
+    internal_free(s_);
+    s_ = 0;
+  }
+  n_ = 0;
+}
+
+void StackTrace::Init(const uptr *pcs, uptr cnt) {
+  Reset();
+  if (cnt == 0)
+    return;
+  if (c_) {
+    CHECK_NE(s_, 0);
+    CHECK_LE(cnt, c_);
+  } else {
+    s_ = (uptr*)internal_alloc(MBlockStackTrace, cnt * sizeof(s_[0]));
+  }
+  n_ = cnt;
+  internal_memcpy(s_, pcs, cnt * sizeof(s_[0]));
+}
+
+void StackTrace::ObtainCurrent(ThreadState *thr, uptr toppc) {
+  Reset();
+  n_ = thr->shadow_stack_pos - thr->shadow_stack;
+  if (n_ + !!toppc == 0)
+    return;
+  uptr start = 0;
+  if (c_) {
+    CHECK_NE(s_, 0);
+    if (n_ + !!toppc > c_) {
+      start = n_ - c_ + !!toppc;
+      n_ = c_ - !!toppc;
+    }
+  } else {
+    // Cap potentially huge stacks.
+    if (n_ + !!toppc > kTraceStackSize) {
+      start = n_ - kTraceStackSize + !!toppc;
+      n_ = kTraceStackSize - !!toppc;
+    }
+    s_ = (uptr*)internal_alloc(MBlockStackTrace,
+                               (n_ + !!toppc) * sizeof(s_[0]));
+  }
+  for (uptr i = 0; i < n_; i++)
+    s_[i] = thr->shadow_stack[start + i];
+  if (toppc) {
+    s_[n_] = toppc;
+    n_++;
+  }
+}
+
+void StackTrace::CopyFrom(const StackTrace& other) {
+  Reset();
+  Init(other.Begin(), other.Size());
+}
+
+bool StackTrace::IsEmpty() const {
+  return n_ == 0;
+}
+
+uptr StackTrace::Size() const {
+  return n_;
+}
+
+uptr StackTrace::Get(uptr i) const {
+  CHECK_LT(i, n_);
+  return s_[i];
+}
+
+const uptr *StackTrace::Begin() const {
+  return s_;
+}
+
+}  // namespace __tsan
diff --git a/lib/tsan/rtl/tsan_stack_trace.h b/lib/tsan/rtl/tsan_stack_trace.h
new file mode 100644
index 0000000..fe82f6e
--- /dev/null
+++ b/lib/tsan/rtl/tsan_stack_trace.h
@@ -0,0 +1,48 @@
+//===-- tsan_stack_trace.h --------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of ThreadSanitizer (TSan), a race detector.
+//
+//===----------------------------------------------------------------------===//
+#ifndef TSAN_STACK_TRACE_H
+#define TSAN_STACK_TRACE_H
+
+#include "tsan_defs.h"
+
+namespace __tsan {
+
+class StackTrace {
+ public:
+  StackTrace();
+  // Initializes the object in "static mode";
+  // in this mode it never calls malloc/free but uses the provided buffer.
+  StackTrace(uptr *buf, uptr cnt);
+  ~StackTrace();
+  void Reset();
+
+  void Init(const uptr *pcs, uptr cnt);
+  void ObtainCurrent(ThreadState *thr, uptr toppc);
+  bool IsEmpty() const;
+  uptr Size() const;
+  uptr Get(uptr i) const;
+  const uptr *Begin() const;
+  void CopyFrom(const StackTrace& other);
+
+ private:
+  uptr n_;
+  uptr *s_;
+  const uptr c_;
+
+  StackTrace(const StackTrace&);
+  void operator = (const StackTrace&);
+};
+
+}  // namespace __tsan
+
+#endif  // TSAN_STACK_TRACE_H
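
The extracted StackTrace has two modes keyed on c_: constructed with a
caller-provided buffer it never allocates (and ObtainCurrent truncates to the
newest frames when the buffer is too small), while the default constructor
allocates via internal_alloc and Reset frees. A standalone analogue of the
two modes, with std malloc/free replacing the runtime allocator and an
illustrative class name:

    #include <cstdint>
    #include <cstdlib>
    #include <cstring>

    class MiniTrace {
     public:
      MiniTrace() : n_(0), s_(nullptr), c_(0) {}
      MiniTrace(uintptr_t *buf, size_t cnt) : n_(0), s_(buf), c_(cnt) {}
      ~MiniTrace() { Reset(); }

      void Reset() {
        if (s_ && !c_) {  // heap mode owns its buffer
          free(s_);
          s_ = nullptr;
        }
        n_ = 0;
      }

      void Init(const uintptr_t *pcs, size_t cnt) {
        Reset();
        if (!cnt) return;
        if (!c_)  // heap mode: allocate on demand; static mode: cnt <= c_
          s_ = (uintptr_t *)malloc(cnt * sizeof(uintptr_t));
        n_ = cnt;
        memcpy(s_, pcs, cnt * sizeof(uintptr_t));
      }

     private:
      size_t n_;
      uintptr_t *s_;
      const size_t c_;  // nonzero => "static mode" with a fixed buffer
    };

    int main() {
      uintptr_t pcs[3] = {1, 2, 3}, buf[8];
      MiniTrace heap_mode;            // allocates, frees in Reset()
      heap_mode.Init(pcs, 3);
      MiniTrace static_mode(buf, 8);  // never calls malloc/free
      static_mode.Init(pcs, 3);
      return 0;
    }
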
diff --git a/lib/tsan/rtl/tsan_stat.cc b/lib/tsan/rtl/tsan_stat.cc
index b42348a..350a2ba 100644
--- a/lib/tsan/rtl/tsan_stat.cc
+++ b/lib/tsan/rtl/tsan_stat.cc
@@ -37,6 +37,7 @@
   name[StatMop4]                         = "            size 4                ";
   name[StatMop8]                         = "            size 8                ";
   name[StatMopSame]                      = "  Including same                  ";
+  name[StatMopIgnored]                   = "  Including ignored               ";
   name[StatMopRange]                     = "  Including range                 ";
   name[StatMopRodata]                    = "  Including .rodata               ";
   name[StatMopRangeRodata]               = "  Including .rodata range         ";
diff --git a/lib/tsan/rtl/tsan_stat.h b/lib/tsan/rtl/tsan_stat.h
index 8cdf146..0bd949e 100644
--- a/lib/tsan/rtl/tsan_stat.h
+++ b/lib/tsan/rtl/tsan_stat.h
@@ -26,6 +26,7 @@
   StatMop4,
   StatMop8,
   StatMopSame,
+  StatMopIgnored,
   StatMopRange,
   StatMopRodata,
   StatMopRangeRodata,
diff --git a/lib/tsan/rtl/tsan_suppressions.cc b/lib/tsan/rtl/tsan_suppressions.cc
index 0ac2526..0670396 100644
--- a/lib/tsan/rtl/tsan_suppressions.cc
+++ b/lib/tsan/rtl/tsan_suppressions.cc
@@ -123,7 +123,8 @@
 
 uptr IsSuppressed(ReportType typ, const ReportStack *stack, Suppression **sp) {
   CHECK(g_ctx);
-  if (!g_ctx->SuppressionCount() || stack == 0) return 0;
+  if (!g_ctx->SuppressionCount() || stack == 0 || !stack->suppressable)
+    return 0;
   SuppressionType stype = conv(typ);
   if (stype == SuppressionNone)
     return 0;
@@ -144,7 +145,7 @@
 uptr IsSuppressed(ReportType typ, const ReportLocation *loc, Suppression **sp) {
   CHECK(g_ctx);
   if (!g_ctx->SuppressionCount() || loc == 0 ||
-      loc->type != ReportLocationGlobal)
+      loc->type != ReportLocationGlobal || !loc->suppressable)
     return 0;
   SuppressionType stype = conv(typ);
   if (stype == SuppressionNone)
diff --git a/lib/tsan/rtl/tsan_sync.cc b/lib/tsan/rtl/tsan_sync.cc
index 5d71f9f..10f52b4 100644
--- a/lib/tsan/rtl/tsan_sync.cc
+++ b/lib/tsan/rtl/tsan_sync.cc
@@ -19,293 +19,202 @@
 
 void DDMutexInit(ThreadState *thr, uptr pc, SyncVar *s);
 
-SyncVar::SyncVar(uptr addr, u64 uid)
-  : mtx(MutexTypeSyncVar, StatMtxSyncVar)
-  , addr(addr)
-  , uid(uid)
-  , creation_stack_id()
-  , owner_tid(kInvalidTid)
-  , last_lock()
-  , recursion()
-  , is_rw()
-  , is_recursive()
-  , is_broken()
-  , is_linker_init() {
+SyncVar::SyncVar()
+    : mtx(MutexTypeSyncVar, StatMtxSyncVar) {
+  Reset();
 }
 
-SyncTab::Part::Part()
-  : mtx(MutexTypeSyncTab, StatMtxSyncTab)
-  , val() {
+void SyncVar::Init(ThreadState *thr, uptr pc, uptr addr, u64 uid) {
+  this->addr = addr;
+  this->uid = uid;
+  this->next = 0;
+
+  creation_stack_id = 0;
+  if (kCppMode)  // Go does not use them
+    creation_stack_id = CurrentStackId(thr, pc);
+  if (flags()->detect_deadlocks)
+    DDMutexInit(thr, pc, this);
 }
 
-SyncTab::SyncTab() {
+void SyncVar::Reset() {
+  uid = 0;
+  creation_stack_id = 0;
+  owner_tid = kInvalidTid;
+  last_lock = 0;
+  recursion = 0;
+  is_rw = 0;
+  is_recursive = 0;
+  is_broken = 0;
+  is_linker_init = 0;
+
+  clock.Zero();
+  read_clock.Reset();
 }
 
-SyncTab::~SyncTab() {
-  for (int i = 0; i < kPartCount; i++) {
-    while (tab_[i].val) {
-      SyncVar *tmp = tab_[i].val;
-      tab_[i].val = tmp->next;
-      DestroyAndFree(tmp);
+MetaMap::MetaMap() {
+  atomic_store(&uid_gen_, 0, memory_order_relaxed);
+}
+
+void MetaMap::AllocBlock(ThreadState *thr, uptr pc, uptr p, uptr sz) {
+  u32 idx = block_alloc_.Alloc(&thr->block_cache);
+  MBlock *b = block_alloc_.Map(idx);
+  b->siz = sz;
+  b->tid = thr->tid;
+  b->stk = CurrentStackId(thr, pc);
+  u32 *meta = MemToMeta(p);
+  DCHECK_EQ(*meta, 0);
+  *meta = idx | kFlagBlock;
+}
+
+uptr MetaMap::FreeBlock(ThreadState *thr, uptr pc, uptr p) {
+  MBlock* b = GetBlock(p);
+  if (b == 0)
+    return 0;
+  uptr sz = RoundUpTo(b->siz, kMetaShadowCell);
+  FreeRange(thr, pc, p, sz);
+  return sz;
+}
+
+void MetaMap::FreeRange(ThreadState *thr, uptr pc, uptr p, uptr sz) {
+  u32 *meta = MemToMeta(p);
+  u32 *end = MemToMeta(p + sz);
+  if (end == meta)
+    end++;
+  for (; meta < end; meta++) {
+    u32 idx = *meta;
+    *meta = 0;
+    for (;;) {
+      if (idx == 0)
+        break;
+      if (idx & kFlagBlock) {
+        block_alloc_.Free(&thr->block_cache, idx & ~kFlagMask);
+        break;
+      } else if (idx & kFlagSync) {
+        DCHECK(idx & kFlagSync);
+        SyncVar *s = sync_alloc_.Map(idx & ~kFlagMask);
+        u32 next = s->next;
+        s->Reset();
+        sync_alloc_.Free(&thr->sync_cache, idx & ~kFlagMask);
+        idx = next;
+      } else {
+        CHECK(0);
+      }
     }
   }
 }
 
-SyncVar* SyncTab::GetOrCreateAndLock(ThreadState *thr, uptr pc,
-                                     uptr addr, bool write_lock) {
+MBlock* MetaMap::GetBlock(uptr p) {
+  u32 *meta = MemToMeta(p);
+  u32 idx = *meta;
+  for (;;) {
+    if (idx == 0)
+      return 0;
+    if (idx & kFlagBlock)
+      return block_alloc_.Map(idx & ~kFlagMask);
+    DCHECK(idx & kFlagSync);
+    SyncVar * s = sync_alloc_.Map(idx & ~kFlagMask);
+    idx = s->next;
+  }
+}
+
+SyncVar* MetaMap::GetOrCreateAndLock(ThreadState *thr, uptr pc,
+                              uptr addr, bool write_lock) {
   return GetAndLock(thr, pc, addr, write_lock, true);
 }
 
-SyncVar* SyncTab::GetIfExistsAndLock(uptr addr, bool write_lock) {
-  return GetAndLock(0, 0, addr, write_lock, false);
+SyncVar* MetaMap::GetIfExistsAndLock(uptr addr) {
+  return GetAndLock(0, 0, addr, true, false);
 }
 
-SyncVar* SyncTab::Create(ThreadState *thr, uptr pc, uptr addr) {
-  StatInc(thr, StatSyncCreated);
-  void *mem = internal_alloc(MBlockSync, sizeof(SyncVar));
-  const u64 uid = atomic_fetch_add(&uid_gen_, 1, memory_order_relaxed);
-  SyncVar *res = new(mem) SyncVar(addr, uid);
-  res->creation_stack_id = 0;
-  if (!kGoMode)  // Go does not use them
-    res->creation_stack_id = CurrentStackId(thr, pc);
-  if (flags()->detect_deadlocks)
-    DDMutexInit(thr, pc, res);
-  return res;
-}
-
-SyncVar* SyncTab::GetAndLock(ThreadState *thr, uptr pc,
+SyncVar* MetaMap::GetAndLock(ThreadState *thr, uptr pc,
                              uptr addr, bool write_lock, bool create) {
-#ifndef TSAN_GO
-  {  // NOLINT
-    SyncVar *res = GetJavaSync(thr, pc, addr, write_lock, create);
-    if (res)
-      return res;
-  }
-
-  // Here we ask only PrimaryAllocator, because
-  // SecondaryAllocator::PointerIsMine() is slow and we have fallback on
-  // the hashmap anyway.
-  if (PrimaryAllocator::PointerIsMine((void*)addr)) {
-    MBlock *b = user_mblock(thr, (void*)addr);
-    CHECK_NE(b, 0);
-    MBlock::ScopedLock l(b);
-    SyncVar *res = 0;
-    for (res = b->ListHead(); res; res = res->next) {
-      if (res->addr == addr)
+  u32 *meta = MemToMeta(addr);
+  u32 idx0 = *meta;
+  u32 myidx = 0;
+  SyncVar *mys = 0;
+  for (;;) {
+    u32 idx = idx0;
+    for (;;) {
+      if (idx == 0)
         break;
-    }
-    if (res == 0) {
-      if (!create)
-        return 0;
-      res = Create(thr, pc, addr);
-      b->ListPush(res);
-    }
-    if (write_lock)
-      res->mtx.Lock();
-    else
-      res->mtx.ReadLock();
-    return res;
-  }
-#endif
-
-  Part *p = &tab_[PartIdx(addr)];
-  {
-    ReadLock l(&p->mtx);
-    for (SyncVar *res = p->val; res; res = res->next) {
-      if (res->addr == addr) {
+      if (idx & kFlagBlock)
+        break;
+      DCHECK(idx & kFlagSync);
+      SyncVar * s = sync_alloc_.Map(idx & ~kFlagMask);
+      if (s->addr == addr) {
+        if (myidx != 0) {
+          mys->Reset();
+          sync_alloc_.Free(&thr->sync_cache, myidx);
+        }
         if (write_lock)
-          res->mtx.Lock();
+          s->mtx.Lock();
         else
-          res->mtx.ReadLock();
-        return res;
+          s->mtx.ReadLock();
+        return s;
       }
+      idx = s->next;
+    }
+    if (!create)
+      return 0;
+    if (*meta != idx0) {
+      idx0 = *meta;
+      continue;
+    }
+
+    if (myidx == 0) {
+      const u64 uid = atomic_fetch_add(&uid_gen_, 1, memory_order_relaxed);
+      myidx = sync_alloc_.Alloc(&thr->sync_cache);
+      mys = sync_alloc_.Map(myidx);
+      mys->Init(thr, pc, addr, uid);
+    }
+    mys->next = idx0;
+    if (atomic_compare_exchange_strong((atomic_uint32_t*)meta, &idx0,
+        myidx | kFlagSync, memory_order_release)) {
+      if (write_lock)
+        mys->mtx.Lock();
+      else
+        mys->mtx.ReadLock();
+      return mys;
     }
   }
-  if (!create)
-    return 0;
-  {
-    Lock l(&p->mtx);
-    SyncVar *res = p->val;
-    for (; res; res = res->next) {
-      if (res->addr == addr)
+}
+
+void MetaMap::MoveMemory(uptr src, uptr dst, uptr sz) {
+  // src and dst can overlap,
+  // there are no concurrent accesses to the regions (e.g. stop-the-world).
+  CHECK_NE(src, dst);
+  CHECK_NE(sz, 0);
+  uptr diff = dst - src;
+  u32 *src_meta = MemToMeta(src);
+  u32 *dst_meta = MemToMeta(dst);
+  u32 *src_meta_end = MemToMeta(src + sz);
+  uptr inc = 1;
+  if (dst > src) {
+    src_meta = MemToMeta(src + sz) - 1;
+    dst_meta = MemToMeta(dst + sz) - 1;
+    src_meta_end = MemToMeta(src) - 1;
+    inc = -1;
+  }
+  for (; src_meta != src_meta_end; src_meta += inc, dst_meta += inc) {
+    CHECK_EQ(*dst_meta, 0);
+    u32 idx = *src_meta;
+    *src_meta = 0;
+    *dst_meta = idx;
+    // Patch the addresses in sync objects.
+    while (idx != 0) {
+      if (idx & kFlagBlock)
         break;
-    }
-    if (res == 0) {
-      res = Create(thr, pc, addr);
-      res->next = p->val;
-      p->val = res;
-    }
-    if (write_lock)
-      res->mtx.Lock();
-    else
-      res->mtx.ReadLock();
-    return res;
-  }
-}
-
-SyncVar* SyncTab::GetAndRemove(ThreadState *thr, uptr pc, uptr addr) {
-#ifndef TSAN_GO
-  {  // NOLINT
-    SyncVar *res = GetAndRemoveJavaSync(thr, pc, addr);
-    if (res)
-      return res;
-  }
-  if (PrimaryAllocator::PointerIsMine((void*)addr)) {
-    MBlock *b = user_mblock(thr, (void*)addr);
-    CHECK_NE(b, 0);
-    SyncVar *res = 0;
-    {
-      MBlock::ScopedLock l(b);
-      res = b->ListHead();
-      if (res) {
-        if (res->addr == addr) {
-          if (res->is_linker_init)
-            return 0;
-          b->ListPop();
-        } else {
-          SyncVar **prev = &res->next;
-          res = *prev;
-          while (res) {
-            if (res->addr == addr) {
-              if (res->is_linker_init)
-                return 0;
-              *prev = res->next;
-              break;
-            }
-            prev = &res->next;
-            res = *prev;
-          }
-        }
-        if (res) {
-          StatInc(thr, StatSyncDestroyed);
-          res->mtx.Lock();
-          res->mtx.Unlock();
-        }
-      }
-    }
-    return res;
-  }
-#endif
-
-  Part *p = &tab_[PartIdx(addr)];
-  SyncVar *res = 0;
-  {
-    Lock l(&p->mtx);
-    SyncVar **prev = &p->val;
-    res = *prev;
-    while (res) {
-      if (res->addr == addr) {
-        if (res->is_linker_init)
-          return 0;
-        *prev = res->next;
-        break;
-      }
-      prev = &res->next;
-      res = *prev;
+      CHECK(idx & kFlagSync);
+      SyncVar *s = sync_alloc_.Map(idx & ~kFlagMask);
+      s->addr += diff;
+      idx = s->next;
     }
   }
-  if (res) {
-    StatInc(thr, StatSyncDestroyed);
-    res->mtx.Lock();
-    res->mtx.Unlock();
-  }
-  return res;
 }
 
-int SyncTab::PartIdx(uptr addr) {
-  return (addr >> 3) % kPartCount;
-}
-
-StackTrace::StackTrace()
-    : n_()
-    , s_()
-    , c_() {
-}
-
-StackTrace::StackTrace(uptr *buf, uptr cnt)
-    : n_()
-    , s_(buf)
-    , c_(cnt) {
-  CHECK_NE(buf, 0);
-  CHECK_NE(cnt, 0);
-}
-
-StackTrace::~StackTrace() {
-  Reset();
-}
-
-void StackTrace::Reset() {
-  if (s_ && !c_) {
-    CHECK_NE(n_, 0);
-    internal_free(s_);
-    s_ = 0;
-  }
-  n_ = 0;
-}
-
-void StackTrace::Init(const uptr *pcs, uptr cnt) {
-  Reset();
-  if (cnt == 0)
-    return;
-  if (c_) {
-    CHECK_NE(s_, 0);
-    CHECK_LE(cnt, c_);
-  } else {
-    s_ = (uptr*)internal_alloc(MBlockStackTrace, cnt * sizeof(s_[0]));
-  }
-  n_ = cnt;
-  internal_memcpy(s_, pcs, cnt * sizeof(s_[0]));
-}
-
-void StackTrace::ObtainCurrent(ThreadState *thr, uptr toppc) {
-  Reset();
-  n_ = thr->shadow_stack_pos - thr->shadow_stack;
-  if (n_ + !!toppc == 0)
-    return;
-  uptr start = 0;
-  if (c_) {
-    CHECK_NE(s_, 0);
-    if (n_ + !!toppc > c_) {
-      start = n_ - c_ + !!toppc;
-      n_ = c_ - !!toppc;
-    }
-  } else {
-    // Cap potentially huge stacks.
-    if (n_ + !!toppc > kTraceStackSize) {
-      start = n_ - kTraceStackSize + !!toppc;
-      n_ = kTraceStackSize - !!toppc;
-    }
-    s_ = (uptr*)internal_alloc(MBlockStackTrace,
-                               (n_ + !!toppc) * sizeof(s_[0]));
-  }
-  for (uptr i = 0; i < n_; i++)
-    s_[i] = thr->shadow_stack[start + i];
-  if (toppc) {
-    s_[n_] = toppc;
-    n_++;
-  }
-}
-
-void StackTrace::CopyFrom(const StackTrace& other) {
-  Reset();
-  Init(other.Begin(), other.Size());
-}
-
-bool StackTrace::IsEmpty() const {
-  return n_ == 0;
-}
-
-uptr StackTrace::Size() const {
-  return n_;
-}
-
-uptr StackTrace::Get(uptr i) const {
-  CHECK_LT(i, n_);
-  return s_[i];
-}
-
-const uptr *StackTrace::Begin() const {
-  return s_;
+void MetaMap::OnThreadIdle(ThreadState *thr) {
+  block_alloc_.FlushCache(&thr->block_cache);
+  sync_alloc_.FlushCache(&thr->sync_cache);
 }
 
 }  // namespace __tsan
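
GetAndLock's creation path above is a classic lock-free insert-at-head: build
the node off to the side, point its next at the observed head, and
compare-and-swap the head; on failure, re-scan (another thread may have
inserted the same address) and retry, recycling the speculative node if it
loses. A minimal standalone sketch of that loop using std::atomic, with
integer payloads instead of SyncVar slab indices:

    #include <atomic>
    #include <cstdio>

    struct Node {
      int addr;
      Node *next;
    };

    static std::atomic<Node *> head{nullptr};

    // Returns the node for addr, inserting one if absent (lock-free).
    static Node *GetOrCreate(int addr) {
      Node *mine = nullptr;
      Node *h = head.load(std::memory_order_acquire);
      for (;;) {
        for (Node *n = h; n; n = n->next) {
          if (n->addr == addr) {
            delete mine;  // our speculative node lost the race: recycle it
            return n;
          }
        }
        if (!mine) mine = new Node{addr, nullptr};
        mine->next = h;
        // Publish; on failure h is reloaded and the new chain is re-scanned.
        if (head.compare_exchange_strong(h, mine, std::memory_order_release,
                                         std::memory_order_acquire))
          return mine;
      }
    }

    int main() {
      Node *a = GetOrCreate(1), *b = GetOrCreate(1);
      printf("%s\n", a == b ? "same node" : "bug");
      return 0;
    }
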
diff --git a/lib/tsan/rtl/tsan_sync.h b/lib/tsan/rtl/tsan_sync.h
index ed0ac59..7c8682f 100644
--- a/lib/tsan/rtl/tsan_sync.h
+++ b/lib/tsan/rtl/tsan_sync.h
@@ -16,46 +16,21 @@
 #include "sanitizer_common/sanitizer_atomic.h"
 #include "sanitizer_common/sanitizer_common.h"
 #include "sanitizer_common/sanitizer_deadlock_detector_interface.h"
-#include "tsan_clock.h"
 #include "tsan_defs.h"
+#include "tsan_clock.h"
 #include "tsan_mutex.h"
+#include "tsan_dense_alloc.h"
 
 namespace __tsan {
 
-class StackTrace {
- public:
-  StackTrace();
-  // Initialized the object in "static mode",
-  // in this mode it never calls malloc/free but uses the provided buffer.
-  StackTrace(uptr *buf, uptr cnt);
-  ~StackTrace();
-  void Reset();
-
-  void Init(const uptr *pcs, uptr cnt);
-  void ObtainCurrent(ThreadState *thr, uptr toppc);
-  bool IsEmpty() const;
-  uptr Size() const;
-  uptr Get(uptr i) const;
-  const uptr *Begin() const;
-  void CopyFrom(const StackTrace& other);
-
- private:
-  uptr n_;
-  uptr *s_;
-  const uptr c_;
-
-  StackTrace(const StackTrace&);
-  void operator = (const StackTrace&);
-};
-
 struct SyncVar {
-  explicit SyncVar(uptr addr, u64 uid);
+  SyncVar();
 
   static const int kInvalidTid = -1;
 
+  uptr addr;  // overwritten by DenseSlabAlloc freelist
   Mutex mtx;
-  uptr addr;
-  const u64 uid;  // Globally unique id.
+  u64 uid;  // Globally unique id.
   u32 creation_stack_id;
   int owner_tid;  // Set only by exclusive owners.
   u64 last_lock;
@@ -64,13 +39,16 @@
   bool is_recursive;
   bool is_broken;
   bool is_linker_init;
-  SyncVar *next;  // In SyncTab hashtable.
+  u32 next;  // in MetaMap
   DDMutex dd;
   SyncClock read_clock;  // Used for rw mutexes only.
   // The clock is placed last, so that it is situated on a different cache line
   // with the mtx. This reduces contention for hot sync objects.
   SyncClock clock;
 
+  void Init(ThreadState *thr, uptr pc, uptr addr, u64 uid);
+  void Reset();
+
   u64 GetId() const {
     // 47 lsb is addr, then 14 bits is low part of uid, then 3 zero bits.
     return GetLsb((u64)addr | (uid << 47), 61);
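
The bit layout in the GetId() comment above is easy to verify with a worked
example; PackId() and this local GetLsb() are illustrative stand-ins for the
real helpers, not part of the patch:

#include <cassert>
#include <cstdint>

// Keep the low `bits` bits of v (stand-in for GetLsb in tsan_defs.h).
static uint64_t GetLsb(uint64_t v, int bits) {
  return v & (((uint64_t)1 << bits) - 1);
}

// Bits 0..46: addr; bits 47..60: low 14 bits of uid; bits 61..63: zero.
static uint64_t PackId(uint64_t addr, uint64_t uid) {
  return GetLsb(addr | (uid << 47), 61);
}

int main() {
  uint64_t id = PackId(0x7f0000001000ULL, 5);
  assert((id & (((uint64_t)1 << 47) - 1)) == 0x7f0000001000ULL);  // addr bits
  assert((id >> 47) == 5);   // low 14 bits of uid
  assert((id >> 61) == 0);   // top 3 bits zero
  return 0;
}
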
@@ -85,40 +63,39 @@
   }
 };
 
-class SyncTab {
+/* MetaMap allows mapping arbitrary user pointers onto various descriptors.
+   Currently it maps pointers to heap block descriptors and sync var
+   descriptors. It uses 1/2 of the direct shadow, see tsan_platform.h.
+*/
+class MetaMap {
  public:
-  SyncTab();
-  ~SyncTab();
+  MetaMap();
+
+  void AllocBlock(ThreadState *thr, uptr pc, uptr p, uptr sz);
+  uptr FreeBlock(ThreadState *thr, uptr pc, uptr p);
+  void FreeRange(ThreadState *thr, uptr pc, uptr p, uptr sz);
+  MBlock* GetBlock(uptr p);
 
   SyncVar* GetOrCreateAndLock(ThreadState *thr, uptr pc,
                               uptr addr, bool write_lock);
-  SyncVar* GetIfExistsAndLock(uptr addr, bool write_lock);
+  SyncVar* GetIfExistsAndLock(uptr addr);
 
-  // If the SyncVar does not exist, returns 0.
-  SyncVar* GetAndRemove(ThreadState *thr, uptr pc, uptr addr);
+  void MoveMemory(uptr src, uptr dst, uptr sz);
 
-  SyncVar* Create(ThreadState *thr, uptr pc, uptr addr);
+  void OnThreadIdle(ThreadState *thr);
 
  private:
-  struct Part {
-    Mutex mtx;
-    SyncVar *val;
-    char pad[kCacheLineSize - sizeof(Mutex) - sizeof(SyncVar*)];  // NOLINT
-    Part();
-  };
-
-  // FIXME: Implement something more sane.
-  static const int kPartCount = 1009;
-  Part tab_[kPartCount];
+  static const u32 kFlagMask  = 3 << 30;
+  static const u32 kFlagBlock = 1 << 30;
+  static const u32 kFlagSync  = 2 << 30;
+  typedef DenseSlabAlloc<MBlock, 1<<16, 1<<12> BlockAlloc;
+  typedef DenseSlabAlloc<SyncVar, 1<<16, 1<<10> SyncAlloc;
+  BlockAlloc block_alloc_;
+  SyncAlloc sync_alloc_;
   atomic_uint64_t uid_gen_;
 
-  int PartIdx(uptr addr);
-
-  SyncVar* GetAndLock(ThreadState *thr, uptr pc,
-                      uptr addr, bool write_lock, bool create);
-
-  SyncTab(const SyncTab&);  // Not implemented.
-  void operator = (const SyncTab&);  // Not implemented.
+  SyncVar* GetAndLock(ThreadState *thr, uptr pc, uptr addr, bool write_lock,
+                      bool create);
 };
 
 }  // namespace __tsan
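
MetaMap stores 30-bit DenseSlabAlloc indices rather than raw pointers,
reserving the top two bits (kFlagBlock/kFlagSync) to tag what an index refers
to, and a freed object's first word is reused as the freelist link, which is
why SyncVar::addr is marked "overwritten by DenseSlabAlloc freelist". A
single-slab sketch of such an index-based allocator (the real DenseSlabAlloc
in tsan_dense_alloc.h is two-level and per-thread cached; everything below is
a simplified assumption):

#include <cstdint>
#include <cstring>

template <typename T, uint32_t kSize>
class IndexSlab {
 public:
  IndexSlab() : free_head_(0), used_(1) {}  // index 0 is reserved as "null"

  uint32_t Alloc() {
    if (free_head_) {                 // pop from the intrusive freelist
      uint32_t idx = free_head_;
      // A freed object's first word holds the next free index, so live
      // fields (e.g. SyncVar::addr) are clobbered while on the freelist.
      std::memcpy(&free_head_, &slab_[idx], sizeof(uint32_t));
      return idx;
    }
    return used_ < kSize ? used_++ : 0;  // 0 signals exhaustion here
  }

  void Free(uint32_t idx) {
    std::memcpy(&slab_[idx], &free_head_, sizeof(uint32_t));
    free_head_ = idx;
  }

  T *Map(uint32_t idx) { return &slab_[idx]; }

 private:
  T slab_[kSize];
  uint32_t free_head_;
  uint32_t used_;
};

The DenseSlabAlloc unit test added below exercises exactly this
Alloc/Map/Free cycle against the real two-level allocator.
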
diff --git a/lib/tsan/rtl/tsan_trace.h b/lib/tsan/rtl/tsan_trace.h
index 5ed0356..686160c 100644
--- a/lib/tsan/rtl/tsan_trace.h
+++ b/lib/tsan/rtl/tsan_trace.h
@@ -15,7 +15,7 @@
 
 #include "tsan_defs.h"
 #include "tsan_mutex.h"
-#include "tsan_sync.h"
+#include "tsan_stack_trace.h"
 #include "tsan_mutexset.h"
 
 namespace __tsan {
diff --git a/lib/tsan/rtl/tsan_update_shadow_word_inl.h b/lib/tsan/rtl/tsan_update_shadow_word_inl.h
index a11c9bc..c80e0a8 100644
--- a/lib/tsan/rtl/tsan_update_shadow_word_inl.h
+++ b/lib/tsan/rtl/tsan_update_shadow_word_inl.h
@@ -16,8 +16,7 @@
 do {
   StatInc(thr, StatShadowProcessed);
   const unsigned kAccessSize = 1 << kAccessSizeLog;
-  unsigned off = cur.ComputeSearchOffset();
-  u64 *sp = &shadow_mem[(idx + off) % kShadowCnt];
+  u64 *sp = &shadow_mem[idx];
   old = LoadShadow(sp);
   if (old.IsZero()) {
     StatInc(thr, StatShadowZero);
@@ -33,16 +32,6 @@
     // same thread?
     if (Shadow::TidsAreEqual(old, cur)) {
       StatInc(thr, StatShadowSameThread);
-      if (OldIsInSameSynchEpoch(old, thr)) {
-        if (old.IsRWNotWeaker(kAccessIsWrite, kIsAtomic)) {
-          // found a slot that holds effectively the same info
-          // (that is, same tid, same sync epoch and same size)
-          StatInc(thr, StatMopSame);
-          return;
-        }
-        StoreIfNotYetStored(sp, &store_word);
-        break;
-      }
       if (old.IsRWWeakerOrEqual(kAccessIsWrite, kIsAtomic))
         StoreIfNotYetStored(sp, &store_word);
       break;
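
The hunk above removes ComputeSearchOffset(): shadow slots are now probed in
fixed order, and the "same synch epoch" fast path is dropped entirely. A toy
version of the resulting fixed-order scan (the real shadow word encoding is
elided; a zero slot stands for "empty"):

#include <cstdint>

static const unsigned kShadowCnt = 4;  // slots per shadow cell (TSan default)

// Return the first empty slot, or -1 if the cell is full and the caller
// must evict; the old code instead started at a per-access offset.
int FindSlot(const uint64_t shadow_mem[]) {
  for (unsigned idx = 0; idx < kShadowCnt; idx++) {
    if (shadow_mem[idx] == 0)
      return (int)idx;
  }
  return -1;
}
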
diff --git a/lib/tsan/rtl/tsan_vector.h b/lib/tsan/rtl/tsan_vector.h
index ae84522..a7fb3fa 100644
--- a/lib/tsan/rtl/tsan_vector.h
+++ b/lib/tsan/rtl/tsan_vector.h
@@ -78,6 +78,10 @@
   }
 
   void Resize(uptr size) {
+    if (size == 0) {
+      end_ = begin_;
+      return;
+    }
     uptr old_size = Size();
     EnsureSize(size);
     if (old_size < size) {
@@ -100,7 +104,7 @@
       return;
     }
     uptr cap0 = last_ - begin_;
-    uptr cap = 2 * cap0;
+    uptr cap = cap0 * 5 / 4;  // 25% growth
     if (cap == 0)
       cap = 16;
     if (cap < size)
diff --git a/lib/tsan/tests/unit/tsan_dense_alloc_test.cc b/lib/tsan/tests/unit/tsan_dense_alloc_test.cc
new file mode 100644
index 0000000..fc9e4cb
--- /dev/null
+++ b/lib/tsan/tests/unit/tsan_dense_alloc_test.cc
@@ -0,0 +1,55 @@
+//===-- tsan_dense_alloc_test.cc ------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of ThreadSanitizer (TSan), a race detector.
+//
+//===----------------------------------------------------------------------===//
+#include "tsan_dense_alloc.h"
+#include "tsan_rtl.h"
+#include "tsan_mman.h"
+#include "gtest/gtest.h"
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <map>
+
+namespace __tsan {
+
+TEST(DenseSlabAlloc, Basic) {
+  typedef DenseSlabAlloc<int, 128, 128> Alloc;
+  typedef Alloc::Cache Cache;
+  typedef Alloc::IndexT IndexT;
+  const int N = 1000;
+
+  Alloc alloc;
+  Cache cache;
+  alloc.InitCache(&cache);
+
+  IndexT blocks[N];
+  for (int ntry = 0; ntry < 3; ntry++) {
+    for (int i = 0; i < N; i++) {
+      IndexT idx = alloc.Alloc(&cache);
+      blocks[i] = idx;
+      EXPECT_NE(idx, 0);
+      int *v = alloc.Map(idx);
+      *v = i;
+    }
+
+    for (int i = 0; i < N; i++) {
+      IndexT idx = blocks[i];
+      int *v = alloc.Map(idx);
+      EXPECT_EQ(*v, i);
+      alloc.Free(&cache, idx);
+    }
+
+    alloc.FlushCache(&cache);
+  }
+}
+
+}  // namespace __tsan
diff --git a/lib/tsan/tests/unit/tsan_mman_test.cc b/lib/tsan/tests/unit/tsan_mman_test.cc
index 5e39bea..0c4a8ff 100644
--- a/lib/tsan/tests/unit/tsan_mman_test.cc
+++ b/lib/tsan/tests/unit/tsan_mman_test.cc
@@ -11,20 +11,11 @@
 //
 //===----------------------------------------------------------------------===//
 #include <limits>
+#include <sanitizer/allocator_interface.h>
 #include "tsan_mman.h"
 #include "tsan_rtl.h"
 #include "gtest/gtest.h"
 
-extern "C" {
-uptr __tsan_get_current_allocated_bytes();
-uptr __tsan_get_heap_size();
-uptr __tsan_get_free_bytes();
-uptr __tsan_get_unmapped_bytes();
-uptr __tsan_get_estimated_allocated_size(uptr size);
-bool __tsan_get_ownership(void *p);
-uptr __tsan_get_allocated_size(void *p);
-}
-
 namespace __tsan {
 
 TEST(Mman, Internal) {
@@ -51,20 +42,8 @@
   char *p2 = (char*)user_alloc(thr, pc, 20);
   EXPECT_NE(p2, (char*)0);
   EXPECT_NE(p2, p);
-  MBlock *b = user_mblock(thr, p);
-  EXPECT_NE(b, (MBlock*)0);
-  EXPECT_EQ(b->Size(), (uptr)10);
-  MBlock *b2 = user_mblock(thr, p2);
-  EXPECT_NE(b2, (MBlock*)0);
-  EXPECT_EQ(b2->Size(), (uptr)20);
-  for (int i = 0; i < 10; i++) {
-    p[i] = 42;
-    EXPECT_EQ(b, user_mblock(thr, p + i));
-  }
-  for (int i = 0; i < 20; i++) {
-    ((char*)p2)[i] = 42;
-    EXPECT_EQ(b2, user_mblock(thr, p2 + i));
-  }
+  EXPECT_EQ(10U, user_alloc_usable_size(p));
+  EXPECT_EQ(20U, user_alloc_usable_size(p2));
   user_free(thr, pc, p);
   user_free(thr, pc, p2);
 }
@@ -119,43 +98,49 @@
   uptr pc = 0;
   char *p = (char*)user_alloc(thr, pc, 10);
   char *p2 = (char*)user_alloc(thr, pc, 20);
-  EXPECT_EQ(0U, user_alloc_usable_size(thr, pc, NULL));
-  EXPECT_EQ(10U, user_alloc_usable_size(thr, pc, p));
-  EXPECT_EQ(20U, user_alloc_usable_size(thr, pc, p2));
+  EXPECT_EQ(0U, user_alloc_usable_size(NULL));
+  EXPECT_EQ(10U, user_alloc_usable_size(p));
+  EXPECT_EQ(20U, user_alloc_usable_size(p2));
   user_free(thr, pc, p);
   user_free(thr, pc, p2);
+  EXPECT_EQ(0U, user_alloc_usable_size((void*)0x123));
 }
 
 TEST(Mman, Stats) {
   ThreadState *thr = cur_thread();
 
-  uptr alloc0 = __tsan_get_current_allocated_bytes();
-  uptr heap0 = __tsan_get_heap_size();
-  uptr free0 = __tsan_get_free_bytes();
-  uptr unmapped0 = __tsan_get_unmapped_bytes();
+  uptr alloc0 = __sanitizer_get_current_allocated_bytes();
+  uptr heap0 = __sanitizer_get_heap_size();
+  uptr free0 = __sanitizer_get_free_bytes();
+  uptr unmapped0 = __sanitizer_get_unmapped_bytes();
 
-  EXPECT_EQ(__tsan_get_estimated_allocated_size(10), (uptr)10);
-  EXPECT_EQ(__tsan_get_estimated_allocated_size(20), (uptr)20);
-  EXPECT_EQ(__tsan_get_estimated_allocated_size(100), (uptr)100);
+  EXPECT_EQ(10U, __sanitizer_get_estimated_allocated_size(10));
+  EXPECT_EQ(20U, __sanitizer_get_estimated_allocated_size(20));
+  EXPECT_EQ(100U, __sanitizer_get_estimated_allocated_size(100));
 
   char *p = (char*)user_alloc(thr, 0, 10);
-  EXPECT_EQ(__tsan_get_ownership(p), true);
-  EXPECT_EQ(__tsan_get_allocated_size(p), (uptr)10);
+  EXPECT_TRUE(__sanitizer_get_ownership(p));
+  EXPECT_EQ(10U, __sanitizer_get_allocated_size(p));
 
-  EXPECT_EQ(__tsan_get_current_allocated_bytes(), alloc0 + 16);
-  EXPECT_GE(__tsan_get_heap_size(), heap0);
-  EXPECT_EQ(__tsan_get_free_bytes(), free0);
-  EXPECT_EQ(__tsan_get_unmapped_bytes(), unmapped0);
+  EXPECT_EQ(alloc0 + 16, __sanitizer_get_current_allocated_bytes());
+  EXPECT_GE(__sanitizer_get_heap_size(), heap0);
+  EXPECT_EQ(free0, __sanitizer_get_free_bytes());
+  EXPECT_EQ(unmapped0, __sanitizer_get_unmapped_bytes());
 
   user_free(thr, 0, p);
 
-  EXPECT_EQ(__tsan_get_current_allocated_bytes(), alloc0);
-  EXPECT_GE(__tsan_get_heap_size(), heap0);
-  EXPECT_EQ(__tsan_get_free_bytes(), free0);
-  EXPECT_EQ(__tsan_get_unmapped_bytes(), unmapped0);
+  EXPECT_EQ(alloc0, __sanitizer_get_current_allocated_bytes());
+  EXPECT_GE(__sanitizer_get_heap_size(), heap0);
+  EXPECT_EQ(free0, __sanitizer_get_free_bytes());
+  EXPECT_EQ(unmapped0, __sanitizer_get_unmapped_bytes());
 }
 
 TEST(Mman, CallocOverflow) {
+#if TSAN_DEBUG
+  // EXPECT_DEATH clones a thread with a 4K stack,
+  // which is overflowed by tsan memory access functions in debug mode.
+  return;
+#endif
   size_t kArraySize = 4096;
   volatile size_t kMaxSizeT = std::numeric_limits<size_t>::max();
   volatile size_t kArraySize2 = kMaxSizeT / kArraySize + 10;
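
The test now goes through the public allocator interface declared in
<sanitizer/allocator_interface.h> instead of the removed __tsan_get_*
declarations. A minimal standalone sketch of the same queries (the compile
command is illustrative; any sanitizer that ships the common allocator
interface works):

// Build e.g. with: clang++ -fsanitize=thread demo.cc
#include <sanitizer/allocator_interface.h>
#include <cstdio>
#include <cstdlib>

int main() {
  void *p = malloc(10);
  // The same queries the updated test performs, via the shared interface.
  std::printf("owned=%d size=%zu\n",
              __sanitizer_get_ownership(p),
              __sanitizer_get_allocated_size(p));
  std::printf("allocated bytes=%zu\n",
              __sanitizer_get_current_allocated_bytes());
  free(p);
  return 0;
}
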
diff --git a/lib/tsan/tests/unit/tsan_sync_test.cc b/lib/tsan/tests/unit/tsan_sync_test.cc
index 1cfcf99..6f36c64 100644
--- a/lib/tsan/tests/unit/tsan_sync_test.cc
+++ b/lib/tsan/tests/unit/tsan_sync_test.cc
@@ -12,53 +12,112 @@
 //===----------------------------------------------------------------------===//
 #include "tsan_sync.h"
 #include "tsan_rtl.h"
-#include "tsan_mman.h"
 #include "gtest/gtest.h"
 
-#include <stdlib.h>
-#include <stdint.h>
-#include <map>
-
 namespace __tsan {
 
-TEST(Sync, Table) {
-  const uintptr_t kIters = 512*1024;
-  const uintptr_t kRange = 10000;
-
+TEST(MetaMap, Basic) {
   ThreadState *thr = cur_thread();
-  uptr pc = 0;
+  MetaMap *m = &ctx->metamap;
+  u64 block[1] = {};  // fake malloc block
+  m->AllocBlock(thr, 0, (uptr)&block[0], 1 * sizeof(u64));
+  MBlock *mb = m->GetBlock((uptr)&block[0]);
+  EXPECT_NE(mb, (MBlock*)0);
+  EXPECT_EQ(mb->siz, 1 * sizeof(u64));
+  EXPECT_EQ(mb->tid, thr->tid);
+  uptr sz = m->FreeBlock(thr, 0, (uptr)&block[0]);
+  EXPECT_EQ(sz, 1 * sizeof(u64));
+  mb = m->GetBlock((uptr)&block[0]);
+  EXPECT_EQ(mb, (MBlock*)0);
+}
 
-  SyncTab tab;
-  SyncVar *golden[kRange] = {};
-  unsigned seed = 0;
-  for (uintptr_t i = 0; i < kIters; i++) {
-    uintptr_t addr = rand_r(&seed) % (kRange - 1) + 1;
-    if (rand_r(&seed) % 2) {
-      // Get or add.
-      SyncVar *v = tab.GetOrCreateAndLock(thr, pc, addr, true);
-      EXPECT_TRUE(golden[addr] == 0 || golden[addr] == v);
-      EXPECT_EQ(v->addr, addr);
-      golden[addr] = v;
-      v->mtx.Unlock();
-    } else {
-      // Remove.
-      SyncVar *v = tab.GetAndRemove(thr, pc, addr);
-      EXPECT_EQ(golden[addr], v);
-      if (v) {
-        EXPECT_EQ(v->addr, addr);
-        golden[addr] = 0;
-        DestroyAndFree(v);
-      }
-    }
-  }
-  for (uintptr_t addr = 0; addr < kRange; addr++) {
-    if (golden[addr] == 0)
-      continue;
-    SyncVar *v = tab.GetAndRemove(thr, pc, addr);
-    EXPECT_EQ(v, golden[addr]);
-    EXPECT_EQ(v->addr, addr);
-    DestroyAndFree(v);
-  }
+TEST(MetaMap, FreeRange) {
+  ThreadState *thr = cur_thread();
+  MetaMap *m = &ctx->metamap;
+  u64 block[4] = {};  // fake malloc block
+  m->AllocBlock(thr, 0, (uptr)&block[0], 1 * sizeof(u64));
+  m->AllocBlock(thr, 0, (uptr)&block[1], 3 * sizeof(u64));
+  MBlock *mb1 = m->GetBlock((uptr)&block[0]);
+  EXPECT_EQ(mb1->siz, 1 * sizeof(u64));
+  MBlock *mb2 = m->GetBlock((uptr)&block[1]);
+  EXPECT_EQ(mb2->siz, 3 * sizeof(u64));
+  m->FreeRange(thr, 0, (uptr)&block[0], 4 * sizeof(u64));
+  mb1 = m->GetBlock((uptr)&block[0]);
+  EXPECT_EQ(mb1, (MBlock*)0);
+  mb2 = m->GetBlock((uptr)&block[1]);
+  EXPECT_EQ(mb2, (MBlock*)0);
+}
+
+TEST(MetaMap, Sync) {
+  ThreadState *thr = cur_thread();
+  MetaMap *m = &ctx->metamap;
+  u64 block[4] = {};  // fake malloc block
+  m->AllocBlock(thr, 0, (uptr)&block[0], 4 * sizeof(u64));
+  SyncVar *s1 = m->GetIfExistsAndLock((uptr)&block[0]);
+  EXPECT_EQ(s1, (SyncVar*)0);
+  s1 = m->GetOrCreateAndLock(thr, 0, (uptr)&block[0], true);
+  EXPECT_NE(s1, (SyncVar*)0);
+  EXPECT_EQ(s1->addr, (uptr)&block[0]);
+  s1->mtx.Unlock();
+  SyncVar *s2 = m->GetOrCreateAndLock(thr, 0, (uptr)&block[1], false);
+  EXPECT_NE(s2, (SyncVar*)0);
+  EXPECT_EQ(s2->addr, (uptr)&block[1]);
+  s2->mtx.ReadUnlock();
+  m->FreeBlock(thr, 0, (uptr)&block[0]);
+  s1 = m->GetIfExistsAndLock((uptr)&block[0]);
+  EXPECT_EQ(s1, (SyncVar*)0);
+  s2 = m->GetIfExistsAndLock((uptr)&block[1]);
+  EXPECT_EQ(s2, (SyncVar*)0);
+  m->OnThreadIdle(thr);
+}
+
+TEST(MetaMap, MoveMemory) {
+  ThreadState *thr = cur_thread();
+  MetaMap *m = &ctx->metamap;
+  u64 block1[4] = {};  // fake malloc block
+  u64 block2[4] = {};  // fake malloc block
+  m->AllocBlock(thr, 0, (uptr)&block1[0], 3 * sizeof(u64));
+  m->AllocBlock(thr, 0, (uptr)&block1[3], 1 * sizeof(u64));
+  SyncVar *s1 = m->GetOrCreateAndLock(thr, 0, (uptr)&block1[0], true);
+  s1->mtx.Unlock();
+  SyncVar *s2 = m->GetOrCreateAndLock(thr, 0, (uptr)&block1[1], true);
+  s2->mtx.Unlock();
+  m->MoveMemory((uptr)&block1[0], (uptr)&block2[0], 4 * sizeof(u64));
+  MBlock *mb1 = m->GetBlock((uptr)&block1[0]);
+  EXPECT_EQ(mb1, (MBlock*)0);
+  MBlock *mb2 = m->GetBlock((uptr)&block1[3]);
+  EXPECT_EQ(mb2, (MBlock*)0);
+  mb1 = m->GetBlock((uptr)&block2[0]);
+  EXPECT_NE(mb1, (MBlock*)0);
+  EXPECT_EQ(mb1->siz, 3 * sizeof(u64));
+  mb2 = m->GetBlock((uptr)&block2[3]);
+  EXPECT_NE(mb2, (MBlock*)0);
+  EXPECT_EQ(mb2->siz, 1 * sizeof(u64));
+  s1 = m->GetIfExistsAndLock((uptr)&block1[0]);
+  EXPECT_EQ(s1, (SyncVar*)0);
+  s2 = m->GetIfExistsAndLock((uptr)&block1[1]);
+  EXPECT_EQ(s2, (SyncVar*)0);
+  s1 = m->GetIfExistsAndLock((uptr)&block2[0]);
+  EXPECT_NE(s1, (SyncVar*)0);
+  EXPECT_EQ(s1->addr, (uptr)&block2[0]);
+  s1->mtx.Unlock();
+  s2 = m->GetIfExistsAndLock((uptr)&block2[1]);
+  EXPECT_NE(s2, (SyncVar*)0);
+  EXPECT_EQ(s2->addr, (uptr)&block2[1]);
+  s2->mtx.Unlock();
+  m->FreeRange(thr, 0, (uptr)&block2[0], 4 * sizeof(u64));
+}
+
+TEST(MetaMap, ResetSync) {
+  ThreadState *thr = cur_thread();
+  MetaMap *m = &ctx->metamap;
+  u64 block[1] = {};  // fake malloc block
+  m->AllocBlock(thr, 0, (uptr)&block[0], 1 * sizeof(u64));
+  SyncVar *s = m->GetOrCreateAndLock(thr, 0, (uptr)&block[0], true);
+  s->Reset();
+  s->mtx.Unlock();
+  uptr sz = m->FreeBlock(thr, 0, (uptr)&block[0]);
+  EXPECT_EQ(sz, 1 * sizeof(u64));
 }
 
 }  // namespace __tsan