tsan: speed up race deduplication

Race deduplication code proved to be a performance bottleneck in the past if suppressions/annotations are used, or just some races left unaddressed. And we still get user complaints about this:
https://groups.google.com/forum/#!topic/thread-sanitizer/hB0WyiTI4e4
ReportRace already has several layers of caching for racy pcs/addresses to make deduplication faster. However, ReportRace still takes a global mutex (ThreadRegistry and ReportMutex) during deduplication and also calls mmap/munmap (which take process-wide semaphore in kernel), this makes deduplication non-scalable.

This patch moves race deduplication outside of global mutexes and also removes all mmap/munmap calls.
As the result, race_stress.cc with 100 threads and 10000 iterations become 30x faster:

before:
real	0m21.673s
user	0m5.932s
sys	0m34.885s

after:
real	0m0.720s
user	0m23.646s
sys	0m1.254s

http://reviews.llvm.org/D12554

llvm-svn: 246758
diff --git a/compiler-rt/lib/tsan/rtl/tsan_interface_ann.cc b/compiler-rt/lib/tsan/rtl/tsan_interface_ann.cc
index fd3c846..62db796 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_interface_ann.cc
+++ b/compiler-rt/lib/tsan/rtl/tsan_interface_ann.cc
@@ -63,8 +63,8 @@
 struct ExpectRace {
   ExpectRace *next;
   ExpectRace *prev;
-  int hitcount;
-  int addcount;
+  atomic_uintptr_t hitcount;
+  atomic_uintptr_t addcount;
   uptr addr;
   uptr size;
   char *file;
@@ -90,7 +90,8 @@
   ExpectRace *race = list->next;
   for (; race != list; race = race->next) {
     if (race->addr == addr && race->size == size) {
-      race->addcount++;
+      atomic_store_relaxed(&race->addcount,
+          atomic_load_relaxed(&race->addcount) + 1);
       return;
     }
   }
@@ -100,8 +101,8 @@
   race->file = f;
   race->line = l;
   race->desc[0] = 0;
-  race->hitcount = 0;
-  race->addcount = 1;
+  atomic_store_relaxed(&race->hitcount, 0);
+  atomic_store_relaxed(&race->addcount, 1);
   if (desc) {
     int i = 0;
     for (; i < kMaxDescLen - 1 && desc[i]; i++)
@@ -130,7 +131,7 @@
     return false;
   DPrintf("Hit expected/benign race: %s addr=%zx:%d %s:%d\n",
       race->desc, race->addr, (int)race->size, race->file, race->line);
-  race->hitcount++;
+  atomic_fetch_add(&race->hitcount, 1, memory_order_relaxed);
   return true;
 }
 
@@ -146,7 +147,7 @@
 }
 
 bool IsExpectedReport(uptr addr, uptr size) {
-  Lock lock(&dyn_ann_ctx->mtx);
+  ReadLock lock(&dyn_ann_ctx->mtx);
   if (CheckContains(&dyn_ann_ctx->expect, addr, size))
     return true;
   if (CheckContains(&dyn_ann_ctx->benign, addr, size))
@@ -155,20 +156,21 @@
 }
 
 static void CollectMatchedBenignRaces(Vector<ExpectRace> *matched,
-    int *unique_count, int *hit_count, int ExpectRace::*counter) {
+    int *unique_count, int *hit_count, atomic_uintptr_t ExpectRace::*counter) {
   ExpectRace *list = &dyn_ann_ctx->benign;
   for (ExpectRace *race = list->next; race != list; race = race->next) {
     (*unique_count)++;
-    if (race->*counter == 0)
+    const uptr cnt = atomic_load_relaxed(&(race->*counter));
+    if (cnt == 0)
       continue;
-    (*hit_count) += race->*counter;
+    *hit_count += cnt;
     uptr i = 0;
     for (; i < matched->Size(); i++) {
       ExpectRace *race0 = &(*matched)[i];
       if (race->line == race0->line
           && internal_strcmp(race->file, race0->file) == 0
           && internal_strcmp(race->desc, race0->desc) == 0) {
-        race0->*counter += race->*counter;
+        atomic_fetch_add(&(race0->*counter), cnt, memory_order_relaxed);
         break;
       }
     }
@@ -193,8 +195,8 @@
         hit_count, (int)internal_getpid());
     for (uptr i = 0; i < hit_matched.Size(); i++) {
       Printf("%d %s:%d %s\n",
-          hit_matched[i].hitcount, hit_matched[i].file,
-          hit_matched[i].line, hit_matched[i].desc);
+          atomic_load_relaxed(&hit_matched[i].hitcount),
+          hit_matched[i].file, hit_matched[i].line, hit_matched[i].desc);
     }
   }
   if (hit_matched.Size()) {
@@ -203,8 +205,8 @@
         add_count, unique_count, (int)internal_getpid());
     for (uptr i = 0; i < add_matched.Size(); i++) {
       Printf("%d %s:%d %s\n",
-          add_matched[i].addcount, add_matched[i].file,
-          add_matched[i].line, add_matched[i].desc);
+          atomic_load_relaxed(&add_matched[i].addcount),
+          add_matched[i].file, add_matched[i].line, add_matched[i].desc);
     }
   }
 }
@@ -303,7 +305,7 @@
   Lock lock(&dyn_ann_ctx->mtx);
   while (dyn_ann_ctx->expect.next != &dyn_ann_ctx->expect) {
     ExpectRace *race = dyn_ann_ctx->expect.next;
-    if (race->hitcount == 0) {
+    if (atomic_load_relaxed(&race->hitcount) == 0) {
       ctx->nmissed_expected++;
       ReportMissedExpectedRace(race);
     }