[WebAssembly] Implement thread-local storage (local-exec model)

Summary:
Thread local variables are placed inside a `.tdata` segment. Their symbols are
offsets from the start of the segment. The address of a thread local variable
is computed as `__tls_base` + the offset from the start of the segment.

`.tdata` segment is a passive segment and `memory.init` is used once per thread
to initialize the thread local storage.

`__tls_base` is a wasm global. Since each thread has its own wasm instance,
it is effectively thread local. Currently, `__tls_base` must be initialized
at thread startup, and so cannot be used with dynamic libraries.

`__tls_base` is to be initialized with a new linker-synthesized function,
`__wasm_init_tls`, which takes as an argument a block of memory to use as the
storage for thread locals. It then initializes the block of memory and sets
`__tls_base`. As `__wasm_init_tls` will handle the memory initialization,
the memory does not have to be zeroed.

To help allocating memory for thread-local storage, a new compiler intrinsic
is introduced: `__builtin_wasm_tls_size()`. This instrinsic function returns
the size of the thread-local storage for the current function.

The expected usage is to run something like the following upon thread startup:

    __wasm_init_tls(malloc(__builtin_wasm_tls_size()));

Reviewers: tlively, aheejin, kripken, sbc100

Subscribers: dschuff, jgravelle-google, hiraditya, sunfish, jfb, cfe-commits, llvm-commits

Tags: #clang, #llvm

Differential Revision: https://reviews.llvm.org/D64537

llvm-svn: 366272
diff --git a/lld/wasm/Writer.cpp b/lld/wasm/Writer.cpp
index 77a29a2..23a63ed 100644
--- a/lld/wasm/Writer.cpp
+++ b/lld/wasm/Writer.cpp
@@ -57,6 +57,7 @@
   void createInitMemoryFunction();
   void createApplyRelocationsFunction();
   void createCallCtorsFunction();
+  void createInitTLSFunction();
 
   void assignIndexes();
   void populateSymtab();
@@ -242,6 +243,11 @@
     log(formatv("mem: {0,-15} offset={1,-8} size={2,-8} align={3}", seg->name,
                 memoryPtr, seg->size, seg->alignment));
     memoryPtr += seg->size;
+
+    if (WasmSym::tlsSize && seg->name == ".tdata") {
+      auto *tlsSize = cast<DefinedGlobal>(WasmSym::tlsSize);
+      tlsSize->global->global.InitExpr.Value.Int32 = seg->size;
+    }
   }
 
   // TODO: Add .bss space here.
@@ -353,6 +359,7 @@
   StringMap<std::string> used;
   StringMap<std::string> required;
   StringMap<std::string> disallowed;
+  bool tlsUsed = false;
 
   // Only infer used features if user did not specify features
   bool inferFeatures = !config->features.hasValue();
@@ -385,6 +392,14 @@
               std::to_string(feature.Prefix));
       }
     }
+
+    for (InputSegment *segment : file->segments) {
+      if (!segment->live)
+        continue;
+      StringRef name = segment->getName();
+      if (name.startswith(".tdata") || name.startswith(".tbss"))
+        tlsUsed = true;
+    }
   }
 
   if (inferFeatures)
@@ -411,6 +426,10 @@
     error("'bulk-memory' feature must be used in order to emit passive "
           "segments");
 
+  if (!used.count("bulk-memory") && tlsUsed)
+    error("'bulk-memory' feature must be used in order to use thread-local "
+          "storage");
+
   // Validate that used features are allowed in output
   if (!inferFeatures) {
     for (auto &feature : used.keys()) {
@@ -492,8 +511,8 @@
       // implement in all major browsers.
       // See: https://github.com/WebAssembly/mutable-global
       if (g->getGlobalType()->Mutable) {
-        // Only the __stack_pointer should ever be create as mutable.
-        assert(g == WasmSym::stackPointer);
+        // Only __stack_pointer and __tls_base should ever be create as mutable.
+        assert(g == WasmSym::stackPointer || g == WasmSym::tlsBase);
         continue;
       }
       export_ = {name, WASM_EXTERNAL_GLOBAL, g->getGlobalIndex()};
@@ -602,6 +621,11 @@
   // we only have a single __memory_base to use as our base address.
   if (config->isPic)
     return ".data";
+  // We only support one thread-local segment, so we must merge the segments
+  // despite --no-merge-data-segments.
+  // We also need to merge .tbss into .tdata so they share the same offsets.
+  if (name.startswith(".tdata") || name.startswith(".tbss"))
+    return ".tdata";
   if (!config->mergeDataSegments)
     return name;
   if (name.startswith(".text."))
@@ -625,7 +649,7 @@
       if (s == nullptr) {
         LLVM_DEBUG(dbgs() << "new segment: " << name << "\n");
         s = make<OutputSegment>(name, segments.size());
-        if (config->passiveSegments)
+        if (config->passiveSegments || name == ".tdata")
           s->initFlags = WASM_SEGMENT_IS_PASSIVE;
         segments.push_back(s);
       }
@@ -655,7 +679,7 @@
 
     // initialize passive data segments
     for (const OutputSegment *s : segments) {
-      if (s->initFlags & WASM_SEGMENT_IS_PASSIVE) {
+      if (s->initFlags & WASM_SEGMENT_IS_PASSIVE && s->name != ".tdata") {
         // destination address
         writeU8(os, WASM_OPCODE_I32_CONST, "i32.const");
         writeSleb128(os, s->startVA, "destination address");
@@ -737,6 +761,49 @@
   createFunction(WasmSym::callCtors, bodyContent);
 }
 
+void Writer::createInitTLSFunction() {
+  if (!WasmSym::initTLS->isLive())
+    return;
+
+  std::string bodyContent;
+  {
+    raw_string_ostream os(bodyContent);
+
+    OutputSegment *tlsSeg = nullptr;
+    for (auto *seg : segments) {
+      if (seg->name == ".tdata")
+        tlsSeg = seg;
+      break;
+    }
+
+    writeUleb128(os, 0, "num locals");
+    if (tlsSeg) {
+      writeU8(os, WASM_OPCODE_LOCAL_GET, "local.get");
+      writeUleb128(os, 0, "local index");
+
+      writeU8(os, WASM_OPCODE_GLOBAL_SET, "global.set");
+      writeUleb128(os, WasmSym::tlsBase->getGlobalIndex(), "global index");
+
+      writeU8(os, WASM_OPCODE_LOCAL_GET, "local.get");
+      writeUleb128(os, 0, "local index");
+
+      writeU8(os, WASM_OPCODE_I32_CONST, "i32.const");
+      writeSleb128(os, 0, "segment offset");
+
+      writeU8(os, WASM_OPCODE_I32_CONST, "i32.const");
+      writeSleb128(os, tlsSeg->size, "memory region size");
+
+      writeU8(os, WASM_OPCODE_MISC_PREFIX, "bulk-memory prefix");
+      writeUleb128(os, WASM_OPCODE_MEMORY_INIT, "MEMORY.INIT");
+      writeUleb128(os, tlsSeg->index, "segment index immediate");
+      writeU8(os, 0, "memory index immediate");
+    }
+    writeU8(os, WASM_OPCODE_END, "end function");
+  }
+
+  createFunction(WasmSym::initTLS, bodyContent);
+}
+
 // Populate InitFunctions vector with init functions from all input objects.
 // This is then used either when creating the output linking section or to
 // synthesize the "__wasm_call_ctors" function.
@@ -829,6 +896,12 @@
     createCallCtorsFunction();
   }
 
+  if (config->sharedMemory && !config->shared)
+    createInitTLSFunction();
+
+  if (errorCount())
+    return;
+
   log("-- calculateTypes");
   calculateTypes();
   log("-- calculateExports");