support for TLS in dynamic-loaded (dlopen) modules

unlike other implementations, this one reserves memory for new TLS in
all pre-existing threads at dlopen-time, and dlopen will fail with no
resources consumed and no new libraries loaded if memory is not
available. memory is not immediately distributed to running threads;
that would be too complex and too costly. instead, assurances are made
that threads needing the new TLS can obtain it in an async-signal-safe
way from a buffer belonging to the dynamic linker/new module (via
atomic fetch-and-add based allocator).

I've re-appropriated the lock that was previously used for __synccall
(synchronizing set*id() syscalls between threads) as a general
pthread_create lock. it's a "backwards" rwlock where the "read"
operation is safe atomic modification of the live thread count, which
multiple threads can perform at the same time, and the "write"
operation is making sure the count does not increase during an
operation that depends on it remaining bounded (__synccall or dlopen).
in static-linked programs that don't use __synccall, this lock is a
no-op and has no cost.
diff --git a/src/thread/pthread_create.c b/src/thread/pthread_create.c
index f53fc1b..92ce9ff 100644
--- a/src/thread/pthread_create.c
+++ b/src/thread/pthread_create.c
@@ -4,8 +4,8 @@
 static void dummy_0()
 {
 }
-weak_alias(dummy_0, __synccall_lock);
-weak_alias(dummy_0, __synccall_unlock);
+weak_alias(dummy_0, __acquire_ptc);
+weak_alias(dummy_0, __release_ptc);
 weak_alias(dummy_0, __pthread_tsd_run_dtors);
 
 _Noreturn void pthread_exit(void *result)
@@ -84,7 +84,7 @@
 	if (f && f->lock<0) f->lock = 0;
 }
 
-void *__copy_tls(unsigned char *, size_t);
+void *__copy_tls(unsigned char *);
 
 int pthread_create(pthread_t *restrict res, const pthread_attr_t *restrict attr, void *(*entry)(void *), void *restrict arg)
 {
@@ -94,8 +94,6 @@
 	struct pthread *self = pthread_self(), *new;
 	unsigned char *map, *stack, *tsd;
 	unsigned flags = 0x7d8f00;
-	size_t tls_cnt = libc.tls_cnt;
-	size_t tls_size = libc.tls_size;
 
 	if (!self) return ENOSYS;
 	if (!libc.threaded) {
@@ -107,6 +105,8 @@
 		libc.threaded = 1;
 	}
 
+	__acquire_ptc();
+
 	if (attr && attr->_a_stackaddr) {
 		map = 0;
 		tsd = (void *)(attr->_a_stackaddr-__pthread_tsd_size & -16);
@@ -114,7 +114,7 @@
 		if (attr) {
 			guard = ROUND(attr->_a_guardsize + DEFAULT_GUARD_SIZE);
 			size = guard + ROUND(attr->_a_stacksize
-				+ DEFAULT_STACK_SIZE + tls_size);
+				+ DEFAULT_STACK_SIZE + libc.tls_size);
 		}
 		size += __pthread_tsd_size;
 		if (guard) {
@@ -130,7 +130,7 @@
 		}
 		tsd = map + size - __pthread_tsd_size;
 	}
-	new = __copy_tls(tsd - tls_size, tls_cnt);
+	new = __copy_tls(tsd - libc.tls_size);
 	new->map_base = map;
 	new->map_size = size;
 	new->pid = self->pid;
@@ -147,12 +147,10 @@
 	new->canary = self->canary ^ (uintptr_t)&new;
 	stack = (void *)new;
 
-	__synccall_lock();
-
 	a_inc(&libc.threads_minus_1);
 	ret = __clone(start, stack, flags, new, &new->tid, new, &new->tid);
 
-	__synccall_unlock();
+	__release_ptc();
 
 	if (ret < 0) {
 		a_dec(&libc.threads_minus_1);