add proper fuxed-based locking for stdio

previously, stdio used spinlocks, which would be unacceptable if we
ever add support for thread priorities, and which yielded
pathologically bad performance if an application attempted to use
flockfile on a key file as a major/primary locking mechanism.

i had held off on making this change for fear that it would hurt
performance in the non-threaded case, but actually support for
recursive locking had already inflicted that cost. by having the
internal locking functions store a flag indicating whether they need
to perform unlocking, rather than using the actual recursive lock
counter, i was able to combine the conditionals at unlock time,
eliminating any additional cost, and also avoid a nasty corner case
where a huge number of calls to ftrylockfile could cause deadlock
later at the point of internal locking.

this commit also fixes some issues with usage of pthread_self
conflicting with __attribute__((const)) which resulted in crashes with
some compiler versions/optimizations, mainly in flockfile prior to
pthread_create.
diff --git a/src/internal/libc.h b/src/internal/libc.h
index 906de2c..929ff97 100644
--- a/src/internal/libc.h
+++ b/src/internal/libc.h
@@ -38,7 +38,8 @@
 
 /* Designed to avoid any overhead in non-threaded processes */
 void __lock(volatile int *);
-void __lockfile(FILE *);
+int __lockfile(FILE *);
+void __unlockfile(FILE *);
 #define LOCK(x) (libc.threads_minus_1 ? (__lock(x),1) : ((void)(x),1))
 #define UNLOCK(x) (*(volatile int *)(x)=0)
 
diff --git a/src/internal/pthread_impl.h b/src/internal/pthread_impl.h
index e6089f0..03af4c1 100644
--- a/src/internal/pthread_impl.h
+++ b/src/internal/pthread_impl.h
@@ -87,6 +87,8 @@
 #define SIGTIMER_SET ((sigset_t *)(unsigned long [1+(sizeof(long)==4)]){ \
 	 0x80000000 })
 
+pthread_t __pthread_self_init(void);
+
 int __set_thread_area(void *);
 int __libc_sigaction(int, const struct sigaction *, struct sigaction *);
 int __libc_sigprocmask(int, const sigset_t *, sigset_t *);
diff --git a/src/internal/stdio_impl.h b/src/internal/stdio_impl.h
index 76b58be..c5f45eb 100644
--- a/src/internal/stdio_impl.h
+++ b/src/internal/stdio_impl.h
@@ -24,8 +24,8 @@
 
 #define UNGET 8
 
-#define FLOCK(f) ((libc.threads_minus_1 && (f)->lock>=0) ? (__lockfile((f)),0) : 0)
-#define FUNLOCK(f) ((f)->lockcount && (--(f)->lockcount || ((f)->lock=0)))
+#define FLOCK(f) int __need_unlock = ((f)->lock>=0 ? __lockfile((f)) : 0)
+#define FUNLOCK(f) if (__need_unlock) __unlockfile((f)); else
 
 #define F_PERM 1
 #define F_NORD 4
@@ -49,12 +49,12 @@
 	FILE *prev, *next;
 	int fd;
 	int pipe_pid;
-	long dummy2;
+	long lockcount;
 	short dummy3;
 	signed char mode;
 	signed char lbf;
 	int lock;
-	int lockcount;
+	int waiters;
 	void *cookie;
 	off_t off;
 	int (*flush)(FILE *);
@@ -86,7 +86,6 @@
 
 #define OFLLOCK() LOCK(&libc.ofl_lock)
 #define OFLUNLOCK() UNLOCK(&libc.ofl_lock)
-#define ofl_head (libc.ofl_head)
 
 #define feof(f) ((f)->flags & F_EOF)
 #define ferror(f) ((f)->flags & F_ERR)
diff --git a/src/stdio/__fdopen.c b/src/stdio/__fdopen.c
index b13318e..07f966a 100644
--- a/src/stdio/__fdopen.c
+++ b/src/stdio/__fdopen.c
@@ -39,11 +39,13 @@
 	f->seek = __stdio_seek;
 	f->close = __stdio_close;
 
+	if (!libc.threaded) f->lock = -1;
+
 	/* Add new FILE to open file list */
 	OFLLOCK();
-	f->next = ofl_head;
-	if (ofl_head) ofl_head->prev = f;
-	ofl_head = f;
+	f->next = libc.ofl_head;
+	if (libc.ofl_head) libc.ofl_head->prev = f;
+	libc.ofl_head = f;
 	OFLUNLOCK();
 
 	return f;
diff --git a/src/stdio/__lockfile.c b/src/stdio/__lockfile.c
index 66a4d26..6ebf620 100644
--- a/src/stdio/__lockfile.c
+++ b/src/stdio/__lockfile.c
@@ -1,20 +1,18 @@
 #include "stdio_impl.h"
 #include "pthread_impl.h"
 
-void __lockfile(FILE *f)
+int __lockfile(FILE *f)
 {
-	int spins=10000;
-	int tid;
+	int owner, tid = __pthread_self()->tid;
+	if (f->lock == tid)
+		return 0;
+	while ((owner = a_cas(&f->lock, 0, tid)))
+		__wait(&f->lock, &f->waiters, owner, 1);
+	return f->lockcount = 1;
+}
 
-	if (f->lock < 0) return;
-	tid = __pthread_self()->tid;
-	if (f->lock == tid) {
-		while (f->lockcount == INT_MAX);
-		f->lockcount++;
-		return;
-	}
-	while (a_cas(&f->lock, 0, tid))
-		if (spins) spins--, a_spin();
-		else __syscall(SYS_sched_yield);
-	f->lockcount = 1;
+void __unlockfile(FILE *f)
+{
+	a_store(&f->lock, 0);
+	if (f->waiters) __wake(&f->lock, 1, 1);
 }
diff --git a/src/stdio/fclose.c b/src/stdio/fclose.c
index ee772fb..373a2c7 100644
--- a/src/stdio/fclose.c
+++ b/src/stdio/fclose.c
@@ -9,7 +9,7 @@
 		OFLLOCK();
 		if (f->prev) f->prev->next = f->next;
 		if (f->next) f->next->prev = f->prev;
-		if (ofl_head == f) ofl_head = f->next;
+		if (libc.ofl_head == f) libc.ofl_head = f->next;
 		OFLUNLOCK();
 	}
 
diff --git a/src/stdio/fflush.c b/src/stdio/fflush.c
index cdbd39b..4c1647b 100644
--- a/src/stdio/fflush.c
+++ b/src/stdio/fflush.c
@@ -22,8 +22,8 @@
 }
 
 /* stdout.c will override this if linked */
-static FILE *const __dummy = 0;
-weak_alias(__dummy, __stdout_to_flush);
+static FILE *const dummy = 0;
+weak_alias(dummy, __stdout_used);
 
 int fflush(FILE *f)
 {
@@ -37,13 +37,13 @@
 		return r;
 	}
 
-	r = __stdout_to_flush ? fflush(__stdout_to_flush) : 0;
+	r = __stdout_used ? fflush(__stdout_used) : 0;
 
 	OFLLOCK();
-	for (f=ofl_head; f; f=next) {
+	for (f=libc.ofl_head; f; f=next) {
 		FLOCK(f);
 		//OFLUNLOCK();
-		r |= __fflush_unlocked(f);
+		if (f->wpos > f->wbase) r |= __fflush_unlocked(f);
 		//OFLLOCK();
 		next = f->next;
 		FUNLOCK(f);
diff --git a/src/stdio/fgetc.c b/src/stdio/fgetc.c
index da63868..4d8aca3 100644
--- a/src/stdio/fgetc.c
+++ b/src/stdio/fgetc.c
@@ -3,9 +3,10 @@
 int fgetc(FILE *f)
 {
 	int c;
-	FLOCK(f);
+	if (f->lock < 0 || !__lockfile(f))
+		return getc_unlocked(f);
 	c = getc_unlocked(f);
-	FUNLOCK(f);
+	__unlockfile(f);
 	return c;
 }
 
diff --git a/src/stdio/fgetwc.c b/src/stdio/fgetwc.c
index 5e42059..6f9f9ec 100644
--- a/src/stdio/fgetwc.c
+++ b/src/stdio/fgetwc.c
@@ -34,7 +34,6 @@
 		if (l == -1) return WEOF;
 	}
 
-	FUNLOCK(f);
 	return wc;
 }
 
diff --git a/src/stdio/flockfile.c b/src/stdio/flockfile.c
index 0d4c92c..a196c1e 100644
--- a/src/stdio/flockfile.c
+++ b/src/stdio/flockfile.c
@@ -3,6 +3,8 @@
 
 void flockfile(FILE *f)
 {
-	if (!libc.threaded) pthread_self();
-	__lockfile(f);
+	while (ftrylockfile(f)) {
+		int owner = f->lock;
+		if (owner) __wait(&f->lock, &f->waiters, owner, 1);
+	}
 }
diff --git a/src/stdio/fputc.c b/src/stdio/fputc.c
index 98d0a20..6a144a5 100644
--- a/src/stdio/fputc.c
+++ b/src/stdio/fputc.c
@@ -2,9 +2,10 @@
 
 int fputc(int c, FILE *f)
 {
-	FLOCK(f);
+	if (f->lock < 0 || !__lockfile(f))
+		return putc_unlocked(c, f);
 	c = putc_unlocked(c, f);
-	FUNLOCK(f);
+	__unlockfile(f);
 	return c;
 }
 
diff --git a/src/stdio/ftell.c b/src/stdio/ftell.c
index aa1f538..3904a1d 100644
--- a/src/stdio/ftell.c
+++ b/src/stdio/ftell.c
@@ -3,10 +3,8 @@
 off_t __ftello_unlocked(FILE *f)
 {
 	off_t pos = f->seek(f, 0, SEEK_CUR);
-	if (pos < 0) {
-		FUNLOCK(f);
-		return pos;
-	}
+	if (pos < 0) return pos;
+
 	/* Adjust for data in buffer. */
 	return pos - (f->rend - f->rpos) + (f->wpos - f->wbase);
 }
diff --git a/src/stdio/ftrylockfile.c b/src/stdio/ftrylockfile.c
index 0b0e44a..725c4c3 100644
--- a/src/stdio/ftrylockfile.c
+++ b/src/stdio/ftrylockfile.c
@@ -5,11 +5,12 @@
 {
 	int tid = pthread_self()->tid;
 	if (f->lock == tid) {
-		if (f->lockcount == INT_MAX)
+		if (f->lockcount == LONG_MAX)
 			return -1;
 		f->lockcount++;
 		return 0;
 	}
+	if (f->lock < 0) f->lock = 0;
 	if (f->lock || a_cas(&f->lock, 0, tid))
 		return -1;
 	f->lockcount = 1;
diff --git a/src/stdio/funlockfile.c b/src/stdio/funlockfile.c
index d69f68e..f8a2a07 100644
--- a/src/stdio/funlockfile.c
+++ b/src/stdio/funlockfile.c
@@ -3,5 +3,5 @@
 
 void funlockfile(FILE *f)
 {
-	FUNLOCK(f);
+	if (!--f->lockcount) __unlockfile(f);
 }
diff --git a/src/stdio/stderr.c b/src/stdio/stderr.c
index 9a70700..3fd8f81 100644
--- a/src/stdio/stderr.c
+++ b/src/stdio/stderr.c
@@ -10,5 +10,7 @@
 	.write = __stdio_write,
 	.seek = __stdio_seek,
 	.close = __stdio_close,
+	.lock = -1,
 };
 FILE *const stderr = &f;
+FILE *const __stderr_used = &f;
diff --git a/src/stdio/stdin.c b/src/stdio/stdin.c
index 51c9923..476dc70 100644
--- a/src/stdio/stdin.c
+++ b/src/stdio/stdin.c
@@ -9,5 +9,7 @@
 	.read = __stdio_read,
 	.seek = __stdio_seek,
 	.close = __stdio_close,
+	.lock = -1,
 };
 FILE *const stdin = &f;
+FILE *const __stdin_used = &f;
diff --git a/src/stdio/stdout.c b/src/stdio/stdout.c
index 552d729..3855dd0 100644
--- a/src/stdio/stdout.c
+++ b/src/stdio/stdout.c
@@ -10,8 +10,7 @@
 	.write = __stdout_write,
 	.seek = __stdio_seek,
 	.close = __stdio_close,
+	.lock = -1,
 };
 FILE *const stdout = &f;
-
-/* overrides symbol in fflush.c, used for flushing NULL */
-FILE *const __stdout_to_flush = &f;
+FILE *const __stdout_used = &f;
diff --git a/src/thread/pthread_create.c b/src/thread/pthread_create.c
index 856015f..adef510 100644
--- a/src/thread/pthread_create.c
+++ b/src/thread/pthread_create.c
@@ -1,4 +1,5 @@
 #include "pthread_impl.h"
+#include "stdio_impl.h"
 
 static void dummy_0()
 {
@@ -60,6 +61,16 @@
 static const size_t dummy = 0;
 weak_alias(dummy, __pthread_tsd_size);
 
+static FILE *const dummy_file = 0;
+weak_alias(dummy_file, __stdin_used);
+weak_alias(dummy_file, __stdout_used);
+weak_alias(dummy_file, __stderr_used);
+
+static void init_file_lock(FILE *f)
+{
+	if (f && f->lock<0) f->lock = 0;
+}
+
 int pthread_create(pthread_t *res, const pthread_attr_t *attr, void *(*entry)(void *), void *arg)
 {
 	int ret;
@@ -70,6 +81,11 @@
 
 	if (!self) return ENOSYS;
 	if (!libc.threaded) {
+		for (FILE *f=libc.ofl_head; f; f=f->next)
+			init_file_lock(f);
+		init_file_lock(__stdin_used);
+		init_file_lock(__stdout_used);
+		init_file_lock(__stderr_used);
 		__syscall(SYS_rt_sigprocmask, SIG_UNBLOCK, SIGPT_SET, 0, 8);
 		libc.threaded = 1;
 	}
diff --git a/src/thread/pthread_key_create.c b/src/thread/pthread_key_create.c
index c9ca48a..e51cb02 100644
--- a/src/thread/pthread_key_create.c
+++ b/src/thread/pthread_key_create.c
@@ -14,7 +14,7 @@
 	unsigned i = (uintptr_t)&k / 16 % PTHREAD_KEYS_MAX;
 	unsigned j = i;
 
-	pthread_self();
+	__pthread_self_init();
 	if (!dtor) dtor = nodtor;
 	do {
 		if (!a_cas_p(keys+j, 0, dtor)) {
diff --git a/src/thread/pthread_self.c b/src/thread/pthread_self.c
index 55d20c9..9f885d9 100644
--- a/src/thread/pthread_self.c
+++ b/src/thread/pthread_self.c
@@ -35,3 +35,5 @@
 	}
 	return __pthread_self();
 }
+
+weak_alias(pthread_self, __pthread_self_init);