add pthread_attr_setstack and pthread_attr_getstack interfaces

i originally omitted these (optional, per POSIX) interfaces because i
considered them backwards implementation details. however, someone
later brought to my attention a fairly legitimate use case: allocating
thread stacks in memory that's set up for sharing and/or fast transfer
between CPU and GPU so that the thread can move data to a GPU directly
from automatic-storage buffers without having to go through additional
buffer copies.

perhaps there are other situations in which these interfaces are
useful too.
diff --git a/src/thread/pthread_create.c b/src/thread/pthread_create.c
index 5b34e7e..48290d3 100644
--- a/src/thread/pthread_create.c
+++ b/src/thread/pthread_create.c
@@ -98,16 +98,20 @@
 		libc.threaded = 1;
 	}
 
-	if (attr) {
-		guard = ROUND(attr->_a_guardsize + DEFAULT_GUARD_SIZE);
-		size = guard + ROUND(attr->_a_stacksize + DEFAULT_STACK_SIZE);
+	if (attr && attr->_a_stackaddr) {
+		map = 0;
+		tsd = (void *)(attr->_a_stackaddr-__pthread_tsd_size & -16);
+	} else {
+		if (attr) {
+			guard = ROUND(attr->_a_guardsize + DEFAULT_GUARD_SIZE);
+			size = guard + ROUND(attr->_a_stacksize + DEFAULT_STACK_SIZE);
+		}
+		size += __pthread_tsd_size;
+		map = mmap(0, size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, 0);
+		if (map == MAP_FAILED) return EAGAIN;
+		if (guard) mprotect(map, guard, PROT_NONE);
+		tsd = map + size - __pthread_tsd_size;
 	}
-	size += __pthread_tsd_size;
-	map = mmap(0, size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, 0);
-	if (map == MAP_FAILED) return EAGAIN;
-	if (guard) mprotect(map, guard, PROT_NONE);
-
-	tsd = map + size - __pthread_tsd_size;
 	new = (void *)(tsd - sizeof *new - PAGE_SIZE%sizeof *new);
 	new->map_base = map;
 	new->map_size = size;