overhaul clone syscall wrapping

several things have changed. first, i have removed the old __uniclone
function and replaced it with one using the "standard" linux
__clone/clone signature. this was necessary to expose clone to
applications anyway, and it makes it easier to port __clone to new
archs, since it is now testable independently of pthread_create.

secondly, i have removed all references to the ugly ldt descriptor
structure (i386 only) from the c code and the pthread structure. where
the descriptor is needed, it is now built on the stack, in assembly
code, at the point of use. thus, the i386 __clone function takes the
desired thread pointer as its argument, rather than an ldt descriptor
pointer, just like on all other sane archs. this should not affect
applications, since there is really no way an application can use
clone with threads/tls without horribly conflicting with, and
clobbering, the underlying implementation's use of them. applications
are expected to use clone only for creating actual processes, possibly
with new namespace features and the like.
diff --git a/src/internal/pthread_impl.h b/src/internal/pthread_impl.h
index 95ec948..26164d8 100644
--- a/src/internal/pthread_impl.h
+++ b/src/internal/pthread_impl.h
@@ -22,7 +22,6 @@
 
 struct pthread {
 	struct pthread *self;
-	unsigned long tlsdesc[4];
 	pid_t tid, pid;
 	int tsd_used, errno_val, *errno_ptr;
 	volatile uintptr_t cp_sp, cp_ip;
@@ -87,7 +86,7 @@
 
 pthread_t __pthread_self_init(void);
 
-int __uniclone(void *, void (*)(pthread_t), void *);
+int __clone(int (*)(void *), void *, int, void *, ...);
 int __set_thread_area(void *);
 int __libc_sigaction(int, const struct sigaction *, struct sigaction *);
 int __libc_sigprocmask(int, const sigset_t *, sigset_t *);
diff --git a/src/thread/__set_thread_area.c b/src/thread/__set_thread_area.c
index f2ac6e9..136be57 100644
--- a/src/thread/__set_thread_area.c
+++ b/src/thread/__set_thread_area.c
@@ -1,4 +1,4 @@
-#include <pthread_impl.h>
+#include "pthread_impl.h"
 
 int __set_thread_area(void *p)
 {
diff --git a/src/thread/clone.c b/src/thread/clone.c
index e69de29..339e28a 100644
--- a/src/thread/clone.c
+++ b/src/thread/clone.c
@@ -0,0 +1,10 @@
+#include <errno.h>
+#include "libc.h"
+
+int __clone(int (*func)(void *), void *stack, int flags, void *arg, ...)
+{
+	errno = ENOSYS;
+	return -1;
+}
+
+weak_alias(__clone, clone);
diff --git a/src/thread/forkall.c b/src/thread/forkall.c
index 403818e..6810ea5 100644
--- a/src/thread/forkall.c
+++ b/src/thread/forkall.c
@@ -1,3 +1,4 @@
+#if 0
 #include "pthread_impl.h"
 #include <setjmp.h>
 
@@ -64,3 +65,4 @@
 	__synccall(do_forkall, &c);
 	return c.pid;
 }
+#endif
diff --git a/src/thread/i386/__set_thread_area.s b/src/thread/i386/__set_thread_area.s
index a43525e..cccf1cd 100644
--- a/src/thread/i386/__set_thread_area.s
+++ b/src/thread/i386/__set_thread_area.s
@@ -2,20 +2,21 @@
 .global __set_thread_area
 .type   __set_thread_area,@function
 __set_thread_area:
-	pushl %ebx
-	movl 8(%esp),%ecx
-	movl $-1,4(%ecx)
-	movl %ecx,8(%ecx)
-	movl $0xfffff,12(%ecx)
-	movl $0x51,16(%ecx)
-	leal 4(%ecx),%ebx
-	movl $243,%eax
+	push %ebx
+	push $0x51
+	push $0xfffff
+	push 16(%esp)
+	push $-1
+	mov %esp,%ebx
+	xor %eax,%eax
+	mov $243,%al
 	int $128
-	popl %ebx
 	testl %eax,%eax
 	jnz 1f
-	movl 4(%ecx),%ecx
+	movl (%esp),%ecx
 	leal 3(,%ecx,8),%ecx
 	movw %cx,%gs
 1:
+	addl $16,%esp
+	popl %ebx
 	ret
diff --git a/src/thread/i386/clone.s b/src/thread/i386/clone.s
index 7af5f5d..bebf01a 100644
--- a/src/thread/i386/clone.s
+++ b/src/thread/i386/clone.s
@@ -1,26 +1,51 @@
 .text
-.global __uniclone
-.type   __uniclone,@function
-__uniclone:
-	movl	4(%esp),%ecx
-	subl	$24,%ecx
-	movl	8(%esp),%eax
-	movl	%eax,16(%ecx)
-	movl	12(%esp),%eax
-	movl	%eax,24(%ecx)
-	pushl	%ebx
-	pushl	%esi
-	pushl	%edi
-	pushl   %ebp
-	movl    %eax,8(%eax)
-	leal    20(%eax),%edx
-	leal    4(%eax),%esi
-	movl	%edx,%edi
-	movl	$0x7d0f00,%ebx
-	movl	$120,%eax
-	int	$128
-	popl    %ebp
-	popl	%edi
-	popl	%esi
-	popl	%ebx
+.global __clone
+.weak clone
+.type   __clone,@function
+.type   clone,@function
+__clone:
+clone:
+	push %ebp
+	mov %esp,%ebp
+	push %ebx
+	push %esi
+	push %edi
+
+	xor %eax,%eax
+	push $0x51
+	mov %gs,%ax
+	push $0xfffff
+	shr $3,%eax
+	push 28(%ebp)
+	push %eax
+	mov $120,%al
+
+	mov 12(%ebp),%ecx
+	mov 16(%ebp),%ebx
+	and $-16,%ecx
+	sub $16,%ecx
+	mov 20(%ebp),%edi
+	mov %edi,(%ecx)
+	mov 24(%ebp),%edx
+	mov %esp,%esi
+	mov 32(%ebp),%edi
+	mov 8(%ebp),%ebp
+	int $128
+	test %eax,%eax
+	jnz 1f
+
+	mov %ebp,%eax
+	xor %ebp,%ebp
+	call *%eax
+	mov %eax,%ebx
+	xor %eax,%eax
+	inc %eax
+	int $128
+	hlt
+
+1:	add $16,%esp
+	pop %edi
+	pop %esi
+	pop %ebx
+	pop %ebp
 	ret
diff --git a/src/thread/pthread_create.c b/src/thread/pthread_create.c
index bef2553..f7768d8 100644
--- a/src/thread/pthread_create.c
+++ b/src/thread/pthread_create.c
@@ -52,11 +52,13 @@
 	self->cancelbuf = self->cancelbuf->__next;
 }
 
-static void start(pthread_t self)
+static int start(void *p)
 {
+	pthread_t self = p;
 	if (self->unblock_cancel)
 		__syscall(SYS_rt_sigprocmask, SIG_UNBLOCK, SIGPT_SET, 0, 8);
 	pthread_exit(self->start(self->start_arg));
+	return 0;
 }
 
 #define ROUND(x) (((x)+PAGE_SIZE-1)&-PAGE_SIZE)
@@ -115,14 +117,12 @@
 	new->tsd = (void *)tsd;
 	if (attr) new->detached = attr->_a_detach;
 	new->unblock_cancel = self->cancel;
-	memcpy(new->tlsdesc, self->tlsdesc, sizeof new->tlsdesc);
-	new->tlsdesc[1] = (uintptr_t)new;
-	stack = (void *)((uintptr_t)new-1 & ~(uintptr_t)15);
+	stack = (void *)new;
 
 	__synccall_lock();
 
 	a_inc(&libc.threads_minus_1);
-	ret = __uniclone(stack, start, new);
+	ret = __clone(start, stack, 0x7d8f00, new, &new->tid, new, &new->tid);
 
 	__synccall_unlock();
 
diff --git a/src/thread/x86_64/clone.s b/src/thread/x86_64/clone.s
index bf128a4..4db081c 100644
--- a/src/thread/x86_64/clone.s
+++ b/src/thread/x86_64/clone.s
@@ -1,21 +1,30 @@
-/* Copyright 2011 Nicholas J. Kain, licensed GNU LGPL 2.1 or later */
 .text
-.global __uniclone
-.type   __uniclone,@function
-/* rdi = child_stack, rsi = start, rdx = pthread_struct */
-__uniclone:
-        subq    $8,%rsp         /* pad parent stack to prevent branch later */
-        subq    $24,%rdi        /* grow child_stack */
-        mov     %rsi,8(%rdi)    /* push start onto child_stack as return ptr */
-        mov     %rdx,0(%rdi)    /* push pthread_struct onto child_stack */
-        mov     %rdx,%r8        /* r8 = tls */
-        mov     %rdi,%rsi       /* rsi = child_stack */
-        leaq    40(%rdx),%r10   /* r10 = child_id */
-        movl    $56,%eax        /* clone syscall number */
-        movl    $0x7d0f00,%edi  /* rdi = flags */
-        mov     %r10,%rdx       /* rdx = parent_id */
-        syscall                 /* clone(flags, child_stack, parent_id,
-                                 *       child_id, tls) */
-        pop     %rdi            /* child stack: restore pthread_struct
-                                 * parent stack: undo rsp displacement */
-        ret
+.global __clone
+.weak clone
+.type   __clone,@function
+.type   clone,@function
+__clone:
+clone:
+	xor %eax,%eax
+	mov $56,%al
+	mov %rdi,%r11
+	mov %rdx,%rdi
+	mov %r8,%rdx
+	mov %r9,%r8
+	mov 8(%rsp),%r10
+	mov %r11,%r9
+	and $-16,%rsi
+	sub $8,%rsi
+	mov %rcx,(%rsi)
+	syscall
+	test %eax,%eax
+	jnz 1f
+	xor %ebp,%ebp
+	pop %rdi
+	call *%r9
+	mov %eax,%edi
+	xor %eax,%eax
+	mov $60,%al
+	syscall
+	hlt
+1:	ret