align stack properly for calling global ctors/dtors on x86[_64]

failure to do so caused crashes on x86_64 whenever a ctor used SSE. the
problem was first observed with ctors that called variadic functions,
because SSE prologue code is inserted into every variadic function.
diff --git a/crt/i386/crtn.s b/crt/i386/crtn.s
index 055451e..f3b61e0 100644
--- a/crt/i386/crtn.s
+++ b/crt/i386/crtn.s
@@ -1,5 +1,7 @@
 .section .init
+	add $12,%esp
 	ret
 
 .section .fini
+	add $12,%esp
 	ret