EABI syscall cleanup.

We cleaned up the auto-generated syscall stubs a while back so that they
do not touch the stack unnecessarily when they take <= 4 arguments. This
patch does the same for some of the hand-written syscall stubs.

Also improve comments in clone.S.

Change-Id: I8850bf98f2b26829385315304472a760e6880ed8
diff --git a/libc/arch-arm/bionic/clone.S b/libc/arch-arm/bionic/clone.S
index 01eb966..d00b6a6 100644
--- a/libc/arch-arm/bionic/clone.S
+++ b/libc/arch-arm/bionic/clone.S
@@ -32,33 +32,33 @@
 
 // int  __pthread_clone(void* (*fn)(void*), void* child_stack, int flags, void* arg);
 ENTRY(__pthread_clone)
-    # Copy the args onto the new stack.
+    # Push 'fn' and 'arg' onto 'child_stack'.
     stmdb   r1!, {r0, r3}
 
     # The sys_clone system call only takes two arguments: 'flags' and 'child_stack'.
     # 'child_stack' is already in r1, but we need to move 'flags' into position.
     mov     r0, r2
-    stmfd   sp!, {r4, r7}
 
     # System call.
+    mov     ip, r7                  // Keep the old r7 in ip instead of on the stack.
     ldr     r7, =__NR_clone
     swi     #0
+
+    # Child? (r0 == 0 branches to 1: below.)
     movs    r0, r0
     beq     1f
 
-    # In parent, reload saved registers then either return or set errno.
-    ldmfd   sp!, {r4, r7}
+    # Parent: put r7 back from ip, then either return or set errno.
+    mov     r7, ip
     cmn     r0, #(MAX_ERRNO + 1)
     bxls    lr
     neg     r0, r0
     b       __set_errno
 
-1:  # The child.
-    # pick the function arg and call address off the stack and jump
-    # to the C __thread_entry function which does some setup and then
-    # calls the thread's start function
+1:  # Child.
+    # Pop 'fn' and 'arg' back off the stack into r0 and r1, then branch to __thread_entry.
     pop     {r0, r1}
-    # __thread_entry needs the TLS pointer
+    # __thread_entry also needs our stack pointer, which is passed in r2.
     mov     r2, sp
     b       __thread_entry
 END(__pthread_clone)
diff --git a/libc/arch-arm/bionic/futex_arm.S b/libc/arch-arm/bionic/futex_arm.S
index e21a385..4131cdb 100644
--- a/libc/arch-arm/bionic/futex_arm.S
+++ b/libc/arch-arm/bionic/futex_arm.S
@@ -34,11 +34,10 @@
 
 // __futex_syscall3(*ftx, op, val)
 ENTRY(__futex_syscall3)
-    stmdb   sp!, {r4, r7}
-    .save   {r4, r7}
+    mov     ip, r7                  // Keep the old r7 in ip instead of on the stack.
     ldr     r7, =__NR_futex
     swi     #0
-    ldmia   sp!, {r4, r7}
+    mov     r7, ip                  // Put r7 back; this relies on ip surviving the swi.
     bx      lr
 END(__futex_syscall3)
 
@@ -49,25 +48,23 @@
 
 // __futex_wait(*ftx, val, *timespec)
 ENTRY(__futex_wait)
-    stmdb   sp!, {r4, r7}
-    .save   {r4, r7}
+    mov     ip, r7                  // Keep the old r7 in ip instead of on the stack.
     mov     r3, r2
     mov     r2, r1
     mov     r1, #FUTEX_WAIT
     ldr     r7, =__NR_futex
     swi     #0
-    ldmia   sp!, {r4, r7}
+    mov     r7, ip                  // Put r7 back; this relies on ip surviving the swi.
     bx      lr
 END(__futex_wait)
 
 // __futex_wake(*ftx, counter)
 ENTRY(__futex_wake)
-    .save   {r4, r7}
-    stmdb   sp!, {r4, r7}
+    mov     ip, r7                  // Keep the old r7 in ip instead of on the stack.
     mov     r2, r1
     mov     r1, #FUTEX_WAKE
     ldr     r7, =__NR_futex
     swi     #0
-    ldmia   sp!, {r4, r7}
+    mov     r7, ip                  // Put r7 back; this relies on ip surviving the swi.
     bx      lr
 END(__futex_wake)