avr32/lib: fix unaligned memcpy()

memcpy(p, unaligned, n) with n >= 4 returns
(p + num_of_unaligned_bytes_copied) instead of p, because p is not
preserved in the unaligned case.

Noticed thanks to Herbert Xu's superior parameter-recycling coding
technique, which made the md4 self-test fail on avr32.

Signed-off-by: Sebastian Andrzej Siewior <sebastian@breakpoint.cc>
Signed-off-by: Haavard Skinnemoen <haavard.skinnemoen@atmel.com>
diff --git a/arch/avr32/lib/memcpy.S b/arch/avr32/lib/memcpy.S
index 0abb261..93e74b6 100644
--- a/arch/avr32/lib/memcpy.S
+++ b/arch/avr32/lib/memcpy.S
@@ -24,8 +24,8 @@
 	brne	1f
 
 	/* At this point, "from" is word-aligned */
-2:	sub	r10, 4
-	mov	r9, r12
+2:	mov	r9, r12
+5:	sub	r10, 4
 	brlt	4f
 
 3:	ld.w	r8, r11++
@@ -59,4 +59,13 @@
 	st.b	r12++, r8
 	ld.ub	r8, r11++
 	st.b	r12++, r8
-	rjmp	2b
+	mov	r8, r12
+	add	pc, pc, r9
+	sub	r8, 1
+	nop
+	sub	r8, 1
+	nop
+	sub	r8, 1
+	nop
+	mov	r9, r8
+	rjmp	5b