Rewrite memset for cortex-a15 to use strd.
Merge from internal master.
(cherry-picked from commit 7ffad9c120054eedebd5f56f8bed01144e93eafa)
Change-Id: Ia67f2a545399f4fa37b63d5634a3565e4f5482f9
diff --git a/libc/arch-arm/cortex-a15/bionic/memset.S b/libc/arch-arm/cortex-a15/bionic/memset.S
index 7bb3297..2e1ad54 100644
--- a/libc/arch-arm/cortex-a15/bionic/memset.S
+++ b/libc/arch-arm/cortex-a15/bionic/memset.S
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2008 The Android Open Source Project
+ * Copyright (C) 2013 The Android Open Source Project
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -35,11 +35,12 @@
* memset() returns its first argument.
*/
- .fpu neon
+ .fpu neon
+ .syntax unified
ENTRY(bzero)
- mov r2, r1
- mov r1, #0
+ mov r2, r1
+ mov r1, #0
// Fall through to memset...
END(bzero)
@@ -47,60 +48,117 @@
.save {r0}
stmfd sp!, {r0}
- vdup.8 q0, r1
-
- /* do we have at least 16-bytes to write (needed for alignment below) */
+ // The new algorithm is slower for copies < 16 so use the old
+ // neon code in that case.
cmp r2, #16
- blo 3f
+ blo set_less_than_16_unknown_align
- /* align destination to 16 bytes for the write-buffer */
- rsb r3, r0, #0
- ands r3, r3, #0xF
- beq 2f
+ // Use strd which requires an even and odd register so move the
+ // values so that:
+ // r0 and r1 contain the memset value
+ // r2 is the number of bytes to set
+ // r3 is the destination pointer
+ mov r3, r0
- /* write up to 15-bytes (count in r3) */
- sub r2, r2, r3
- movs ip, r3, lsl #31
- strmib r1, [r0], #1
- strcsb r1, [r0], #1
- strcsb r1, [r0], #1
- movs ip, r3, lsl #29
- bge 1f
+ // Copy the byte value in every byte of r1.
+ mov r1, r1, lsl #24
+ orr r1, r1, r1, lsr #8
+ orr r1, r1, r1, lsr #16
- // writes 4 bytes, 32-bits aligned
- vst1.32 {d0[0]}, [r0, :32]!
-1: bcc 2f
+check_alignment:
+ // Align destination to a double word to avoid the strd crossing
+ // a cache line boundary.
+ ands ip, r3, #7
+ bne do_double_word_align
- // writes 8 bytes, 64-bits aligned
- vst1.8 {d0}, [r0, :64]!
-2:
- /* make sure we have at least 32 bytes to write */
- subs r2, r2, #32
- blo 2f
- vmov q1, q0
+double_word_aligned:
+ mov r0, r1
-1: /* The main loop writes 32 bytes at a time */
- subs r2, r2, #32
- vst1.8 {d0 - d3}, [r0, :128]!
- bhs 1b
+ subs r2, #64
+ blo set_less_than_64
-2: /* less than 32 left */
- add r2, r2, #32
- tst r2, #0x10
- beq 3f
+1: // Main loop sets 64 bytes at a time.
+ .irp offset, #0, #8, #16, #24, #32, #40, #48, #56
+ strd r0, r1, [r3, \offset]
+ .endr
- // writes 16 bytes, 128-bits aligned
- vst1.8 {d0, d1}, [r0, :128]!
-3: /* write up to 15-bytes (count in r2) */
+ add r3, #64
+ subs r2, #64
+ bge 1b
+
+set_less_than_64:
+ // Restore r2 to the count of bytes left to set.
+ add r2, #64
+ lsls ip, r2, #27
+ bcc set_less_than_32
+ // Set 32 bytes.
+ .irp offset, #0, #8, #16, #24
+ strd r0, r1, [r3, \offset]
+ .endr
+ add r3, #32
+
+set_less_than_32:
+ bpl set_less_than_16
+ // Set 16 bytes.
+ .irp offset, #0, #8
+ strd r0, r1, [r3, \offset]
+ .endr
+ add r3, #16
+
+set_less_than_16:
+ // Less than 16 bytes to set.
+ lsls ip, r2, #29
+ bcc set_less_than_8
+
+ // Set 8 bytes.
+ strd r0, r1, [r3], #8
+
+set_less_than_8:
+ bpl set_less_than_4
+ // Set 4 bytes
+ str r1, [r3], #4
+
+set_less_than_4:
+ lsls ip, r2, #31
+ it ne
+ strbne r1, [r3], #1
+ itt cs
+ strbcs r1, [r3], #1
+ strbcs r1, [r3]
+
+ ldmfd sp!, {r0}
+ bx lr
+
+do_double_word_align:
+ rsb ip, ip, #8
+ sub r2, r2, ip
+ movs r0, ip, lsl #31
+ it mi
+ strbmi r1, [r3], #1
+ itt cs
+ strbcs r1, [r3], #1
+ strbcs r1, [r3], #1
+
+ // Dst is at least word aligned by this point.
+ cmp ip, #4
+ blo double_word_aligned
+ str r1, [r3], #4
+ b double_word_aligned
+
+set_less_than_16_unknown_align:
+ // Set up to 15 bytes.
+ vdup.8 d0, r1
movs ip, r2, lsl #29
bcc 1f
vst1.8 {d0}, [r0]!
1: bge 2f
vst1.32 {d0[0]}, [r0]!
2: movs ip, r2, lsl #31
- strmib r1, [r0], #1
- strcsb r1, [r0], #1
- strcsb r1, [r0], #1
+ it mi
+ strbmi r1, [r0], #1
+ itt cs
+ strbcs r1, [r0], #1
+ strbcs r1, [r0], #1
ldmfd sp!, {r0}
bx lr
END(memset)