/* libc/arch-arm64/generic/bionic/strnlen.S - fp2-dev/platform/bionic - Gitiles */

 /* Copyright (c) 2014, Linaro Limited
    All rights reserved.

    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are met:
        * Redistributions of source code must retain the above copyright
          notice, this list of conditions and the following disclaimer.
        * Redistributions in binary form must reproduce the above copyright
          notice, this list of conditions and the following disclaimer in the
          documentation and/or other materials provided with the distribution.
        * Neither the name of the Linaro nor the
          names of its contributors may be used to endorse or promote products
          derived from this software without specific prior written permission.

    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
    HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

 /* Assumptions:
  *
  * ARMv8-a, AArch64
  */

 #include <private/bionic_asm.h>

 /* Arguments and results.
    x0 carries the string address on entry (srcin) and the returned
    length on exit (len); x1 carries the byte limit.  */
 #define srcin		x0
 #define len		x0
 #define limit		x1

 /* Locals and temporaries.
    src       - read cursor; srcin rounded down to 16 bytes, then
                post-incremented by 16 on every load.
    data1/2   - the two Dwords loaded per iteration.
    data2a    - data2 OR'ed with the leading-byte mask (misaligned entry).
    has_nul1/2- NUL-detection syndromes computed from data1/data2.
    tmp1-tmp4 - scratch.
    zeroones  - holds REP8_01 for the whole function.
    pos       - clz result used to locate the NUL byte.
    limit_wd  - down-counter of 16-byte Qwords left to examine.  */
 #define src		x2
 #define data1		x3
 #define data2		x4
 #define data2a		x5
 #define has_nul1	x6
 #define has_nul2	x7
 #define tmp1		x8
 #define tmp2		x9
 #define tmp3		x10
 #define tmp4		x11
 #define zeroones	x12
 #define pos		x13
 #define limit_wd	x14

 /* One byte value replicated into all 8 bytes of a Dword.  REP8_01 and
    REP8_7f are the operands of the NUL-detection sequence; REP8_80 is
    not referenced by the code visible in this file.  */
 #define REP8_01 0x0101010101010101
 #define REP8_7f 0x7f7f7f7f7f7f7f7f
 #define REP8_80 0x8080808080808080

 	.text
 	.p2align	6
 	/* The padding below is counted from this 64-byte boundary.  */
 .Lstart:
 	/* Pre-pad to ensure critical loop begins an icache line.
 	   7 nops + the 2 instructions at .Lhit_limit + the 7 instructions
 	   between the entry point and .Lloop make 16 instructions, i.e.
 	   64 bytes.
 	   NOTE(review): .Lloop only lands on the next 64-byte boundary if
 	   ENTRY() emits no padding or instructions of its own - confirm
 	   against the macro definition.  */
 	.rep 7
 	nop
 	.endr
 	/* Put this code here to avoid wasting more space with pre-padding.  */
 	/* Return limit.  Reached from the entry check when limit == 0 and
 	   from the loop exit when no NUL byte was detected.  */
 .Lhit_limit:
 	mov	len, limit
 	ret

 /* strnlen
    In:      x0 (srcin) = address of the string
             x1 (limit) = maximum number of bytes to examine
    Out:     x0 (len)   = MIN (offset of the first NUL byte, limit);
                          limit if no NUL byte is detected
    Scratch: x2-x14 and the condition flags.

    Memory is read only through 16-byte ldp loads from 16-byte-aligned
    addresses, starting at srcin rounded down to 16.  Bytes that share
    the first or last aligned Qword with the string are therefore loaded
    even when they lie before srcin or after the NUL/limit.  */
 ENTRY(strnlen)
 	cbz	limit, .Lhit_limit
 	mov	zeroones, #REP8_01
 	/* src = srcin rounded down to 16 bytes, tmp1 = srcin & 15.  The Z
 	   flag from ands selects the aligned or the misaligned start.  */
 	bic	src, srcin, #15
 	ands	tmp1, srcin, #15
 	b.ne	.Lmisaligned
 	/* Calculate the number of full and partial words -1.  */
 	sub	limit_wd, limit, #1	/* Limit != 0, so no underflow.  */
 	lsr	limit_wd, limit_wd, #4	/* Convert to Qwords.  */

 	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
 	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
 	   can be done in parallel across the entire word.  */
 	/* The inner loop deals with two Dwords at a time.  This has a
 	   slightly higher start-up cost, but we should win quite quickly,
 	   especially on cores with a high number of issue slots per
 	   cycle, as we get much better parallelism out of the operations.  */

 	/* Start of critical section -- keep to one 64Byte cache line.
 	   (11 instructions = 44 bytes from .Lloop to b.eq.)  */
 .Lloop:
 	ldp	data1, data2, [src], #16
 	/* The misaligned path joins here with data1/data2 already loaded
 	   and their leading bytes forced to 0xff.  */
 .Lrealigned:
 	sub	tmp1, data1, zeroones
 	orr	tmp2, data1, #REP8_7f
 	sub	tmp3, data2, zeroones
 	orr	tmp4, data2, #REP8_7f
 	bic	has_nul1, tmp1, tmp2
 	bic	has_nul2, tmp3, tmp4
 	/* Count this Qword; N becomes set once limit_wd drops below 0.  */
 	subs	limit_wd, limit_wd, #1
 	orr	tmp1, has_nul1, has_nul2
 	/* If Qwords remain (pl), compare the combined syndrome with 0;
 	   otherwise force NZCV = 0000 so that b.eq is not taken.  The loop
 	   thus continues only while the count is non-negative and no NUL
 	   has been detected.  */
 	ccmp	tmp1, #0, #0, pl	/* NZCV = 0000  */
 	b.eq	.Lloop
 	/* End of critical section -- keep to one 64Byte cache line.  */

 	/* tmp1 still holds this value from the loop body; it is recomputed
 	   here before being tested.  */
 	orr	tmp1, has_nul1, has_nul2
 	cbz	tmp1, .Lhit_limit	/* No null in final Qword.  */

 	/* We know there's a null in the final Qword.  The easiest thing
 	   to do now is work out the length of the string and return
 	   MIN (len, limit).  */

 	/* src was post-incremented past the Qword just examined, so
 	   src - srcin is the offset of the end of that Qword.  */
 	sub	len, src, srcin
 	cbz	has_nul1, .Lnul_in_data2
 #ifdef __AARCH64EB__
 	mov	data2, data1
 #endif
 	/* NUL is in data1: step back over data2 and carry on with data1's
 	   syndrome in has_nul2.  */
 	sub	len, len, #8
 	mov	has_nul2, has_nul1
 .Lnul_in_data2:
 #ifdef __AARCH64EB__
 	/* For big-endian, carry propagation (if the final byte in the
 	   string is 0x01) means we cannot use has_nul directly.  The
 	   easiest way to get the correct byte is to byte-swap the data
 	   and calculate the syndrome a second time.  */
 	rev	data2, data2
 	sub	tmp1, data2, zeroones
 	orr	tmp2, data2, #REP8_7f
 	bic	has_nul2, tmp1, tmp2
 #endif
 	/* len = offset of the start of the Dword that holds the NUL.  */
 	sub	len, len, #8
 	/* After the byte reversal the marker of the first NUL byte is the
 	   most significant set bit, so clz yields 8 * its byte index.  */
 	rev	has_nul2, has_nul2
 	clz	pos, has_nul2
 	add	len, len, pos, lsr #3		/* Bits to bytes.  */
 	/* Unsigned compare: the NUL may lie beyond limit inside the last
 	   Qword.  */
 	cmp	len, limit
 	csel	len, len, limit, ls		/* Return the lower value.  */
 	ret

 .Lmisaligned:
 	/* Deal with a partial first word.
 	   We're doing two things in parallel here;
 	   1) Calculate the number of words (but avoiding overflow if
 	      limit is near ULONG_MAX) - to do this we need to work out
 	      limit + tmp1 - 1 as a 65-bit value before shifting it;
 	   2) Load and mask the initial data words - we force the bytes
 	      before the ones we are interested in to 0xff - this ensures
 	      early bytes will not hit any zero detection.  */
 	/* On entry tmp1 = srcin & 15 (1..15).  The flags set by the cmp
 	   below are consumed by the csinv/csel at the end of this block;
 	   no instruction in between writes the flags.  */
 	sub	limit_wd, limit, #1
 	neg	tmp4, tmp1
 	cmp	tmp1, #8

 	/* Split limit - 1 into its low 4 bits (tmp3) and its Qword count
 	   (limit_wd) so the misalignment can be added without overflow.  */
 	and	tmp3, limit_wd, #15
 	lsr	limit_wd, limit_wd, #4
 	mov	tmp2, #~0

 	ldp	data1, data2, [src], #16
 	lsl	tmp4, tmp4, #3		/* Bytes beyond alignment -> bits.  */
 	add	tmp3, tmp3, tmp1

 	/* Build the mask: tmp2 ends up with 0xff in the (tmp1 & 7) bytes
 	   that come first in memory.  A register shift uses only the low
 	   6 bits of tmp4, so for tmp1 == 8 the shift is 0 and tmp2 stays
 	   all ones.  */
 #ifdef __AARCH64EB__
 	/* Big-endian.  Early bytes are at MSB.  */
 	lsl	tmp2, tmp2, tmp4	/* Shift (tmp1 & 63).  */
 #else
 	/* Little-endian.  Early bytes are at LSB.  */
 	lsr	tmp2, tmp2, tmp4	/* Shift (tmp1 & 63).  */
 #endif
 	/* Add the carry out of the low 4 bits to the Qword count.  */
 	add	limit_wd, limit_wd, tmp3, lsr #4

 	orr	data1, data1, tmp2
 	orr	data2a, data2, tmp2

 	/* tmp1 <= 8: the string starts in data1 (or exactly at data2), so
 	   use masked data1 and unmasked data2.
 	   tmp1 >  8: the string starts inside data2, so data1 becomes all
 	   ones and data2 takes its masked copy.  */
 	csinv	data1, data1, xzr, le
 	csel	data2, data2, data2a, le
 	b	.Lrealigned
 END(strnlen)
	/* Copyright (c) 2014, Linaro Limited
	All rights reserved.

	Redistribution and use in source and binary forms, with or without
	modification, are permitted provided that the following conditions are met:
	* Redistributions of source code must retain the above copyright
	notice, this list of conditions and the following disclaimer.
	* Redistributions in binary form must reproduce the above copyright
	notice, this list of conditions and the following disclaimer in the
	documentation and/or other materials provided with the distribution.
	* Neither the name of the Linaro nor the
	names of its contributors may be used to endorse or promote products
	derived from this software without specific prior written permission.

	THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
	A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
	HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
	SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
	LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
	DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
	THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	*/

	/* Assumptions:
	*
	* ARMv8-a, AArch64
	*/

	#include <private/bionic_asm.h>

	/* Arguments and results.
	   x0 carries the string address on entry (srcin) and the returned
	   length on exit (len); x1 carries the byte limit.
	   These #defines repeat, token for token, the ones that appear
	   earlier in this file.  */
	#define srcin x0
	#define len x0
	#define limit x1

	/* Locals and temporaries.
	   src        - read cursor; srcin rounded down to 16 bytes, then
	                post-incremented by 16 on every load.
	   data1/2    - the two Dwords loaded per iteration.
	   data2a     - data2 OR'ed with the leading-byte mask (misaligned entry).
	   has_nul1/2 - NUL-detection syndromes computed from data1/data2.
	   tmp1-tmp4  - scratch.
	   zeroones   - holds REP8_01 for the whole function.
	   pos        - clz result used to locate the NUL byte.
	   limit_wd   - down-counter of 16-byte Qwords left to examine. */
	#define src x2
	#define data1 x3
	#define data2 x4
	#define data2a x5
	#define has_nul1 x6
	#define has_nul2 x7
	#define tmp1 x8
	#define tmp2 x9
	#define tmp3 x10
	#define tmp4 x11
	#define zeroones x12
	#define pos x13
	#define limit_wd x14

	/* One byte value replicated into all 8 bytes of a Dword.  REP8_01 and
	   REP8_7f are the operands of the NUL-detection sequence; REP8_80 is
	   not referenced by the code visible in this file. */
	#define REP8_01 0x0101010101010101
	#define REP8_7f 0x7f7f7f7f7f7f7f7f
	#define REP8_80 0x8080808080808080

	.text
	/* NOTE(review): everything from here to END(strnlen) is a second,
	   re-indented copy of the strnlen definition that already appears
	   earlier in this file.  The labels (.Lstart, .Lhit_limit, .Lloop,
	   .Lrealigned, .Lnul_in_data2, .Lmisaligned) are therefore defined
	   twice, which GNU as rejects as a symbol redefinition.  Presumably
	   one copy has to be dropped - confirm which one is canonical. */
	.p2align 6
	/* The padding below is counted from this 64-byte boundary. */
	.Lstart:
	/* Pre-pad to ensure critical loop begins an icache line.
	   7 nops + the 2 instructions at .Lhit_limit + the 7 instructions
	   between the entry point and .Lloop make 16 instructions, i.e.
	   64 bytes.
	   NOTE(review): .Lloop only lands on the next 64-byte boundary if
	   ENTRY() emits no padding or instructions of its own - confirm
	   against the macro definition. */
	.rep 7
	nop
	.endr
	/* Put this code here to avoid wasting more space with pre-padding. */
	/* Return limit.  Reached from the entry check when limit == 0 and
	   from the loop exit when no NUL byte was detected. */
	.Lhit_limit:
	mov len, limit
	ret

	/* strnlen
	   In:      x0 (srcin) = address of the string
	            x1 (limit) = maximum number of bytes to examine
	   Out:     x0 (len)   = MIN (offset of the first NUL byte, limit);
	                         limit if no NUL byte is detected
	   Scratch: x2-x14 and the condition flags.

	   Memory is read only through 16-byte ldp loads from 16-byte-aligned
	   addresses, starting at srcin rounded down to 16.  Bytes that share
	   the first or last aligned Qword with the string are therefore loaded
	   even when they lie before srcin or after the NUL/limit. */
	ENTRY(strnlen)
	cbz limit, .Lhit_limit
	mov zeroones, #REP8_01
	/* src = srcin rounded down to 16 bytes, tmp1 = srcin & 15.  The Z
	   flag from ands selects the aligned or the misaligned start. */
	bic src, srcin, #15
	ands tmp1, srcin, #15
	b.ne .Lmisaligned
	/* Calculate the number of full and partial words -1. */
	sub limit_wd, limit, #1 /* Limit != 0, so no underflow. */
	lsr limit_wd, limit_wd, #4 /* Convert to Qwords. */

	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
	   can be done in parallel across the entire word. */
	/* The inner loop deals with two Dwords at a time. This has a
	   slightly higher start-up cost, but we should win quite quickly,
	   especially on cores with a high number of issue slots per
	   cycle, as we get much better parallelism out of the operations. */

	/* Start of critical section -- keep to one 64Byte cache line.
	   (11 instructions = 44 bytes from .Lloop to b.eq.) */
	.Lloop:
	ldp data1, data2, [src], #16
	/* The misaligned path joins here with data1/data2 already loaded
	   and their leading bytes forced to 0xff. */
	.Lrealigned:
	sub tmp1, data1, zeroones
	orr tmp2, data1, #REP8_7f
	sub tmp3, data2, zeroones
	orr tmp4, data2, #REP8_7f
	bic has_nul1, tmp1, tmp2
	bic has_nul2, tmp3, tmp4
	/* Count this Qword; N becomes set once limit_wd drops below 0. */
	subs limit_wd, limit_wd, #1
	orr tmp1, has_nul1, has_nul2
	/* If Qwords remain (pl), compare the combined syndrome with 0;
	   otherwise force NZCV = 0000 so that b.eq is not taken.  The loop
	   thus continues only while the count is non-negative and no NUL
	   has been detected. */
	ccmp tmp1, #0, #0, pl /* NZCV = 0000 */
	b.eq .Lloop
	/* End of critical section -- keep to one 64Byte cache line. */

	/* tmp1 still holds this value from the loop body; it is recomputed
	   here before being tested. */
	orr tmp1, has_nul1, has_nul2
	cbz tmp1, .Lhit_limit /* No null in final Qword. */

	/* We know there's a null in the final Qword. The easiest thing
	   to do now is work out the length of the string and return
	   MIN (len, limit). */

	/* src was post-incremented past the Qword just examined, so
	   src - srcin is the offset of the end of that Qword. */
	sub len, src, srcin
	cbz has_nul1, .Lnul_in_data2
	#ifdef __AARCH64EB__
	mov data2, data1
	#endif
	/* NUL is in data1: step back over data2 and carry on with data1's
	   syndrome in has_nul2. */
	sub len, len, #8
	mov has_nul2, has_nul1
	.Lnul_in_data2:
	#ifdef __AARCH64EB__
	/* For big-endian, carry propagation (if the final byte in the
	   string is 0x01) means we cannot use has_nul directly. The
	   easiest way to get the correct byte is to byte-swap the data
	   and calculate the syndrome a second time. */
	rev data2, data2
	sub tmp1, data2, zeroones
	orr tmp2, data2, #REP8_7f
	bic has_nul2, tmp1, tmp2
	#endif
	/* len = offset of the start of the Dword that holds the NUL. */
	sub len, len, #8
	/* After the byte reversal the marker of the first NUL byte is the
	   most significant set bit, so clz yields 8 * its byte index. */
	rev has_nul2, has_nul2
	clz pos, has_nul2
	add len, len, pos, lsr #3 /* Bits to bytes. */
	/* Unsigned compare: the NUL may lie beyond limit inside the last
	   Qword. */
	cmp len, limit
	csel len, len, limit, ls /* Return the lower value. */
	ret

	.Lmisaligned:
	/* Deal with a partial first word.
	   We're doing two things in parallel here;
	   1) Calculate the number of words (but avoiding overflow if
	      limit is near ULONG_MAX) - to do this we need to work out
	      limit + tmp1 - 1 as a 65-bit value before shifting it;
	   2) Load and mask the initial data words - we force the bytes
	      before the ones we are interested in to 0xff - this ensures
	      early bytes will not hit any zero detection. */
	/* On entry tmp1 = srcin & 15 (1..15).  The flags set by the cmp
	   below are consumed by the csinv/csel at the end of this block;
	   no instruction in between writes the flags. */
	sub limit_wd, limit, #1
	neg tmp4, tmp1
	cmp tmp1, #8

	/* Split limit - 1 into its low 4 bits (tmp3) and its Qword count
	   (limit_wd) so the misalignment can be added without overflow. */
	and tmp3, limit_wd, #15
	lsr limit_wd, limit_wd, #4
	mov tmp2, #~0

	ldp data1, data2, [src], #16
	lsl tmp4, tmp4, #3 /* Bytes beyond alignment -> bits. */
	add tmp3, tmp3, tmp1

	/* Build the mask: tmp2 ends up with 0xff in the (tmp1 & 7) bytes
	   that come first in memory.  A register shift uses only the low
	   6 bits of tmp4, so for tmp1 == 8 the shift is 0 and tmp2 stays
	   all ones. */
	#ifdef __AARCH64EB__
	/* Big-endian. Early bytes are at MSB. */
	lsl tmp2, tmp2, tmp4 /* Shift (tmp1 & 63). */
	#else
	/* Little-endian. Early bytes are at LSB. */
	lsr tmp2, tmp2, tmp4 /* Shift (tmp1 & 63). */
	#endif
	/* Add the carry out of the low 4 bits to the Qword count. */
	add limit_wd, limit_wd, tmp3, lsr #4

	orr data1, data1, tmp2
	orr data2a, data2, tmp2

	/* tmp1 <= 8: the string starts in data1 (or exactly at data2), so
	   use masked data1 and unmasked data2.
	   tmp1 >  8: the string starts inside data2, so data1 becomes all
	   ones and data2 takes its masked copy. */
	csinv data1, data1, xzr, le
	csel data2, data2, data2a, le
	b .Lrealigned
	END(strnlen)