/*
 * strcmp for ARMv7
 *
 * Copyright (c) 2012-2019, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Implementation of strcmp for ARMv7 when DSP instructions are
   available. Use ldrd to support wider loads, provided the data
   is sufficiently aligned. Use saturating arithmetic to optimize
   the compares. */
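
/* Throughout this file, a zero byte or a mismatch is detected with a
   "syndrome" word built from UADD8 and SEL.  UADD8 of a data word with
   0xffffffff sets the per-byte GE flag for every non-zero byte; SEL then
   picks the XOR of the two data words for those bytes and 0xff for the
   zero bytes.  A rough C equivalent, for illustration only (the name
   strcmp_syndrome is not part of this file):

     static inline unsigned int
     strcmp_syndrome (unsigned int d1, unsigned int d2)
     {
       unsigned int syndrome = 0;
       for (int i = 0; i < 4; i++)
         {
           unsigned int b1 = (d1 >> (8 * i)) & 0xff;
           unsigned int b2 = (d2 >> (8 * i)) & 0xff;
           unsigned int s = (b1 != 0) ? (b1 ^ b2) : 0xff;
           syndrome |= s << (8 * i);
         }
       return syndrome;
     }

   The syndrome is non-zero iff d1 contains a NUL byte or the two words
   differ; the epilogue code then locates the first failing byte lane
   with CLZ (after a byte reverse on little-endian). */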

/* Build Options:
   STRCMP_NO_PRECHECK: Don't run a quick pre-check of the first
   byte in the string. If comparing completely random strings
   the pre-check will save time, since there is a very high
   probability of a mismatch in the first character: we save
   significant overhead if this is the common case. However,
   if strings are likely to be identical (eg because we're
   verifying a hit in a hash table), then this check is largely
   redundant. */

#define STRCMP_NO_PRECHECK 0

/* This version uses Thumb-2 code. */
        .thumb
        .syntax unified

#ifdef __ARM_BIG_ENDIAN
#define S2LO lsl
#define S2LOEQ lsleq
#define S2HI lsr
#define MSB 0x000000ff
#define LSB 0xff000000
#define BYTE0_OFFSET 24
#define BYTE1_OFFSET 16
#define BYTE2_OFFSET 8
#define BYTE3_OFFSET 0
#else /* not __ARM_BIG_ENDIAN */
#define S2LO lsr
#define S2LOEQ lsreq
#define S2HI lsl
#define BYTE0_OFFSET 0
#define BYTE1_OFFSET 8
#define BYTE2_OFFSET 16
#define BYTE3_OFFSET 24
#define MSB 0xff000000
#define LSB 0x000000ff
#endif /* not __ARM_BIG_ENDIAN */

        .macro def_fn f p2align=0
        .text
        .p2align \p2align
        .global \f
        .type \f, %function
\f:
        .endm

/* Parameters and result. */
#define src1 r0
#define src2 r1
#define result r0 /* Overlaps src1. */

/* Internal variables. */
#define tmp1 r4
#define tmp2 r5
#define const_m1 r12

/* Additional internal variables for 64-bit aligned data. */
#define data1a r2
#define data1b r3
#define data2a r6
#define data2b r7
#define syndrome_a tmp1
#define syndrome_b tmp2

/* Additional internal variables for 32-bit aligned data. */
#define data1 r2
#define data2 r3
#define syndrome tmp2


/* Macro to compute and return the result value for word-aligned
   cases. */
        .macro strcmp_epilogue_aligned synd d1 d2 restore_r6
#ifdef __ARM_BIG_ENDIAN
        /* If data1 contains a zero byte, then syndrome will contain a 1 in
           bit 7 of that byte. Otherwise, the highest set bit in the
           syndrome will highlight the first different bit. It is therefore
           sufficient to extract the eight bits starting with the syndrome
           bit. */
        clz     tmp1, \synd
        lsl     r1, \d2, tmp1
        .if \restore_r6
        ldrd    r6, r7, [sp, #8]
        .endif
        .cfi_restore 6
        .cfi_restore 7
        lsl     \d1, \d1, tmp1
        .cfi_remember_state
        lsr     result, \d1, #24
        ldrd    r4, r5, [sp], #16
        .cfi_restore 4
        .cfi_restore 5
        sub     result, result, r1, lsr #24
        bx      lr
#else
        /* To use the big-endian trick we'd have to reverse all three words;
           that's slower than this approach. */
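        /* Byte-reverse the syndrome so that CLZ counts up to the first
           failing byte in string order; rounding the count down to a
           multiple of 8 (BIC #7) then lets us extract that whole byte
           from each data word.  */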
        rev     \synd, \synd
        clz     tmp1, \synd
        bic     tmp1, tmp1, #7
        lsr     r1, \d2, tmp1
        .cfi_remember_state
        .if \restore_r6
        ldrd    r6, r7, [sp, #8]
        .endif
        .cfi_restore 6
        .cfi_restore 7
        lsr     \d1, \d1, tmp1
        and     result, \d1, #255
        and     r1, r1, #255
        ldrd    r4, r5, [sp], #16
        .cfi_restore 4
        .cfi_restore 5
        sub     result, result, r1

        bx      lr
#endif
        .endm

        .text
        .p2align 5
.Lstrcmp_start_addr:
#if STRCMP_NO_PRECHECK == 0
.Lfastpath_exit:
        sub     r0, r2, r3
        bx      lr
        nop
#endif
def_fn __strcmp_arm
#if STRCMP_NO_PRECHECK == 0
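        /* Fast-path check of the first byte: if src1 starts with a NUL or
           the first bytes differ, return the byte difference without
           setting up a stack frame.  */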
        ldrb    r2, [src1]
        ldrb    r3, [src2]
        cmp     r2, #1
        it      cs
        cmpcs   r2, r3
        bne     .Lfastpath_exit
#endif
        .cfi_startproc
        strd    r4, r5, [sp, #-16]!
        .cfi_def_cfa_offset 16
        .cfi_offset 4, -16
        .cfi_offset 5, -12
        orr     tmp1, src1, src2
        strd    r6, r7, [sp, #8]
        .cfi_offset 6, -8
        .cfi_offset 7, -4
        mvn     const_m1, #0
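        /* tmp1 holds (src1 | src2); shifting its low three bits to the top
           tests both pointers at once: a zero result means both strings are
           8-byte aligned and we can enter the doubleword loop directly.  */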
        lsl     r2, tmp1, #29
        cbz     r2, .Lloop_aligned8

.Lnot_aligned:
        eor     tmp1, src1, src2
        tst     tmp1, #7
        bne     .Lmisaligned8

        /* Deal with mutual misalignment by aligning downwards and then
           masking off the unwanted loaded data to prevent a difference. */
        and     tmp1, src1, #7
        bic     src1, src1, #7
        and     tmp2, tmp1, #3
        bic     src2, src2, #7
        lsl     tmp2, tmp2, #3  /* Bytes -> bits. */
        ldrd    data1a, data1b, [src1], #16
        tst     tmp1, #4
        ldrd    data2a, data2b, [src2], #16
        /* In thumb code we can't use MVN with a register shift, but
           we do have ORN. */
        S2HI    tmp1, const_m1, tmp2
        orn     data1a, data1a, tmp1
        orn     data2a, data2a, tmp1
        beq     .Lstart_realigned8
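        /* Mutual offset was 4..7: the whole first word of each doubleword
           lies before the start of the strings, so force those words to
           compare equal and mask the leading bytes of the second words
           instead.  */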
        orn     data1b, data1b, tmp1
        mov     data1a, const_m1
        orn     data2b, data2b, tmp1
        mov     data2a, const_m1
        b       .Lstart_realigned8

        /* Unwind the inner loop by a factor of 2, giving 16 bytes per
           pass. */
        .p2align 5,,12  /* Don't start in the tail bytes of a cache line. */
        .p2align 2      /* Always word aligned. */
.Lloop_aligned8:
        ldrd    data1a, data1b, [src1], #16
        ldrd    data2a, data2b, [src2], #16
.Lstart_realigned8:
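        /* A non-zero syndrome below means data1 contains a NUL byte or the
           data words differ; see the syndrome description at the top of
           the file.  */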
        uadd8   syndrome_b, data1a, const_m1    /* Only want GE bits, */
        eor     syndrome_a, data1a, data2a
        sel     syndrome_a, syndrome_a, const_m1
        cbnz    syndrome_a, .Ldiff_in_a
        uadd8   syndrome_b, data1b, const_m1    /* Only want GE bits. */
        eor     syndrome_b, data1b, data2b
        sel     syndrome_b, syndrome_b, const_m1
        cbnz    syndrome_b, .Ldiff_in_b

        ldrd    data1a, data1b, [src1, #-8]
        ldrd    data2a, data2b, [src2, #-8]
        uadd8   syndrome_b, data1a, const_m1    /* Only want GE bits, */
        eor     syndrome_a, data1a, data2a
        sel     syndrome_a, syndrome_a, const_m1
        uadd8   syndrome_b, data1b, const_m1    /* Only want GE bits. */
        eor     syndrome_b, data1b, data2b
        sel     syndrome_b, syndrome_b, const_m1
        /* Can't use CBZ for backwards branch. */
        orrs    syndrome_b, syndrome_b, syndrome_a      /* Only need if s_a == 0 */
        beq     .Lloop_aligned8

.Ldiff_found:
        cbnz    syndrome_a, .Ldiff_in_a

.Ldiff_in_b:
        strcmp_epilogue_aligned syndrome_b, data1b, data2b 1

.Ldiff_in_a:
        .cfi_restore_state
        strcmp_epilogue_aligned syndrome_a, data1a, data2a 1

        .cfi_restore_state
.Lmisaligned8:
        tst     tmp1, #3
        bne     .Lmisaligned4
        ands    tmp1, src1, #3
        bne     .Lmutual_align4

        /* Unrolled by a factor of 2, to reduce the number of post-increment
           operations. */
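        /* The syndrome check is the same as in the doubleword loop above,
           handling one word per step.  */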
.Lloop_aligned4:
        ldr     data1, [src1], #8
        ldr     data2, [src2], #8
.Lstart_realigned4:
        uadd8   syndrome, data1, const_m1       /* Only need GE bits. */
        eor     syndrome, data1, data2
        sel     syndrome, syndrome, const_m1
        cbnz    syndrome, .Laligned4_done
        ldr     data1, [src1, #-4]
        ldr     data2, [src2, #-4]
        uadd8   syndrome, data1, const_m1
        eor     syndrome, data1, data2
        sel     syndrome, syndrome, const_m1
        cmp     syndrome, #0
        beq     .Lloop_aligned4

.Laligned4_done:
        strcmp_epilogue_aligned syndrome, data1, data2, 0

.Lmutual_align4:
        .cfi_restore_state
        /* Deal with mutual misalignment by aligning downwards and then
           masking off the unwanted loaded data to prevent a difference. */
        lsl     tmp1, tmp1, #3  /* Bytes -> bits. */
        bic     src1, src1, #3
        ldr     data1, [src1], #8
        bic     src2, src2, #3
        ldr     data2, [src2], #8

        /* In thumb code we can't use MVN with a register shift, but
           we do have ORN. */
        S2HI    tmp1, const_m1, tmp1
        orn     data1, data1, tmp1
        orn     data2, data2, tmp1
        b       .Lstart_realigned4

.Lmisaligned4:
        ands    tmp1, src1, #3
        beq     .Lsrc1_aligned
        sub     src2, src2, tmp1
        bic     src1, src1, #3
        lsls    tmp1, tmp1, #31
        ldr     data1, [src1], #4
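        /* After the LSLS above, Z is set if the original src1 offset was 2;
           otherwise C is set if it was 3, and an offset of 1 falls through.
           The leading bytes are compared one at a time until src1 reaches a
           word boundary.  */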
        beq     .Laligned_m2
        bcs     .Laligned_m1

#if STRCMP_NO_PRECHECK == 1
        ldrb    data2, [src2, #1]
        uxtb    tmp1, data1, ror #BYTE1_OFFSET
        subs    tmp1, tmp1, data2
        bne     .Lmisaligned_exit
        cbz     data2, .Lmisaligned_exit

.Laligned_m2:
        ldrb    data2, [src2, #2]
        uxtb    tmp1, data1, ror #BYTE2_OFFSET
        subs    tmp1, tmp1, data2
        bne     .Lmisaligned_exit
        cbz     data2, .Lmisaligned_exit

.Laligned_m1:
        ldrb    data2, [src2, #3]
        uxtb    tmp1, data1, ror #BYTE3_OFFSET
        subs    tmp1, tmp1, data2
        bne     .Lmisaligned_exit
        add     src2, src2, #4
        cbnz    data2, .Lsrc1_aligned
#else  /* STRCMP_NO_PRECHECK */
        /* If we've done the pre-check, then we don't need to check the
           first byte again here. */
        ldrb    data2, [src2, #2]
        uxtb    tmp1, data1, ror #BYTE2_OFFSET
        subs    tmp1, tmp1, data2
        bne     .Lmisaligned_exit
        cbz     data2, .Lmisaligned_exit

.Laligned_m2:
        ldrb    data2, [src2, #3]
        uxtb    tmp1, data1, ror #BYTE3_OFFSET
        subs    tmp1, tmp1, data2
        bne     .Lmisaligned_exit
        cbnz    data2, .Laligned_m1
#endif

.Lmisaligned_exit:
        .cfi_remember_state
        mov     result, tmp1
        ldr     r4, [sp], #16
        .cfi_restore 4
        bx      lr

#if STRCMP_NO_PRECHECK == 0
.Laligned_m1:
        add     src2, src2, #4
#endif
.Lsrc1_aligned:
        .cfi_restore_state
        /* src1 is word aligned, but src2 has no common alignment
           with it. */
        ldr     data1, [src1], #4
        lsls    tmp1, src2, #31         /* C=src2[1], Z=src2[0]. */

        bic     src2, src2, #3
        ldr     data2, [src2], #4
        bhi     .Loverlap1              /* C=1, Z=0 => src2[1:0] = 0b11. */
        bcs     .Loverlap2              /* C=1, Z=1 => src2[1:0] = 0b10. */

        /* (overlap3) C=0, Z=0 => src2[1:0] = 0b01. */
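        /* In each .LoverlapN case src2 is (4 - N) bytes past a word
           boundary, so N bytes of every aligned src1 word overlap the
           lower-addressed src2 word and the remainder overlaps the
           next one.  */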
.Loverlap3:
        bic     tmp1, data1, #MSB
        uadd8   syndrome, data1, const_m1
        eors    syndrome, tmp1, data2, S2LO #8
        sel     syndrome, syndrome, const_m1
        bne     4f
        cbnz    syndrome, 5f
        ldr     data2, [src2], #4
        eor     tmp1, tmp1, data1
        cmp     tmp1, data2, S2HI #24
        bne     6f
        ldr     data1, [src1], #4
        b       .Loverlap3
4:
        S2LO    data2, data2, #8
        b       .Lstrcmp_tail

5:
        bics    syndrome, syndrome, #MSB
        bne     .Lstrcmp_done_equal

        /* We can only get here if the MSB of data1 contains 0, so
           fast-path the exit. */
        ldrb    result, [src2]
        .cfi_remember_state
        ldrd    r4, r5, [sp], #16
        .cfi_restore 4
        .cfi_restore 5
        /* R6/7 Not used in this sequence. */
        .cfi_restore 6
        .cfi_restore 7
        neg     result, result
        bx      lr

6:
        .cfi_restore_state
        S2LO    data1, data1, #24
        and     data2, data2, #LSB
        b       .Lstrcmp_tail

        .p2align 5,,12  /* Ensure at least 3 instructions in cache line. */
.Loverlap2:
        and     tmp1, data1, const_m1, S2LO #16
        uadd8   syndrome, data1, const_m1
        eors    syndrome, tmp1, data2, S2LO #16
        sel     syndrome, syndrome, const_m1
        bne     4f
        cbnz    syndrome, 5f
        ldr     data2, [src2], #4
        eor     tmp1, tmp1, data1
        cmp     tmp1, data2, S2HI #16
        bne     6f
        ldr     data1, [src1], #4
        b       .Loverlap2
4:
        S2LO    data2, data2, #16
        b       .Lstrcmp_tail
5:
        ands    syndrome, syndrome, const_m1, S2LO #16
        bne     .Lstrcmp_done_equal

        ldrh    data2, [src2]
        S2LO    data1, data1, #16
#ifdef __ARM_BIG_ENDIAN
        lsl     data2, data2, #16
#endif
        b       .Lstrcmp_tail

6:
        S2LO    data1, data1, #16
        and     data2, data2, const_m1, S2LO #16
        b       .Lstrcmp_tail

        .p2align 5,,12  /* Ensure at least 3 instructions in cache line. */
.Loverlap1:
        and     tmp1, data1, #LSB
        uadd8   syndrome, data1, const_m1
        eors    syndrome, tmp1, data2, S2LO #24
        sel     syndrome, syndrome, const_m1
        bne     4f
        cbnz    syndrome, 5f
        ldr     data2, [src2], #4
        eor     tmp1, tmp1, data1
        cmp     tmp1, data2, S2HI #8
        bne     6f
        ldr     data1, [src1], #4
        b       .Loverlap1
4:
        S2LO    data2, data2, #24
        b       .Lstrcmp_tail
5:
        tst     syndrome, #LSB
        bne     .Lstrcmp_done_equal
        ldr     data2, [src2]
6:
        S2LO    data1, data1, #8
        bic     data2, data2, #MSB
        b       .Lstrcmp_tail

.Lstrcmp_done_equal:
        mov     result, #0
        .cfi_remember_state
        ldrd    r4, r5, [sp], #16
        .cfi_restore 4
        .cfi_restore 5
        /* R6/7 not used in this sequence. */
        .cfi_restore 6
        .cfi_restore 7
        bx      lr

.Lstrcmp_tail:
        .cfi_restore_state
#ifndef __ARM_BIG_ENDIAN
        rev     data1, data1
        rev     data2, data2
        /* Now everything looks big-endian... */
#endif
        uadd8   tmp1, data1, const_m1
        eor     tmp1, data1, data2
        sel     syndrome, tmp1, const_m1
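        /* As in the big-endian epilogue above: the highest set bit of the
           syndrome marks a NUL in data1 or the first differing bit, so it
           is enough to extract the eight bits starting there from each
           word and subtract.  */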
        clz     tmp1, syndrome
        lsl     data1, data1, tmp1
        lsl     data2, data2, tmp1
        lsr     result, data1, #24
        ldrd    r4, r5, [sp], #16
        .cfi_restore 4
        .cfi_restore 5
        /* R6/7 not used in this sequence. */
        .cfi_restore 6
        .cfi_restore 7
        sub     result, result, data2, lsr #24
        bx      lr
        .cfi_endproc
        .size   __strcmp_arm, . - .Lstrcmp_start_addr