/*
 * strcmp for ARMv7
 *
 * Copyright (c) 2012-2019, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Implementation of strcmp for ARMv7 when DSP instructions are
   available.  Use ldrd to support wider loads, provided the data
   is sufficiently aligned.  Use saturating arithmetic to optimize
   the compares.  */
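
/* The NUL and difference tests below rely on two DSP operations.  UADD8
   adds 0xff to each byte of a word and sets the four GE flags to the
   per-byte carry out, so a GE bit is set exactly when the corresponding
   byte of data1 is non-zero.  SEL then builds a "syndrome": bytes of
   data1 EOR data2 where GE is set, 0xff where it is clear.  The syndrome
   is therefore non-zero iff the words differ or data1 contains a NUL.  */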

#include "../asmdefs.h"

/* Build Options:
   STRCMP_NO_PRECHECK: Don't run a quick pre-check of the first
   byte in the string.  If comparing completely random strings
   the pre-check will save time, since there is a very high
   probability of a mismatch in the first character: we save
   significant overhead if this is the common case.  However,
   if strings are likely to be identical (e.g. because we're
   verifying a hit in a hash table), then this check is largely
   redundant.  */

#define STRCMP_NO_PRECHECK	0

	/* This version uses Thumb-2 code.  */
	.thumb
	.syntax unified

#ifdef __ARM_BIG_ENDIAN
#define S2LO	lsl
#define S2LOEQ	lsleq
#define S2HI	lsr
#define MSB	0x000000ff
#define LSB	0xff000000
#define BYTE0_OFFSET	24
#define BYTE1_OFFSET	16
#define BYTE2_OFFSET	8
#define BYTE3_OFFSET	0
#else /* not __ARM_BIG_ENDIAN */
#define S2LO	lsr
#define S2LOEQ	lsreq
#define S2HI	lsl
#define BYTE0_OFFSET	0
#define BYTE1_OFFSET	8
#define BYTE2_OFFSET	16
#define BYTE3_OFFSET	24
#define MSB	0xff000000
#define LSB	0x000000ff
#endif /* not __ARM_BIG_ENDIAN */

/* Parameters and result.  */
#define src1	r0
#define src2	r1
#define result	r0	/* Overlaps src1.  */

/* Internal variables.  */
#define tmp1	r4
#define tmp2	r5
#define const_m1	r12

/* Additional internal variables for 64-bit aligned data.  */
#define data1a	r2
#define data1b	r3
#define data2a	r6
#define data2b	r7
#define syndrome_a	tmp1
#define syndrome_b	tmp2

/* Additional internal variables for 32-bit aligned data.  */
#define data1	r2
#define data2	r3
#define syndrome	tmp2


	/* Macro to compute and return the result value for word-aligned
	   cases.  */
	.macro strcmp_epilogue_aligned synd d1 d2 restore_r6
#ifdef __ARM_BIG_ENDIAN
	/* If data1 contains a zero byte, then syndrome will contain a 1 in
	   bit 7 of that byte.  Otherwise, the highest set bit in the
	   syndrome will highlight the first different bit.  It is therefore
	   sufficient to extract the eight bits starting with the syndrome
	   bit.  */
	clz	tmp1, \synd
	lsl	r1, \d2, tmp1
	.if \restore_r6
	ldrd	r6, r7, [sp, #8]
	.endif
	.cfi_restore 6
	.cfi_restore 7
	lsl	\d1, \d1, tmp1
	.cfi_remember_state
	lsr	result, \d1, #24
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	sub	result, result, r1, lsr #24
	bx	lr
#else
	/* To use the big-endian trick we'd have to reverse all three words.
	   That's slower than this approach.  */
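	/* REV makes the syndrome big-endian so that CLZ counts up to the
	   first marked byte in string order; BIC then rounds the bit index
	   down to a byte boundary before the data bytes are shifted into
	   place.  */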
	rev	\synd, \synd
	clz	tmp1, \synd
	bic	tmp1, tmp1, #7
	lsr	r1, \d2, tmp1
	.cfi_remember_state
	.if \restore_r6
	ldrd	r6, r7, [sp, #8]
	.endif
	.cfi_restore 6
	.cfi_restore 7
	lsr	\d1, \d1, tmp1
	and	result, \d1, #255
	and	r1, r1, #255
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	sub	result, result, r1

	bx	lr
#endif
	.endm

	.text
	.p2align	5
.Lstrcmp_start_addr:
#if STRCMP_NO_PRECHECK == 0
.Lfastpath_exit:
	sub	r0, r2, r3
	bx	lr
	nop
#endif
ENTRY_ALIGN (__strcmp_arm, 0)
#if STRCMP_NO_PRECHECK == 0
	ldrb	r2, [src1]
	ldrb	r3, [src2]
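	/* Exit via .Lfastpath_exit (returning r2 - r3) when the first byte
	   of src1 is NUL or the first bytes differ.  CMP r2, #1 sets the
	   carry exactly when r2 is non-zero, so CMPCS only compares the two
	   bytes for a non-NUL first character.  */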
	cmp	r2, #1
	it	cs
	cmpcs	r2, r3
	bne	.Lfastpath_exit
#endif
	strd	r4, r5, [sp, #-16]!
	.cfi_def_cfa_offset 16
	.cfi_offset 4, -16
	.cfi_offset 5, -12
	orr	tmp1, src1, src2
	strd	r6, r7, [sp, #8]
	.cfi_offset 6, -8
	.cfi_offset 7, -4
	mvn	const_m1, #0
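	/* tmp1 = src1 | src2; shifting left by 29 keeps only the low three
	   bits, which are all zero exactly when both pointers are 8-byte
	   aligned.  */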
	lsl	r2, tmp1, #29
	cbz	r2, .Lloop_aligned8

.Lnot_aligned:
	eor	tmp1, src1, src2
	tst	tmp1, #7
	bne	.Lmisaligned8

	/* Deal with mutual misalignment by aligning downwards and then
	   masking off the unwanted loaded data to prevent a difference.  */
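	/* Bytes loaded from before the true start of the strings are forced
	   to 0xff in both data words, so they compare equal and can never
	   look like a NUL terminator.  */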
	and	tmp1, src1, #7
	bic	src1, src1, #7
	and	tmp2, tmp1, #3
	bic	src2, src2, #7
	lsl	tmp2, tmp2, #3	/* Bytes -> bits.  */
	ldrd	data1a, data1b, [src1], #16
	tst	tmp1, #4
	ldrd	data2a, data2b, [src2], #16
	/* In thumb code we can't use MVN with a register shift, but
	   we do have ORN.  */
	S2HI	tmp1, const_m1, tmp2
	orn	data1a, data1a, tmp1
	orn	data2a, data2a, tmp1
	beq	.Lstart_realigned8
	orn	data1b, data1b, tmp1
	mov	data1a, const_m1
	orn	data2b, data2b, tmp1
	mov	data2a, const_m1
	b	.Lstart_realigned8

	/* Unroll the inner loop by a factor of 2, giving 16 bytes per
	   pass.  */
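	/* The pointers are post-incremented by 16 up front, so the second
	   pair of doublewords in each iteration is loaded at offset -8.  */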
	.p2align 5,,12	/* Don't start in the tail bytes of a cache line.  */
	.p2align 2	/* Always word aligned.  */
.Lloop_aligned8:
	ldrd	data1a, data1b, [src1], #16
	ldrd	data2a, data2b, [src2], #16
.Lstart_realigned8:
	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits.  */
	eor	syndrome_a, data1a, data2a
	sel	syndrome_a, syndrome_a, const_m1
	cbnz	syndrome_a, .Ldiff_in_a
	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
	eor	syndrome_b, data1b, data2b
	sel	syndrome_b, syndrome_b, const_m1
	cbnz	syndrome_b, .Ldiff_in_b

	ldrd	data1a, data1b, [src1, #-8]
	ldrd	data2a, data2b, [src2, #-8]
	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits.  */
	eor	syndrome_a, data1a, data2a
	sel	syndrome_a, syndrome_a, const_m1
	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
	eor	syndrome_b, data1b, data2b
	sel	syndrome_b, syndrome_b, const_m1
	/* Can't use CBZ for backwards branch.  */
	orrs	syndrome_b, syndrome_b, syndrome_a	/* Only need if s_a == 0 */
	beq	.Lloop_aligned8

.Ldiff_found:
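	/* Reached by falling out of the loop above, so the syndromes here
	   belong to the second pair of doublewords; report the mismatch from
	   whichever word holds it.  */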
	cbnz	syndrome_a, .Ldiff_in_a

.Ldiff_in_b:
	strcmp_epilogue_aligned syndrome_b, data1b, data2b 1

.Ldiff_in_a:
	.cfi_restore_state
	strcmp_epilogue_aligned syndrome_a, data1a, data2a 1

	.cfi_restore_state
.Lmisaligned8:
	tst	tmp1, #3
	bne	.Lmisaligned4
	ands	tmp1, src1, #3
	bne	.Lmutual_align4

	/* Unrolled by a factor of 2, to reduce the number of post-increment
	   operations.  */
.Lloop_aligned4:
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
.Lstart_realigned4:
	uadd8	syndrome, data1, const_m1	/* Only need GE bits.  */
	eor	syndrome, data1, data2
	sel	syndrome, syndrome, const_m1
	cbnz	syndrome, .Laligned4_done
	ldr	data1, [src1, #-4]
	ldr	data2, [src2, #-4]
	uadd8	syndrome, data1, const_m1
	eor	syndrome, data1, data2
	sel	syndrome, syndrome, const_m1
	cmp	syndrome, #0
	beq	.Lloop_aligned4

.Laligned4_done:
	strcmp_epilogue_aligned syndrome, data1, data2, 0

.Lmutual_align4:
	.cfi_restore_state
	/* Deal with mutual misalignment by aligning downwards and then
	   masking off the unwanted loaded data to prevent a difference.  */
	lsl	tmp1, tmp1, #3	/* Bytes -> bits.  */
	bic	src1, src1, #3
	ldr	data1, [src1], #8
	bic	src2, src2, #3
	ldr	data2, [src2], #8

	/* In thumb code we can't use MVN with a register shift, but
	   we do have ORN.  */
	S2HI	tmp1, const_m1, tmp1
	orn	data1, data1, tmp1
	orn	data2, data2, tmp1
	b	.Lstart_realigned4

.Lmisaligned4:
	ands	tmp1, src1, #3
	beq	.Lsrc1_aligned
	sub	src2, src2, tmp1
	bic	src1, src1, #3
	lsls	tmp1, tmp1, #31
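	/* C = src1[1], Z = src1[0]; mirrors the src2 test at .Lsrc1_aligned
	   below.  */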
	ldr	data1, [src1], #4
	beq	.Laligned_m2
	bcs	.Laligned_m1

#if STRCMP_NO_PRECHECK == 1
	ldrb	data2, [src2, #1]
	uxtb	tmp1, data1, ror #BYTE1_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	cbz	data2, .Lmisaligned_exit

.Laligned_m2:
	ldrb	data2, [src2, #2]
	uxtb	tmp1, data1, ror #BYTE2_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	cbz	data2, .Lmisaligned_exit

.Laligned_m1:
	ldrb	data2, [src2, #3]
	uxtb	tmp1, data1, ror #BYTE3_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	add	src2, src2, #4
	cbnz	data2, .Lsrc1_aligned
#else /* STRCMP_NO_PRECHECK */
	/* If we've done the pre-check, then we don't need to check the
	   first byte again here.  */
	ldrb	data2, [src2, #2]
	uxtb	tmp1, data1, ror #BYTE2_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	cbz	data2, .Lmisaligned_exit

.Laligned_m2:
	ldrb	data2, [src2, #3]
	uxtb	tmp1, data1, ror #BYTE3_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	cbnz	data2, .Laligned_m1
#endif

.Lmisaligned_exit:
	.cfi_remember_state
	mov	result, tmp1
	ldr	r4, [sp], #16
	.cfi_restore 4
	bx	lr

#if STRCMP_NO_PRECHECK == 0
.Laligned_m1:
	add	src2, src2, #4
#endif
.Lsrc1_aligned:
	.cfi_restore_state
	/* src1 is word aligned, but src2 has no common alignment
	   with it.  */
	ldr	data1, [src1], #4
	lsls	tmp1, src2, #31	/* C=src2[1], Z=src2[0].  */

	bic	src2, src2, #3
	ldr	data2, [src2], #4
	bhi	.Loverlap1	/* C=1, Z=0 => src2[1:0] = 0b11.  */
	bcs	.Loverlap2	/* C=1, Z=1 => src2[1:0] = 0b10.  */

	/* (overlap3) C=0, Z=0 => src2[1:0] = 0b01.  */
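	/* On a little-endian build each iteration compares the low three
	   bytes of data1 with the top three bytes of data2 (the S2LO #8
	   shift), then the remaining byte of data1 with the first byte of
	   the next word loaded from src2 (the S2HI #24 compare).  */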
.Loverlap3:
	bic	tmp1, data1, #MSB
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #8
	sel	syndrome, syndrome, const_m1
	bne	4f
	cbnz	syndrome, 5f
	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #24
	bne	6f
	ldr	data1, [src1], #4
	b	.Loverlap3
4:
	S2LO	data2, data2, #8
	b	.Lstrcmp_tail

5:
	bics	syndrome, syndrome, #MSB
	bne	.Lstrcmp_done_equal

	/* We can only get here if the MSB of data1 contains 0, so
	   fast-path the exit.  */
	ldrb	result, [src2]
	.cfi_remember_state
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	/* R6/7 Not used in this sequence.  */
	.cfi_restore 6
	.cfi_restore 7
	neg	result, result
	bx	lr

6:
	.cfi_restore_state
	S2LO	data1, data1, #24
	and	data2, data2, #LSB
	b	.Lstrcmp_tail

	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */
.Loverlap2:
	and	tmp1, data1, const_m1, S2LO #16
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #16
	sel	syndrome, syndrome, const_m1
	bne	4f
	cbnz	syndrome, 5f
	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #16
	bne	6f
	ldr	data1, [src1], #4
	b	.Loverlap2
4:
	S2LO	data2, data2, #16
	b	.Lstrcmp_tail
5:
	ands	syndrome, syndrome, const_m1, S2LO #16
	bne	.Lstrcmp_done_equal

	ldrh	data2, [src2]
	S2LO	data1, data1, #16
#ifdef __ARM_BIG_ENDIAN
	lsl	data2, data2, #16
#endif
	b	.Lstrcmp_tail

6:
	S2LO	data1, data1, #16
	and	data2, data2, const_m1, S2LO #16
	b	.Lstrcmp_tail

	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */
.Loverlap1:
	and	tmp1, data1, #LSB
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #24
	sel	syndrome, syndrome, const_m1
	bne	4f
	cbnz	syndrome, 5f
	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #8
	bne	6f
	ldr	data1, [src1], #4
	b	.Loverlap1
4:
	S2LO	data2, data2, #24
	b	.Lstrcmp_tail
5:
	tst	syndrome, #LSB
	bne	.Lstrcmp_done_equal
	ldr	data2, [src2]
6:
	S2LO	data1, data1, #8
	bic	data2, data2, #MSB
	b	.Lstrcmp_tail

.Lstrcmp_done_equal:
	mov	result, #0
	.cfi_remember_state
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	/* R6/7 not used in this sequence.  */
	.cfi_restore 6
	.cfi_restore 7
	bx	lr

.Lstrcmp_tail:
	.cfi_restore_state
#ifndef __ARM_BIG_ENDIAN
	rev	data1, data1
	rev	data2, data2
	/* Now everything looks big-endian...  */
#endif
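	/* CLZ finds the most significant syndrome bit, i.e. the first
	   difference or NUL in string order now that the data is big-endian;
	   shifting both data words left by that count and subtracting their
	   top eight bits gives a correctly signed result.  */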
	uadd8	tmp1, data1, const_m1
	eor	tmp1, data1, data2
	sel	syndrome, tmp1, const_m1
	clz	tmp1, syndrome
	lsl	data1, data1, tmp1
	lsl	data2, data2, tmp1
	lsr	result, data1, #24
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	/* R6/7 not used in this sequence.  */
	.cfi_restore 6
	.cfi_restore 7
	sub	result, result, data2, lsr #24
	bx	lr

END (__strcmp_arm)