/*
 * Copyright (c) 2012-2014 ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* Implementation of strcmp for ARMv7 when DSP instructions are
   available.  Use ldrd to support wider loads, provided the data
   is sufficiently aligned.  Use saturating arithmetic to optimize
   the compares.  */

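/* Note on the technique (a summary of the code below): UADD8 performs
   four parallel byte additions of 0xff and sets each GE flag to that
   byte's carry out, so a GE bit is set exactly when the corresponding
   byte of data1 is non-zero.  SEL then builds a "syndrome" word that
   takes the EOR difference in bytes where GE is set and 0xff where
   data1 held a NUL.  The syndrome is therefore zero iff the two words
   are equal and contain no terminator.  Roughly, in C terms:

     ge[i]   = (data1.byte[i] != 0);          /. carry out of +0xff ./
     diff    = data1 ^ data2;
     synd[i] = ge[i] ? diff.byte[i] : 0xff;   /. 0 iff equal, no NUL ./  */
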
/* Build Options:
   STRCMP_NO_PRECHECK: Don't run a quick pre-check of the first
   byte in the string.  If comparing completely random strings
   the pre-check will save time, since there is a very high
   probability of a mismatch in the first character: we save
   significant overhead if this is the common case.  However,
   if strings are likely to be identical (e.g. because we're
   verifying a hit in a hash table), then this check is largely
   redundant.  */

#define STRCMP_NO_PRECHECK 0

        /* This version uses Thumb-2 code.  */
        .thumb
        .syntax unified

#ifdef __ARM_BIG_ENDIAN
#define S2LO lsl
#define S2LOEQ lsleq
#define S2HI lsr
#define MSB 0x000000ff
#define LSB 0xff000000
#define BYTE0_OFFSET 24
#define BYTE1_OFFSET 16
#define BYTE2_OFFSET 8
#define BYTE3_OFFSET 0
#else /* not __ARM_BIG_ENDIAN */
#define S2LO lsr
#define S2LOEQ lsreq
#define S2HI lsl
#define BYTE0_OFFSET 0
#define BYTE1_OFFSET 8
#define BYTE2_OFFSET 16
#define BYTE3_OFFSET 24
#define MSB 0xff000000
#define LSB 0x000000ff
#endif /* not __ARM_BIG_ENDIAN */
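
/* The macros above abstract byte order: S2LO shifts a loaded word so
   that characters move towards the start of the string (discarding
   leading characters), S2HI shifts the other way, LSB and MSB mask the
   bytes holding the first and last character of a word in string
   order, and BYTEn_OFFSET is the rotation that brings character n into
   the least significant byte.  */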

        .macro def_fn f p2align=0
        .text
        .p2align \p2align
        .global \f
        .type \f, %function
\f:
        .endm

/* Parameters and result.  */
#define src1 r0
#define src2 r1
#define result r0  /* Overlaps src1.  */

/* Internal variables.  */
#define tmp1 r4
#define tmp2 r5
#define const_m1 r12

/* Additional internal variables for 64-bit aligned data.  */
#define data1a r2
#define data1b r3
#define data2a r6
#define data2b r7
#define syndrome_a tmp1
#define syndrome_b tmp2

/* Additional internal variables for 32-bit aligned data.  */
#define data1 r2
#define data2 r3
#define syndrome tmp2


        /* Macro to compute and return the result value for word-aligned
           cases.  */
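        /* On entry the syndrome is known to be non-zero.  In outline:
           find the first marked byte in string order (via CLZ, after a
           byte reverse on little-endian), shift both data words so that
           byte can be extracted, and return the difference between the
           byte from data1 and the byte from data2.  */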
        .macro strcmp_epilogue_aligned synd d1 d2 restore_r6
#ifdef __ARM_BIG_ENDIAN
        /* If data1 contains a zero byte, then syndrome will contain a 1 in
           bit 7 of that byte.  Otherwise, the highest set bit in the
           syndrome will highlight the first different bit.  It is therefore
           sufficient to extract the eight bits starting with the syndrome
           bit.  */
        clz tmp1, \synd
        lsl r1, \d2, tmp1
        .if \restore_r6
        ldrd r6, r7, [sp, #8]
        .endif
        .cfi_restore 6
        .cfi_restore 7
        lsl \d1, \d1, tmp1
        .cfi_remember_state
        lsr result, \d1, #24
        ldrd r4, r5, [sp], #16
        .cfi_restore 4
        .cfi_restore 5
        sub result, result, r1, lsr #24
        bx lr
#else
        /* To use the big-endian trick we'd have to reverse all three words;
           that's slower than this approach.  */
        rev \synd, \synd
        clz tmp1, \synd
        bic tmp1, tmp1, #7
        lsr r1, \d2, tmp1
        .cfi_remember_state
        .if \restore_r6
        ldrd r6, r7, [sp, #8]
        .endif
        .cfi_restore 6
        .cfi_restore 7
        lsr \d1, \d1, tmp1
        and result, \d1, #255
        and r1, r1, #255
        ldrd r4, r5, [sp], #16
        .cfi_restore 4
        .cfi_restore 5
        sub result, result, r1

        bx lr
#endif
        .endm

        .text
        .p2align 5
.Lstrcmp_start_addr:
#if STRCMP_NO_PRECHECK == 0
.Lfastpath_exit:
        sub r0, r2, r3
        bx lr
        nop
#endif
def_fn __strcmp_arm
#if STRCMP_NO_PRECHECK == 0
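        /* Quick pre-check of the first byte: CMP r2, #1 leaves the carry
           clear when s1[0] is NUL, so the conditional compare is skipped
           and we exit; otherwise the first bytes are compared directly
           and we exit early on a mismatch.  .Lfastpath_exit returns
           r2 - r3, which is the right result in both cases.  */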
        ldrb r2, [src1]
        ldrb r3, [src2]
        cmp r2, #1
        it cs
        cmpcs r2, r3
        bne .Lfastpath_exit
#endif
        .cfi_startproc
        strd r4, r5, [sp, #-16]!
        .cfi_def_cfa_offset 16
        .cfi_offset 4, -16
        .cfi_offset 5, -12
        orr tmp1, src1, src2
        strd r6, r7, [sp, #8]
        .cfi_offset 6, -8
        .cfi_offset 7, -4
        mvn const_m1, #0
        lsl r2, tmp1, #29
        cbz r2, .Lloop_aligned8

.Lnot_aligned:
        eor tmp1, src1, src2
        tst tmp1, #7
        bne .Lmisaligned8

        /* Deal with mutual misalignment by aligning downwards and then
           masking off the unwanted loaded data to prevent a difference.  */
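        /* The loaded bytes that precede the real start of each string are
           forced to 0xff (ORN ORs in the complement of the shifted mask),
           so they always compare equal and can never be mistaken for a
           NUL terminator.  */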
        and tmp1, src1, #7
        bic src1, src1, #7
        and tmp2, tmp1, #3
        bic src2, src2, #7
        lsl tmp2, tmp2, #3  /* Bytes -> bits.  */
        ldrd data1a, data1b, [src1], #16
        tst tmp1, #4
        ldrd data2a, data2b, [src2], #16
        /* In thumb code we can't use MVN with a register shift, but
           we do have ORN.  */
        S2HI tmp1, const_m1, tmp2
        orn data1a, data1a, tmp1
        orn data2a, data2a, tmp1
        beq .Lstart_realigned8
        orn data1b, data1b, tmp1
        mov data1a, const_m1
        orn data2b, data2b, tmp1
        mov data2a, const_m1
        b .Lstart_realigned8

        /* Unwind the inner loop by a factor of 2, giving 16 bytes per
           pass.  */
        .p2align 5,,12  /* Don't start in the tail bytes of a cache line.  */
        .p2align 2  /* Always word aligned.  */
.Lloop_aligned8:
        ldrd data1a, data1b, [src1], #16
        ldrd data2a, data2b, [src2], #16
.Lstart_realigned8:
        uadd8 syndrome_b, data1a, const_m1  /* Only want GE bits.  */
        eor syndrome_a, data1a, data2a
        sel syndrome_a, syndrome_a, const_m1
        cbnz syndrome_a, .Ldiff_in_a
        uadd8 syndrome_b, data1b, const_m1  /* Only want GE bits.  */
        eor syndrome_b, data1b, data2b
        sel syndrome_b, syndrome_b, const_m1
        cbnz syndrome_b, .Ldiff_in_b

        ldrd data1a, data1b, [src1, #-8]
        ldrd data2a, data2b, [src2, #-8]
        uadd8 syndrome_b, data1a, const_m1  /* Only want GE bits.  */
        eor syndrome_a, data1a, data2a
        sel syndrome_a, syndrome_a, const_m1
        uadd8 syndrome_b, data1b, const_m1  /* Only want GE bits.  */
        eor syndrome_b, data1b, data2b
        sel syndrome_b, syndrome_b, const_m1
        /* Can't use CBZ for backwards branch.  */
        orrs syndrome_b, syndrome_b, syndrome_a  /* Only need if s_a == 0 */
        beq .Lloop_aligned8

.Ldiff_found:
        cbnz syndrome_a, .Ldiff_in_a

.Ldiff_in_b:
        strcmp_epilogue_aligned syndrome_b, data1b, data2b 1

.Ldiff_in_a:
        .cfi_restore_state
        strcmp_epilogue_aligned syndrome_a, data1a, data2a 1

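        /* src1 and src2 do not share 8-byte alignment.  If they agree
           modulo 4, either both are already word aligned (drop into the
           4-byte loop) or they can be mutually aligned downwards;
           otherwise realign byte by byte until src1 is word aligned.  */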
        .cfi_restore_state
.Lmisaligned8:
        tst tmp1, #3
        bne .Lmisaligned4
        ands tmp1, src1, #3
        bne .Lmutual_align4

        /* Unrolled by a factor of 2, to reduce the number of post-increment
           operations.  */
.Lloop_aligned4:
        ldr data1, [src1], #8
        ldr data2, [src2], #8
.Lstart_realigned4:
        uadd8 syndrome, data1, const_m1  /* Only need GE bits.  */
        eor syndrome, data1, data2
        sel syndrome, syndrome, const_m1
        cbnz syndrome, .Laligned4_done
        ldr data1, [src1, #-4]
        ldr data2, [src2, #-4]
        uadd8 syndrome, data1, const_m1
        eor syndrome, data1, data2
        sel syndrome, syndrome, const_m1
        cmp syndrome, #0
        beq .Lloop_aligned4

.Laligned4_done:
        strcmp_epilogue_aligned syndrome, data1, data2, 0

.Lmutual_align4:
        .cfi_restore_state
        /* Deal with mutual misalignment by aligning downwards and then
           masking off the unwanted loaded data to prevent a difference.  */
        lsl tmp1, tmp1, #3  /* Bytes -> bits.  */
        bic src1, src1, #3
        ldr data1, [src1], #8
        bic src2, src2, #3
        ldr data2, [src2], #8

        /* In thumb code we can't use MVN with a register shift, but
           we do have ORN.  */
        S2HI tmp1, const_m1, tmp1
        orn data1, data1, tmp1
        orn data2, data2, tmp1
        b .Lstart_realigned4

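        /* src1 and src2 have different offsets within a word.  Wind src1
           down to a word boundary, bias src2 by the same amount and
           compare the remaining leading bytes one at a time (the
           pre-check, when enabled, has already covered the first
           character), exiting through .Lmisaligned_exit on a difference
           or NUL.  Once src1 is word aligned, continue at
           .Lsrc1_aligned.  */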
.Lmisaligned4:
        ands tmp1, src1, #3
        beq .Lsrc1_aligned
        sub src2, src2, tmp1
        bic src1, src1, #3
        lsls tmp1, tmp1, #31
        ldr data1, [src1], #4
        beq .Laligned_m2
        bcs .Laligned_m1

#if STRCMP_NO_PRECHECK == 1
        ldrb data2, [src2, #1]
        uxtb tmp1, data1, ror #BYTE1_OFFSET
        subs tmp1, tmp1, data2
        bne .Lmisaligned_exit
        cbz data2, .Lmisaligned_exit

.Laligned_m2:
        ldrb data2, [src2, #2]
        uxtb tmp1, data1, ror #BYTE2_OFFSET
        subs tmp1, tmp1, data2
        bne .Lmisaligned_exit
        cbz data2, .Lmisaligned_exit

.Laligned_m1:
        ldrb data2, [src2, #3]
        uxtb tmp1, data1, ror #BYTE3_OFFSET
        subs tmp1, tmp1, data2
        bne .Lmisaligned_exit
        add src2, src2, #4
        cbnz data2, .Lsrc1_aligned
#else /* STRCMP_NO_PRECHECK */
        /* If we've done the pre-check, then we don't need to check the
           first byte again here.  */
        ldrb data2, [src2, #2]
        uxtb tmp1, data1, ror #BYTE2_OFFSET
        subs tmp1, tmp1, data2
        bne .Lmisaligned_exit
        cbz data2, .Lmisaligned_exit

.Laligned_m2:
        ldrb data2, [src2, #3]
        uxtb tmp1, data1, ror #BYTE3_OFFSET
        subs tmp1, tmp1, data2
        bne .Lmisaligned_exit
        cbnz data2, .Laligned_m1
#endif

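        /* tmp1 already holds s1[i] - s2[i] (zero when a matching NUL was
           found).  Only r4 has been clobbered on this path, so restore
           it alone while dropping the whole 16-byte frame.  */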
.Lmisaligned_exit:
        .cfi_remember_state
        mov result, tmp1
        ldr r4, [sp], #16
        .cfi_restore 4
        bx lr

#if STRCMP_NO_PRECHECK == 0
.Laligned_m1:
        add src2, src2, #4
#endif
.Lsrc1_aligned:
        .cfi_restore_state
        /* src1 is word aligned, but src2 has no common alignment
           with it.  */
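        /* Each src1 word now straddles two consecutive src2 words.  The
           .LoverlapN sequences below handle the case where N bytes of
           the src1 word overlap the first of those src2 words and 4 - N
           the next, using the usual UADD8/SEL syndrome to watch for NUL
           along the way.  */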
        ldr data1, [src1], #4
        lsls tmp1, src2, #31  /* C=src2[1], Z=src2[0].  */

        bic src2, src2, #3
        ldr data2, [src2], #4
        bhi .Loverlap1  /* C=1, Z=0 => src2[1:0] = 0b11.  */
        bcs .Loverlap2  /* C=1, Z=1 => src2[1:0] = 0b10.  */

        /* (overlap3) C=0, Z=0 => src2[1:0] = 0b01.  */
.Loverlap3:
        bic tmp1, data1, #MSB
        uadd8 syndrome, data1, const_m1
        eors syndrome, tmp1, data2, S2LO #8
        sel syndrome, syndrome, const_m1
        bne 4f
        cbnz syndrome, 5f
        ldr data2, [src2], #4
        eor tmp1, tmp1, data1
        cmp tmp1, data2, S2HI #24
        bne 6f
        ldr data1, [src1], #4
        b .Loverlap3
4:
        S2LO data2, data2, #8
        b .Lstrcmp_tail

5:
        bics syndrome, syndrome, #MSB
        bne .Lstrcmp_done_equal

        /* We can only get here if the MSB of data1 contains 0, so
           fast-path the exit.  */
        ldrb result, [src2]
        .cfi_remember_state
        ldrd r4, r5, [sp], #16
        .cfi_restore 4
        .cfi_restore 5
        /* R6/7 Not used in this sequence.  */
        .cfi_restore 6
        .cfi_restore 7
        neg result, result
        bx lr

6:
        .cfi_restore_state
        S2LO data1, data1, #24
        and data2, data2, #LSB
        b .Lstrcmp_tail

        .p2align 5,,12  /* Ensure at least 3 instructions in cache line.  */
.Loverlap2:
        and tmp1, data1, const_m1, S2LO #16
        uadd8 syndrome, data1, const_m1
        eors syndrome, tmp1, data2, S2LO #16
        sel syndrome, syndrome, const_m1
        bne 4f
        cbnz syndrome, 5f
        ldr data2, [src2], #4
        eor tmp1, tmp1, data1
        cmp tmp1, data2, S2HI #16
        bne 6f
        ldr data1, [src1], #4
        b .Loverlap2
4:
        S2LO data2, data2, #16
        b .Lstrcmp_tail
5:
        ands syndrome, syndrome, const_m1, S2LO #16
        bne .Lstrcmp_done_equal

        ldrh data2, [src2]
        S2LO data1, data1, #16
#ifdef __ARM_BIG_ENDIAN
        lsl data2, data2, #16
#endif
        b .Lstrcmp_tail

6:
        S2LO data1, data1, #16
        and data2, data2, const_m1, S2LO #16
        b .Lstrcmp_tail

        .p2align 5,,12  /* Ensure at least 3 instructions in cache line.  */
.Loverlap1:
        and tmp1, data1, #LSB
        uadd8 syndrome, data1, const_m1
        eors syndrome, tmp1, data2, S2LO #24
        sel syndrome, syndrome, const_m1
        bne 4f
        cbnz syndrome, 5f
        ldr data2, [src2], #4
        eor tmp1, tmp1, data1
        cmp tmp1, data2, S2HI #8
        bne 6f
        ldr data1, [src1], #4
        b .Loverlap1
4:
        S2LO data2, data2, #24
        b .Lstrcmp_tail
5:
        tst syndrome, #LSB
        bne .Lstrcmp_done_equal
        ldr data2, [src2]
6:
        S2LO data1, data1, #8
        bic data2, data2, #MSB
        b .Lstrcmp_tail

.Lstrcmp_done_equal:
        mov result, #0
        .cfi_remember_state
        ldrd r4, r5, [sp], #16
        .cfi_restore 4
        .cfi_restore 5
        /* R6/7 not used in this sequence.  */
        .cfi_restore 6
        .cfi_restore 7
        bx lr

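        /* By now data1 and data2 have been shifted/masked so that the
           corresponding string bytes line up and the first difference
           or NUL lies within them.  Work in big-endian order so that
           CLZ on the syndrome finds the first interesting byte in
           string order, then return the difference of the two bytes at
           that position.  */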
.Lstrcmp_tail:
        .cfi_restore_state
#ifndef __ARM_BIG_ENDIAN
        rev data1, data1
        rev data2, data2
        /* Now everything looks big-endian...  */
#endif
        uadd8 tmp1, data1, const_m1
        eor tmp1, data1, data2
        sel syndrome, tmp1, const_m1
        clz tmp1, syndrome
        lsl data1, data1, tmp1
        lsl data2, data2, tmp1
        lsr result, data1, #24
        ldrd r4, r5, [sp], #16
        .cfi_restore 4
        .cfi_restore 5
        /* R6/7 not used in this sequence.  */
        .cfi_restore 6
        .cfi_restore 7
        sub result, result, data2, lsr #24
        bx lr
        .cfi_endproc
        .size __strcmp_arm, . - .Lstrcmp_start_addr