/*
 * strcmp for ARMv7
 *
 * Copyright (c) 2012-2019, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Implementation of strcmp for ARMv7 when DSP instructions are
   available. Use ldrd to support wider loads, provided the data
   is sufficiently aligned. Use saturating arithmetic to optimize
   the compares. */
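
/* For reference, the behaviour being implemented is the usual
   byte-at-a-time strcmp, with the bytes compared as unsigned chars.
   A minimal C sketch (not part of the build, shown only to document
   the intended result):

	int strcmp (const char *s1, const char *s2)
	{
	  const unsigned char *p1 = (const unsigned char *) s1;
	  const unsigned char *p2 = (const unsigned char *) s2;

	  while (*p1 != '\0' && *p1 == *p2)
	    p1++, p2++;
	  return *p1 - *p2;
	}

   The code below produces the same result a word (or doubleword) at a
   time, using uadd8/sel to detect NUL bytes and differences. */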

/* Build Options:
   STRCMP_NO_PRECHECK: Don't run a quick pre-check of the first
   byte in the string. If comparing completely random strings
   the pre-check will save time, since there is a very high
   probability of a mismatch in the first character: we save
   significant overhead if this is the common case. However,
   if strings are likely to be identical (eg because we're
   verifying a hit in a hash table), then this check is largely
   redundant. */
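
/* With the pre-check enabled (STRCMP_NO_PRECHECK == 0), the entry code
   is roughly equivalent to the following C fast path (a sketch only):

	if (s1[0] != s2[0] || s1[0] == '\0')
	  return (unsigned char) s1[0] - (unsigned char) s2[0];

   so strings that differ (or terminate) at the very first byte return
   without entering the word-at-a-time code. */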

#define STRCMP_NO_PRECHECK	0

	/* This version uses Thumb-2 code. */
	.thumb
	.syntax unified

#ifdef __ARM_BIG_ENDIAN
#define S2LO lsl
#define S2LOEQ lsleq
#define S2HI lsr
#define MSB 0x000000ff
#define LSB 0xff000000
#define BYTE0_OFFSET 24
#define BYTE1_OFFSET 16
#define BYTE2_OFFSET 8
#define BYTE3_OFFSET 0
#else /* not __ARM_BIG_ENDIAN */
#define S2LO lsr
#define S2LOEQ lsreq
#define S2HI lsl
#define BYTE0_OFFSET 0
#define BYTE1_OFFSET 8
#define BYTE2_OFFSET 16
#define BYTE3_OFFSET 24
#define MSB 0xff000000
#define LSB 0x000000ff
#endif /* not __ARM_BIG_ENDIAN */
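
/* Whichever endianness is in use, LSB masks the string's first byte
   within a loaded word (the byte at the lowest address) and MSB masks
   the last, while BYTEn_OFFSET is the rotation that brings string byte
   n down to bits 7:0; for example "uxtb tmp1, data1, ror #BYTE2_OFFSET"
   extracts the third byte in string order on either endianness. */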

	.macro def_fn f p2align=0
	.text
	.p2align \p2align
	.global \f
	.type \f, %function
\f:
	.endm

/* Parameters and result. */
#define src1		r0
#define src2		r1
#define result		r0	/* Overlaps src1. */

/* Internal variables. */
#define tmp1		r4
#define tmp2		r5
#define const_m1	r12

/* Additional internal variables for 64-bit aligned data. */
#define data1a		r2
#define data1b		r3
#define data2a		r6
#define data2b		r7
#define syndrome_a	tmp1
#define syndrome_b	tmp2

/* Additional internal variables for 32-bit aligned data. */
#define data1		r2
#define data2		r3
#define syndrome	tmp2


	/* Macro to compute and return the result value for word-aligned
	   cases. */
	.macro strcmp_epilogue_aligned synd d1 d2 restore_r6
#ifdef __ARM_BIG_ENDIAN
	/* If data1 contains a zero byte, then syndrome will contain a 1 in
	   bit 7 of that byte. Otherwise, the highest set bit in the
	   syndrome will highlight the first different bit. It is therefore
	   sufficient to extract the eight bits starting with the syndrome
	   bit. */
	clz	tmp1, \synd
	lsl	r1, \d2, tmp1
	.if \restore_r6
	ldrd	r6, r7, [sp, #8]
	.endif
	.cfi_restore 6
	.cfi_restore 7
	lsl	\d1, \d1, tmp1
	.cfi_remember_state
	lsr	result, \d1, #24
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	sub	result, result, r1, lsr #24
	bx	lr
#else
	/* To use the big-endian trick we'd have to reverse all three words.
	   That's slower than this approach. */
	rev	\synd, \synd
	clz	tmp1, \synd
	bic	tmp1, tmp1, #7
	lsr	r1, \d2, tmp1
	.cfi_remember_state
	.if \restore_r6
	ldrd	r6, r7, [sp, #8]
	.endif
	.cfi_restore 6
	.cfi_restore 7
	lsr	\d1, \d1, tmp1
	and	result, \d1, #255
	and	r1, r1, #255
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	sub	result, result, r1

	bx	lr
#endif
	.endm
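
	/* In effect the macro computes, for the little-endian path
	   (a rough C sketch; d1, d2 and synd are the macro arguments):

		n = 8 * (index of the first nonzero byte of synd,
			 counted in string order);
		return ((d1 >> n) & 0xff) - ((d2 >> n) & 0xff);

	   i.e. the difference of the first pair of bytes at which the
	   strings differ, or of d1's NUL byte and the corresponding d2
	   byte.  The big-endian path obtains the same ordering by
	   extracting the eight bits that start at the first set syndrome
	   bit rather than realigning to a byte boundary. */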

	.text
	.p2align	5
.Lstrcmp_start_addr:
#if STRCMP_NO_PRECHECK == 0
.Lfastpath_exit:
	sub	r0, r2, r3
	bx	lr
	nop
#endif
def_fn	__strcmp_arm
#if STRCMP_NO_PRECHECK == 0
	ldrb	r2, [src1]
	ldrb	r3, [src2]
	cmp	r2, #1
	it	cs
	cmpcs	r2, r3
	bne	.Lfastpath_exit
#endif
	.cfi_startproc
	strd	r4, r5, [sp, #-16]!
	.cfi_def_cfa_offset 16
	.cfi_offset 4, -16
	.cfi_offset 5, -12
	orr	tmp1, src1, src2
	strd	r6, r7, [sp, #8]
	.cfi_offset 6, -8
	.cfi_offset 7, -4
	mvn	const_m1, #0
	lsl	r2, tmp1, #29	/* Keep only the low three alignment bits. */
	cbz	r2, .Lloop_aligned8	/* Both strings 8-byte aligned. */

.Lnot_aligned:
	eor	tmp1, src1, src2
	tst	tmp1, #7
	bne	.Lmisaligned8

	/* Deal with mutual misalignment by aligning downwards and then
	   masking off the unwanted loaded data to prevent a difference. */
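	/* The ORN sequence below forces the bytes loaded from before the
	   start of each string to 0xff, so they can neither produce a
	   spurious difference nor look like a NUL terminator.  When the
	   mutual misalignment is 4 or more, the whole first word of each
	   pair (data1a/data2a) is replaced with 0xff and only the second
	   word needs the partial mask. */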
	and	tmp1, src1, #7
	bic	src1, src1, #7
	and	tmp2, tmp1, #3
	bic	src2, src2, #7
	lsl	tmp2, tmp2, #3	/* Bytes -> bits. */
	ldrd	data1a, data1b, [src1], #16
	tst	tmp1, #4
	ldrd	data2a, data2b, [src2], #16
	/* In thumb code we can't use MVN with a register shift, but
	   we do have ORN. */
	S2HI	tmp1, const_m1, tmp2
	orn	data1a, data1a, tmp1
	orn	data2a, data2a, tmp1
	beq	.Lstart_realigned8
	orn	data1b, data1b, tmp1
	mov	data1a, const_m1
	orn	data2b, data2b, tmp1
	mov	data2a, const_m1
	b	.Lstart_realigned8

	/* Unroll the inner loop by a factor of 2, giving 16 bytes per
	   pass. */
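	/* The NUL/difference test in the loop relies on uadd8/sel:
	   adding 0xff to every byte of data1 sets the per-byte GE flag
	   exactly for the bytes that are nonzero (the lane only carries
	   out when the byte is not 0), and sel then builds a syndrome,
	   roughly:

		syndrome[i] = data1[i] != 0 ? data1[i] ^ data2[i] : 0xff;

	   so the syndrome is zero precisely when the two words are equal
	   and data1 contains no NUL. */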
	.p2align 5,,12	/* Don't start in the tail bytes of a cache line. */
	.p2align 2	/* Always word aligned. */
.Lloop_aligned8:
	ldrd	data1a, data1b, [src1], #16
	ldrd	data2a, data2b, [src2], #16
.Lstart_realigned8:
	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits, */
	eor	syndrome_a, data1a, data2a
	sel	syndrome_a, syndrome_a, const_m1
	cbnz	syndrome_a, .Ldiff_in_a
	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits. */
	eor	syndrome_b, data1b, data2b
	sel	syndrome_b, syndrome_b, const_m1
	cbnz	syndrome_b, .Ldiff_in_b

	ldrd	data1a, data1b, [src1, #-8]
	ldrd	data2a, data2b, [src2, #-8]
	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits, */
	eor	syndrome_a, data1a, data2a
	sel	syndrome_a, syndrome_a, const_m1
	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits. */
	eor	syndrome_b, data1b, data2b
	sel	syndrome_b, syndrome_b, const_m1
	/* Can't use CBZ for backwards branch. */
	orrs	syndrome_b, syndrome_b, syndrome_a	/* Only need if s_a == 0 */
	beq	.Lloop_aligned8

.Ldiff_found:
	cbnz	syndrome_a, .Ldiff_in_a

.Ldiff_in_b:
	strcmp_epilogue_aligned syndrome_b, data1b, data2b 1

.Ldiff_in_a:
	.cfi_restore_state
	strcmp_epilogue_aligned syndrome_a, data1a, data2a 1

	.cfi_restore_state
.Lmisaligned8:
	tst	tmp1, #3
	bne	.Lmisaligned4
	ands	tmp1, src1, #3
	bne	.Lmutual_align4

	/* Unrolled by a factor of 2, to reduce the number of post-increment
	   operations. */
.Lloop_aligned4:
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
.Lstart_realigned4:
	uadd8	syndrome, data1, const_m1	/* Only need GE bits. */
	eor	syndrome, data1, data2
	sel	syndrome, syndrome, const_m1
	cbnz	syndrome, .Laligned4_done
	ldr	data1, [src1, #-4]
	ldr	data2, [src2, #-4]
	uadd8	syndrome, data1, const_m1
	eor	syndrome, data1, data2
	sel	syndrome, syndrome, const_m1
	cmp	syndrome, #0
	beq	.Lloop_aligned4

.Laligned4_done:
	strcmp_epilogue_aligned syndrome, data1, data2, 0

.Lmutual_align4:
	.cfi_restore_state
	/* Deal with mutual misalignment by aligning downwards and then
	   masking off the unwanted loaded data to prevent a difference. */
	lsl	tmp1, tmp1, #3	/* Bytes -> bits. */
	bic	src1, src1, #3
	ldr	data1, [src1], #8
	bic	src2, src2, #3
	ldr	data2, [src2], #8

	/* In thumb code we can't use MVN with a register shift, but
	   we do have ORN. */
	S2HI	tmp1, const_m1, tmp1
	orn	data1, data1, tmp1
	orn	data2, data2, tmp1
	b	.Lstart_realigned4

.Lmisaligned4:
	ands	tmp1, src1, #3
	beq	.Lsrc1_aligned
	sub	src2, src2, tmp1
	bic	src1, src1, #3
	lsls	tmp1, tmp1, #31	/* C=src1[1], Z set iff src1[0]==0. */
	ldr	data1, [src1], #4
	beq	.Laligned_m2
	bcs	.Laligned_m1

#if STRCMP_NO_PRECHECK == 1
	ldrb	data2, [src2, #1]
	uxtb	tmp1, data1, ror #BYTE1_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	cbz	data2, .Lmisaligned_exit

.Laligned_m2:
	ldrb	data2, [src2, #2]
	uxtb	tmp1, data1, ror #BYTE2_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	cbz	data2, .Lmisaligned_exit

.Laligned_m1:
	ldrb	data2, [src2, #3]
	uxtb	tmp1, data1, ror #BYTE3_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	add	src2, src2, #4
	cbnz	data2, .Lsrc1_aligned
#else /* STRCMP_NO_PRECHECK */
	/* If we've done the pre-check, then we don't need to check the
	   first byte again here. */
	ldrb	data2, [src2, #2]
	uxtb	tmp1, data1, ror #BYTE2_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	cbz	data2, .Lmisaligned_exit

.Laligned_m2:
	ldrb	data2, [src2, #3]
	uxtb	tmp1, data1, ror #BYTE3_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	cbnz	data2, .Laligned_m1
#endif

.Lmisaligned_exit:
	.cfi_remember_state
	mov	result, tmp1
	ldr	r4, [sp], #16
	.cfi_restore 4
	bx	lr

#if STRCMP_NO_PRECHECK == 0
.Laligned_m1:
	add	src2, src2, #4
#endif
.Lsrc1_aligned:
	.cfi_restore_state
	/* src1 is word aligned, but src2 has no common alignment
	   with it. */
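	/* Each .LoverlapN block handles src2 % 4 == 4 - N: a full word of
	   src1 is compared against the N useful bytes of the current src2
	   word spliced together with 4 - N bytes of the next one.  For
	   example, .Loverlap3 (src2 % 4 == 1) checks src1 bytes 0-2
	   against bytes 1-3 of the src2 word first, then src1 byte 3
	   against byte 0 of the following src2 word. */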
	ldr	data1, [src1], #4
	lsls	tmp1, src2, #31	/* C=src2[1], Z=src2[0]. */

	bic	src2, src2, #3
	ldr	data2, [src2], #4
	bhi	.Loverlap1	/* C=1, Z=0 => src2[1:0] = 0b11. */
	bcs	.Loverlap2	/* C=1, Z=1 => src2[1:0] = 0b10. */

	/* (overlap3) C=0, Z=0 => src2[1:0] = 0b01. */
.Loverlap3:
	bic	tmp1, data1, #MSB
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #8
	sel	syndrome, syndrome, const_m1
	bne	4f
	cbnz	syndrome, 5f
	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #24
	bne	6f
	ldr	data1, [src1], #4
	b	.Loverlap3
4:
	S2LO	data2, data2, #8
	b	.Lstrcmp_tail

5:
	bics	syndrome, syndrome, #MSB
	bne	.Lstrcmp_done_equal

	/* We can only get here if the MSB of data1 contains 0, so
	   fast-path the exit. */
	ldrb	result, [src2]
	.cfi_remember_state
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	/* R6/7 Not used in this sequence. */
	.cfi_restore 6
	.cfi_restore 7
	neg	result, result
	bx	lr

6:
	.cfi_restore_state
	S2LO	data1, data1, #24
	and	data2, data2, #LSB
	b	.Lstrcmp_tail

	.p2align 5,,12	/* Ensure at least 3 instructions in cache line. */
.Loverlap2:
	and	tmp1, data1, const_m1, S2LO #16
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #16
	sel	syndrome, syndrome, const_m1
	bne	4f
	cbnz	syndrome, 5f
	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #16
	bne	6f
	ldr	data1, [src1], #4
	b	.Loverlap2
4:
	S2LO	data2, data2, #16
	b	.Lstrcmp_tail
5:
	ands	syndrome, syndrome, const_m1, S2LO #16
	bne	.Lstrcmp_done_equal

	ldrh	data2, [src2]
	S2LO	data1, data1, #16
#ifdef __ARM_BIG_ENDIAN
	lsl	data2, data2, #16
#endif
	b	.Lstrcmp_tail

6:
	S2LO	data1, data1, #16
	and	data2, data2, const_m1, S2LO #16
	b	.Lstrcmp_tail

	.p2align 5,,12	/* Ensure at least 3 instructions in cache line. */
.Loverlap1:
	and	tmp1, data1, #LSB
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #24
	sel	syndrome, syndrome, const_m1
	bne	4f
	cbnz	syndrome, 5f
	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #8
	bne	6f
	ldr	data1, [src1], #4
	b	.Loverlap1
4:
	S2LO	data2, data2, #24
	b	.Lstrcmp_tail
5:
	tst	syndrome, #LSB
	bne	.Lstrcmp_done_equal
	ldr	data2, [src2]
6:
	S2LO	data1, data1, #8
	bic	data2, data2, #MSB
	b	.Lstrcmp_tail

.Lstrcmp_done_equal:
	mov	result, #0
	.cfi_remember_state
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	/* R6/7 not used in this sequence. */
	.cfi_restore 6
	.cfi_restore 7
	bx	lr

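	/* Common tail: data1 and data2 arrive holding the bytes still to
	   be compared, shifted so that the first NUL in data1 or the
	   first difference from data2 lies within them.  The tail builds
	   a syndrome with uadd8/sel and then extracts the result exactly
	   as the aligned epilogue does. */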
.Lstrcmp_tail:
	.cfi_restore_state
#ifndef __ARM_BIG_ENDIAN
	rev	data1, data1
	rev	data2, data2
	/* Now everything looks big-endian... */
#endif
	uadd8	tmp1, data1, const_m1
	eor	tmp1, data1, data2
	sel	syndrome, tmp1, const_m1
	clz	tmp1, syndrome
	lsl	data1, data1, tmp1
	lsl	data2, data2, tmp1
	lsr	result, data1, #24
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	/* R6/7 not used in this sequence. */
	.cfi_restore 6
	.cfi_restore 7
	sub	result, result, data2, lsr #24
	bx	lr
	.cfi_endproc
	.size	__strcmp_arm, . - .Lstrcmp_start_addr