blob: d99cfc5ce460371faed81c1c8e215bb0f37e12f7 [file] [log] [blame]
Elliott Hughes5b808042021-10-01 10:56:10 -07001/*************************************************
2* Perl-Compatible Regular Expressions *
3*************************************************/
4
5/* PCRE is a library of functions to support regular expressions whose syntax
6and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 This module by Zoltan Herczeg
10 Original API code Copyright (c) 1997-2012 University of Cambridge
11 New API code Copyright (c) 2016-2019 University of Cambridge
12
13-----------------------------------------------------------------------------
14Redistribution and use in source and binary forms, with or without
15modification, are permitted provided that the following conditions are met:
16
17 * Redistributions of source code must retain the above copyright notice,
18 this list of conditions and the following disclaimer.
19
20 * Redistributions in binary form must reproduce the above copyright
21 notice, this list of conditions and the following disclaimer in the
22 documentation and/or other materials provided with the distribution.
23
24 * Neither the name of the University of Cambridge nor the names of its
25 contributors may be used to endorse or promote products derived from
26 this software without specific prior written permission.
27
28THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
29AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
32LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
33CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
35INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
36CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38POSSIBILITY OF SUCH DAMAGE.
39-----------------------------------------------------------------------------
40*/
41
42#if !(defined SUPPORT_VALGRIND)
43
44#if ((defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86) \
45 || (defined SLJIT_CONFIG_S390X && SLJIT_CONFIG_S390X))
46
/* Strategy used when matching a vector of input code units against the
reference vector(s). */
typedef enum {
  /* Equality test against a single reference vector. */
  vector_compare_match1 = 0,
  /* OR a bit pattern into the input first, then test against one reference. */
  vector_compare_match1i = 1,
  /* Test against two reference vectors and OR the two results. */
  vector_compare_match2 = 2
} vector_compare_type;
52
53static SLJIT_INLINE sljit_u32 max_fast_forward_char_pair_offset(void)
54{
55#if PCRE2_CODE_UNIT_WIDTH == 8
56return 15;
57#elif PCRE2_CODE_UNIT_WIDTH == 16
58return 7;
59#elif PCRE2_CODE_UNIT_WIDTH == 32
60return 3;
61#else
62#error "Unsupported unit width"
63#endif
64}
65
66#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
67static struct sljit_jump *jump_if_utf_char_start(struct sljit_compiler *compiler, sljit_s32 reg)
68{
69#if PCRE2_CODE_UNIT_WIDTH == 8
70OP2(SLJIT_AND, reg, 0, reg, 0, SLJIT_IMM, 0xc0);
71return CMP(SLJIT_NOT_EQUAL, reg, 0, SLJIT_IMM, 0x80);
72#elif PCRE2_CODE_UNIT_WIDTH == 16
73OP2(SLJIT_AND, reg, 0, reg, 0, SLJIT_IMM, 0xfc00);
74return CMP(SLJIT_NOT_EQUAL, reg, 0, SLJIT_IMM, 0xdc00);
75#else
76#error "Unknown code width"
77#endif
78}
79#endif
80
81#endif /* SLJIT_CONFIG_X86 || SLJIT_CONFIG_S390X */
82
83#if (defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86)
84
85static sljit_s32 character_to_int32(PCRE2_UCHAR chr)
86{
87sljit_u32 value = chr;
88#if PCRE2_CODE_UNIT_WIDTH == 8
89#define SSE2_COMPARE_TYPE_INDEX 0
90return (sljit_s32)((value << 24) | (value << 16) | (value << 8) | value);
91#elif PCRE2_CODE_UNIT_WIDTH == 16
92#define SSE2_COMPARE_TYPE_INDEX 1
93return (sljit_s32)((value << 16) | value);
94#elif PCRE2_CODE_UNIT_WIDTH == 32
95#define SSE2_COMPARE_TYPE_INDEX 2
96return (sljit_s32)(value);
97#else
98#error "Unsupported unit width"
99#endif
100}
101
102static void load_from_mem_sse2(struct sljit_compiler *compiler, sljit_s32 dst_xmm_reg, sljit_s32 src_general_reg, sljit_s8 offset)
103{
104sljit_u8 instruction[5];
105
106SLJIT_ASSERT(dst_xmm_reg < 8);
107SLJIT_ASSERT(src_general_reg < 8);
108
109/* MOVDQA xmm1, xmm2/m128 */
110instruction[0] = ((sljit_u8)offset & 0xf) == 0 ? 0x66 : 0xf3;
111instruction[1] = 0x0f;
112instruction[2] = 0x6f;
113
114if (offset == 0)
115 {
116 instruction[3] = (dst_xmm_reg << 3) | src_general_reg;
117 sljit_emit_op_custom(compiler, instruction, 4);
118 return;
119 }
120
121instruction[3] = 0x40 | (dst_xmm_reg << 3) | src_general_reg;
122instruction[4] = (sljit_u8)offset;
123sljit_emit_op_custom(compiler, instruction, 5);
124}
125
126static void fast_forward_char_pair_sse2_compare(struct sljit_compiler *compiler, vector_compare_type compare_type,
127 int step, sljit_s32 dst_ind, sljit_s32 cmp1_ind, sljit_s32 cmp2_ind, sljit_s32 tmp_ind)
128{
129sljit_u8 instruction[4];
130instruction[0] = 0x66;
131instruction[1] = 0x0f;
132
133SLJIT_ASSERT(step >= 0 && step <= 3);
134
135if (compare_type != vector_compare_match2)
136 {
137 if (step == 0)
138 {
139 if (compare_type == vector_compare_match1i)
140 {
141 /* POR xmm1, xmm2/m128 */
142 /* instruction[0] = 0x66; */
143 /* instruction[1] = 0x0f; */
144 instruction[2] = 0xeb;
145 instruction[3] = 0xc0 | (dst_ind << 3) | cmp2_ind;
146 sljit_emit_op_custom(compiler, instruction, 4);
147 }
148 return;
149 }
150
151 if (step != 2)
152 return;
153
154 /* PCMPEQB/W/D xmm1, xmm2/m128 */
155 /* instruction[0] = 0x66; */
156 /* instruction[1] = 0x0f; */
157 instruction[2] = 0x74 + SSE2_COMPARE_TYPE_INDEX;
158 instruction[3] = 0xc0 | (dst_ind << 3) | cmp1_ind;
159 sljit_emit_op_custom(compiler, instruction, 4);
160 return;
161 }
162
163switch (step)
164 {
165 case 0:
166 /* MOVDQA xmm1, xmm2/m128 */
167 /* instruction[0] = 0x66; */
168 /* instruction[1] = 0x0f; */
169 instruction[2] = 0x6f;
170 instruction[3] = 0xc0 | (tmp_ind << 3) | dst_ind;
171 sljit_emit_op_custom(compiler, instruction, 4);
172 return;
173
174 case 1:
175 /* PCMPEQB/W/D xmm1, xmm2/m128 */
176 /* instruction[0] = 0x66; */
177 /* instruction[1] = 0x0f; */
178 instruction[2] = 0x74 + SSE2_COMPARE_TYPE_INDEX;
179 instruction[3] = 0xc0 | (dst_ind << 3) | cmp1_ind;
180 sljit_emit_op_custom(compiler, instruction, 4);
181 return;
182
183 case 2:
184 /* PCMPEQB/W/D xmm1, xmm2/m128 */
185 /* instruction[0] = 0x66; */
186 /* instruction[1] = 0x0f; */
187 instruction[2] = 0x74 + SSE2_COMPARE_TYPE_INDEX;
188 instruction[3] = 0xc0 | (tmp_ind << 3) | cmp2_ind;
189 sljit_emit_op_custom(compiler, instruction, 4);
190 return;
191
192 case 3:
193 /* POR xmm1, xmm2/m128 */
194 /* instruction[0] = 0x66; */
195 /* instruction[1] = 0x0f; */
196 instruction[2] = 0xeb;
197 instruction[3] = 0xc0 | (dst_ind << 3) | tmp_ind;
198 sljit_emit_op_custom(compiler, instruction, 4);
199 return;
200 }
201}
202
/* The SSE2 single character search below is only usable when the CPU reports SSE2. */
203#define JIT_HAS_FAST_FORWARD_CHAR_SIMD (sljit_has_cpu_feature(SLJIT_HAS_SSE2))
204
/* Emits SSE2 code that moves STR_PTR forward to the first code unit equal to
char1 or char2, examining 16 bytes per iteration.

How the two characters are matched:
  - char1 == char2: one equality compare (vector_compare_match1).
  - they differ in exactly one bit: that bit is ORed into the data and into
    the reference, then one compare is done (vector_compare_match1i).
  - otherwise: two compares whose results are ORed (vector_compare_match2).

When the search reaches STR_END: in PCRE2_JIT_COMPLETE mode the generated
code jumps to common->failed_match; in the other modes STR_PTR is clamped to
STR_END instead.

offset is only used in UTF builds with 8/16 bit code units: when common->utf
is set and offset > 0, the code unit at STR_PTR - offset is inspected and the
search is restarted one code unit later if it is not the start of a
character. TMP1 and TMP2 are overwritten, as are XMM registers 0-3. */
205static void fast_forward_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2, sljit_s32 offset)
206{
207DEFINE_COMPILER;
208sljit_u8 instruction[8];
209struct sljit_label *start;
210#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
211struct sljit_label *restart;
212#endif
213struct sljit_jump *quit;
214struct sljit_jump *partial_quit[2];
215vector_compare_type compare_type = vector_compare_match1;
/* Machine register numbers, written directly into ModRM bytes below. */
216sljit_s32 tmp1_reg_ind = sljit_get_register_index(TMP1);
217sljit_s32 str_ptr_reg_ind = sljit_get_register_index(STR_PTR);
/* XMM register assignment: input data, scratch, and the two reference vectors. */
218sljit_s32 data_ind = 0;
219sljit_s32 tmp_ind = 1;
220sljit_s32 cmp1_ind = 2;
221sljit_s32 cmp2_ind = 3;
222sljit_u32 bit = 0;
223int i;
224
225SLJIT_UNUSED_ARG(offset);
226
/* Select the compare strategy; bit stays non-zero only for match1i. */
227if (char1 != char2)
228 {
229 bit = char1 ^ char2;
230 compare_type = vector_compare_match1i;
231
232 if (!is_powerof2(bit))
233 {
234 bit = 0;
235 compare_type = vector_compare_match2;
236 }
237 }
238
/* Nothing to search when STR_PTR is already at or past STR_END. */
239partial_quit[0] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
240if (common->mode == PCRE2_JIT_COMPLETE)
241 add_jump(compiler, &common->failed_match, partial_quit[0]);
242
243/* First part (unaligned start) */
244
/* First reference value, replicated to fill 32 bits. */
245OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char1 | bit));
246
247SLJIT_ASSERT(tmp1_reg_ind < 8);
248
249/* MOVD xmm, r/m32 */
250instruction[0] = 0x66;
251instruction[1] = 0x0f;
252instruction[2] = 0x6e;
253instruction[3] = 0xc0 | (cmp1_ind << 3) | tmp1_reg_ind;
254sljit_emit_op_custom(compiler, instruction, 4);
255
/* Second reference value: the differing bit (match1i) or char2 (match2). */
256if (char1 != char2)
257 {
258 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(bit != 0 ? bit : char2));
259
260 /* MOVD xmm, r/m32 */
261 instruction[3] = 0xc0 | (cmp2_ind << 3) | tmp1_reg_ind;
262 sljit_emit_op_custom(compiler, instruction, 4);
263 }
264
/* TMP2 keeps the unaligned start position. */
265OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);
266
/* An immediate of 0 copies the low 32 bits into all four dwords of the register. */
267/* PSHUFD xmm1, xmm2/m128, imm8 */
268/* instruction[0] = 0x66; */
269/* instruction[1] = 0x0f; */
270instruction[2] = 0x70;
271instruction[3] = 0xc0 | (cmp1_ind << 3) | cmp1_ind;
272instruction[4] = 0;
273sljit_emit_op_custom(compiler, instruction, 5);
274
275if (char1 != char2)
276 {
277 /* PSHUFD xmm1, xmm2/m128, imm8 */
278 instruction[3] = 0xc0 | (cmp2_ind << 3) | cmp2_ind;
279 sljit_emit_op_custom(compiler, instruction, 5);
280 }
281
/* The UTF check at the end of the function jumps back here to search again. */
282#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
283restart = LABEL();
284#endif
/* Round STR_PTR down to a 16 byte boundary; TMP2 becomes the distance (0-15)
of the real start position from that boundary. */
285OP2(SLJIT_AND, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, ~0xf);
286OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xf);
287
288load_from_mem_sse2(compiler, data_ind, str_ptr_reg_ind, 0);
289for (i = 0; i < 4; i++)
290 fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
291
/* TMP1 receives one bit per byte of the comparison result. */
292/* PMOVMSKB reg, xmm */
293/* instruction[0] = 0x66; */
294/* instruction[1] = 0x0f; */
295instruction[2] = 0xd7;
296instruction[3] = 0xc0 | (tmp1_reg_ind << 3) | data_ind;
297sljit_emit_op_custom(compiler, instruction, 4);
298
/* Restore the real start position and shift out the mask bits that belong to
bytes before it. */
299OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
300OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);
301
302quit = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);
303
/* No match in the first block: return to the aligned address for the loop. */
304OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
305
306/* Second part (aligned) */
307start = LABEL();
308
309OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);
310
311partial_quit[1] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
312if (common->mode == PCRE2_JIT_COMPLETE)
313 add_jump(compiler, &common->failed_match, partial_quit[1]);
314
315load_from_mem_sse2(compiler, data_ind, str_ptr_reg_ind, 0);
316for (i = 0; i < 4; i++)
317 fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
318
319/* PMOVMSKB reg, xmm */
320/* instruction[0] = 0x66; */
321/* instruction[1] = 0x0f; */
322instruction[2] = 0xd7;
323instruction[3] = 0xc0 | (tmp1_reg_ind << 3) | data_ind;
324sljit_emit_op_custom(compiler, instruction, 4);
325
/* Keep looping while the block contains no match. */
326CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);
327
328JUMPHERE(quit);
329
/* The index of the lowest set mask bit is the byte offset of the first match. */
330/* BSF r32, r/m32 */
331instruction[0] = 0x0f;
332instruction[1] = 0xbc;
333instruction[2] = 0xc0 | (tmp1_reg_ind << 3) | tmp1_reg_ind;
334sljit_emit_op_custom(compiler, instruction, 3);
335
336OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
337
/* Outside complete mode both end-of-input exits land here and STR_PTR is
limited to STR_END; in complete mode a match found at or past STR_END is a
failure. */
338if (common->mode != PCRE2_JIT_COMPLETE)
339 {
340 JUMPHERE(partial_quit[0]);
341 JUMPHERE(partial_quit[1]);
Elliott Hughes4e19c8e2022-04-15 15:11:02 -0700342 OP2U(SLJIT_SUB | SLJIT_SET_GREATER, STR_PTR, 0, STR_END, 0);
Elliott Hughes5b808042021-10-01 10:56:10 -0700343 CMOV(SLJIT_GREATER, STR_PTR, STR_END, 0);
344 }
345else
346 add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
347
348#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
349if (common->utf && offset > 0)
350 {
351 SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE);
352
/* Inspect the code unit offset units before the match position. */
353 OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offset));
354
355 quit = jump_if_utf_char_start(compiler, TMP1);
356
/* Not a character start: step one code unit forward and search again. */
357 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
358 add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
359 OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);
360 JUMPTO(SLJIT_JUMP, restart);
361
362 JUMPHERE(quit);
363 }
364#endif
365}
366
/* The SSE2 requested character search below is only usable when the CPU reports SSE2. */
367#define JIT_HAS_FAST_REQUESTED_CHAR_SIMD (sljit_has_cpu_feature(SLJIT_HAS_SSE2))
368
/* Emits SSE2 code that searches for char1 or char2 starting at the address
held in TMP1 and ending before STR_END, examining 16 bytes per iteration.

On the found path TMP1 holds the address of the first matching code unit.
The returned jump list collects every exit taken when no match exists before
STR_END; the caller decides where those jumps go.

STR_PTR is used as the scan pointer: its original value is kept in TMP3 and
restored on the found path only (the not-found exits leave before the
restore). TMP2 and XMM registers 0-3 are overwritten. The compare strategy
is selected from char1/char2 exactly as in fast_forward_char_simd(). */
369static jump_list *fast_requested_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2)
370{
371DEFINE_COMPILER;
372sljit_u8 instruction[8];
373struct sljit_label *start;
374struct sljit_jump *quit;
375jump_list *not_found = NULL;
376vector_compare_type compare_type = vector_compare_match1;
/* Machine register numbers, written directly into ModRM bytes below. */
377sljit_s32 tmp1_reg_ind = sljit_get_register_index(TMP1);
378sljit_s32 str_ptr_reg_ind = sljit_get_register_index(STR_PTR);
/* XMM register assignment: input data, scratch, and the two reference vectors. */
379sljit_s32 data_ind = 0;
380sljit_s32 tmp_ind = 1;
381sljit_s32 cmp1_ind = 2;
382sljit_s32 cmp2_ind = 3;
383sljit_u32 bit = 0;
384int i;
385
/* Select the compare strategy; bit stays non-zero only for match1i. */
386if (char1 != char2)
387 {
388 bit = char1 ^ char2;
389 compare_type = vector_compare_match1i;
390
391 if (!is_powerof2(bit))
392 {
393 bit = 0;
394 compare_type = vector_compare_match2;
395 }
396 }
397
/* Fail immediately if the start address is not before STR_END; otherwise
park the start address in TMP2 and the caller's STR_PTR in TMP3. */
398add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));
399OP1(SLJIT_MOV, TMP2, 0, TMP1, 0);
400OP1(SLJIT_MOV, TMP3, 0, STR_PTR, 0);
401
402/* First part (unaligned start) */
403
/* First reference value, replicated to fill 32 bits. */
404OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char1 | bit));
405
406SLJIT_ASSERT(tmp1_reg_ind < 8);
407
408/* MOVD xmm, r/m32 */
409instruction[0] = 0x66;
410instruction[1] = 0x0f;
411instruction[2] = 0x6e;
412instruction[3] = 0xc0 | (cmp1_ind << 3) | tmp1_reg_ind;
413sljit_emit_op_custom(compiler, instruction, 4);
414
/* Second reference value: the differing bit (match1i) or char2 (match2). */
415if (char1 != char2)
416 {
417 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(bit != 0 ? bit : char2));
418
419 /* MOVD xmm, r/m32 */
420 instruction[3] = 0xc0 | (cmp2_ind << 3) | tmp1_reg_ind;
421 sljit_emit_op_custom(compiler, instruction, 4);
422 }
423
/* From here on STR_PTR is the scan pointer. */
424OP1(SLJIT_MOV, STR_PTR, 0, TMP2, 0);
425
/* An immediate of 0 copies the low 32 bits into all four dwords of the register. */
426/* PSHUFD xmm1, xmm2/m128, imm8 */
427/* instruction[0] = 0x66; */
428/* instruction[1] = 0x0f; */
429instruction[2] = 0x70;
430instruction[3] = 0xc0 | (cmp1_ind << 3) | cmp1_ind;
431instruction[4] = 0;
432sljit_emit_op_custom(compiler, instruction, 5);
433
434if (char1 != char2)
435 {
436 /* PSHUFD xmm1, xmm2/m128, imm8 */
437 instruction[3] = 0xc0 | (cmp2_ind << 3) | cmp2_ind;
438 sljit_emit_op_custom(compiler, instruction, 5);
439 }
440
/* Round the scan pointer down to a 16 byte boundary; TMP2 becomes the
distance (0-15) of the real start position from that boundary. */
441OP2(SLJIT_AND, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, ~0xf);
442OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xf);
443
444load_from_mem_sse2(compiler, data_ind, str_ptr_reg_ind, 0);
445for (i = 0; i < 4; i++)
446 fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
447
/* TMP1 receives one bit per byte of the comparison result. */
448/* PMOVMSKB reg, xmm */
449/* instruction[0] = 0x66; */
450/* instruction[1] = 0x0f; */
451instruction[2] = 0xd7;
452instruction[3] = 0xc0 | (tmp1_reg_ind << 3) | data_ind;
453sljit_emit_op_custom(compiler, instruction, 4);
454
/* Restore the real start position and shift out the mask bits that belong to
bytes before it. */
455OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
456OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);
457
458quit = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);
459
/* No match in the first block: return to the aligned address for the loop. */
460OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
461
462/* Second part (aligned) */
463start = LABEL();
464
465OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);
466
467add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
468
469load_from_mem_sse2(compiler, data_ind, str_ptr_reg_ind, 0);
470for (i = 0; i < 4; i++)
471 fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
472
473/* PMOVMSKB reg, xmm */
474/* instruction[0] = 0x66; */
475/* instruction[1] = 0x0f; */
476instruction[2] = 0xd7;
477instruction[3] = 0xc0 | (tmp1_reg_ind << 3) | data_ind;
478sljit_emit_op_custom(compiler, instruction, 4);
479
/* Keep looping while the block contains no match. */
480CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);
481
482JUMPHERE(quit);
483
/* The index of the lowest set mask bit is the byte offset of the first match. */
484/* BSF r32, r/m32 */
485instruction[0] = 0x0f;
486instruction[1] = 0xbc;
487instruction[2] = 0xc0 | (tmp1_reg_ind << 3) | tmp1_reg_ind;
488sljit_emit_op_custom(compiler, instruction, 3);
489
/* TMP1 = address of the match; a match at or past STR_END does not count. */
490OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, STR_PTR, 0);
491add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));
492
/* Give STR_PTR back to the caller. */
493OP1(SLJIT_MOV, STR_PTR, 0, TMP3, 0);
494return not_found;
495}
496
497#ifndef _WIN64
498
/* The SSE2 character pair search below is only usable when the CPU reports SSE2. */
499#define JIT_HAS_FAST_FORWARD_CHAR_PAIR_SIMD (sljit_has_cpu_feature(SLJIT_HAS_SSE2))
500
/* Emits SSE2 code that moves STR_PTR forward to the first position where the
code unit at offset offs1 is char1a or char1b AND the code unit at offset
offs2 is char2a or char2b. Requires PCRE2_JIT_COMPLETE mode and
offs1 > offs2, with the distance between the offsets limited by
max_fast_forward_char_pair_offset(). When no such position exists before
STR_END the generated code jumps to common->failed_match.

Internally STR_PTR is advanced by offs1 so that it addresses the first
character of the pair; the second character is then read diff bytes earlier.
The two comparison results are ANDed, so a set mask bit means both
characters matched. offs1 is subtracted again before returning.

When common->match_end_ptr is non-zero, STR_END is temporarily lowered (its
original value is kept in TMP3) and restored on the found path at the end.
TMP1, TMP2 and XMM registers 0-7 are overwritten. */
501static void fast_forward_char_pair_simd(compiler_common *common, sljit_s32 offs1,
502 PCRE2_UCHAR char1a, PCRE2_UCHAR char1b, sljit_s32 offs2, PCRE2_UCHAR char2a, PCRE2_UCHAR char2b)
503{
504DEFINE_COMPILER;
505sljit_u8 instruction[8];
506vector_compare_type compare1_type = vector_compare_match1;
507vector_compare_type compare2_type = vector_compare_match1;
508sljit_u32 bit1 = 0;
509sljit_u32 bit2 = 0;
/* Distance between the two characters, in bytes. */
510sljit_u32 diff = IN_UCHARS(offs1 - offs2);
/* Machine register numbers, written directly into ModRM bytes below. */
511sljit_s32 tmp1_reg_ind = sljit_get_register_index(TMP1);
512sljit_s32 tmp2_reg_ind = sljit_get_register_index(TMP2);
513sljit_s32 str_ptr_reg_ind = sljit_get_register_index(STR_PTR);
/* XMM register assignment: two data vectors, two scratch registers, and the
a/b reference vectors of each character. */
514sljit_s32 data1_ind = 0;
515sljit_s32 data2_ind = 1;
516sljit_s32 tmp1_ind = 2;
517sljit_s32 tmp2_ind = 3;
518sljit_s32 cmp1a_ind = 4;
519sljit_s32 cmp1b_ind = 5;
520sljit_s32 cmp2a_ind = 6;
521sljit_s32 cmp2b_ind = 7;
522struct sljit_label *start;
523#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
524struct sljit_label *restart;
525#endif
526struct sljit_jump *jump[2];
527int i;
528
529SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE && offs1 > offs2);
530SLJIT_ASSERT(diff <= IN_UCHARS(max_fast_forward_char_pair_offset()));
531SLJIT_ASSERT(tmp1_reg_ind < 8 && tmp2_reg_ind == 1);
532
533/* Initialize. */
/* Save STR_END in TMP3, then lower STR_END to (stored value + offs1 + 1 code
units) when that is smaller. */
534if (common->match_end_ptr != 0)
535 {
536 OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr);
537 OP1(SLJIT_MOV, TMP3, 0, STR_END, 0);
538 OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(offs1 + 1));
539
Elliott Hughes4e19c8e2022-04-15 15:11:02 -0700540 OP2U(SLJIT_SUB | SLJIT_SET_LESS, TMP1, 0, STR_END, 0);
Elliott Hughes5b808042021-10-01 10:56:10 -0700541 CMOV(SLJIT_LESS, STR_END, TMP1, 0);
542 }
543
/* Point STR_PTR at the first character of the pair. */
544OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));
545add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
546
547/* MOVD xmm, r/m32 */
548instruction[0] = 0x66;
549instruction[1] = 0x0f;
550instruction[2] = 0x6e;
551
/* Reference values of the first character: TMP1 receives the "a" value and
TMP2 the "b" value (the differing bit for match1i, char1b for match2). */
552if (char1a == char1b)
553 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char1a));
554else
555 {
556 bit1 = char1a ^ char1b;
557 if (is_powerof2(bit1))
558 {
559 compare1_type = vector_compare_match1i;
560 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char1a | bit1));
561 OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, character_to_int32(bit1));
562 }
563 else
564 {
565 compare1_type = vector_compare_match2;
566 bit1 = 0;
567 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char1a));
568 OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, character_to_int32(char1b));
569 }
570 }
571
572instruction[3] = 0xc0 | (cmp1a_ind << 3) | tmp1_reg_ind;
573sljit_emit_op_custom(compiler, instruction, 4);
574
575if (char1a != char1b)
576 {
577 instruction[3] = 0xc0 | (cmp1b_ind << 3) | tmp2_reg_ind;
578 sljit_emit_op_custom(compiler, instruction, 4);
579 }
580
/* Same again for the second character. */
581if (char2a == char2b)
582 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char2a));
583else
584 {
585 bit2 = char2a ^ char2b;
586 if (is_powerof2(bit2))
587 {
588 compare2_type = vector_compare_match1i;
589 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char2a | bit2));
590 OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, character_to_int32(bit2));
591 }
592 else
593 {
594 compare2_type = vector_compare_match2;
595 bit2 = 0;
596 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char2a));
597 OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, character_to_int32(char2b));
598 }
599 }
600
601instruction[3] = 0xc0 | (cmp2a_ind << 3) | tmp1_reg_ind;
602sljit_emit_op_custom(compiler, instruction, 4);
603
604if (char2a != char2b)
605 {
606 instruction[3] = 0xc0 | (cmp2b_ind << 3) | tmp2_reg_ind;
607 sljit_emit_op_custom(compiler, instruction, 4);
608 }
609
/* An immediate of 0 copies the low 32 bits into all four dwords; this is done
for every reference register that was loaded above. */
610/* PSHUFD xmm1, xmm2/m128, imm8 */
611/* instruction[0] = 0x66; */
612/* instruction[1] = 0x0f; */
613instruction[2] = 0x70;
614instruction[4] = 0;
615
616instruction[3] = 0xc0 | (cmp1a_ind << 3) | cmp1a_ind;
617sljit_emit_op_custom(compiler, instruction, 5);
618
619if (char1a != char1b)
620 {
621 instruction[3] = 0xc0 | (cmp1b_ind << 3) | cmp1b_ind;
622 sljit_emit_op_custom(compiler, instruction, 5);
623 }
624
625instruction[3] = 0xc0 | (cmp2a_ind << 3) | cmp2a_ind;
626sljit_emit_op_custom(compiler, instruction, 5);
627
628if (char2a != char2b)
629 {
630 instruction[3] = 0xc0 | (cmp2b_ind << 3) | cmp2b_ind;
631 sljit_emit_op_custom(compiler, instruction, 5);
632 }
633
/* The UTF check at the end of the function jumps back here to search again. */
634#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
635restart = LABEL();
636#endif
637
/* TMP1 = address of the second character, TMP2 = unaligned position of the
first character; STR_PTR is rounded down to a 16 byte boundary. */
638OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, diff);
639OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);
640OP2(SLJIT_AND, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, ~0xf);
641
642load_from_mem_sse2(compiler, data1_ind, str_ptr_reg_ind, 0);
643
/* If the second character's address is before the aligned block, data2 is
loaded from memory diff bytes below the block. Otherwise (jump[0] taken)
data2 is produced from data1 by a byte shift, with no second memory access.
NOTE(review): presumably this avoids reading below the aligned block when the
bytes there are not known to be part of the subject - confirm. */
644jump[0] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_PTR, 0);
645
646load_from_mem_sse2(compiler, data2_ind, str_ptr_reg_ind, -(sljit_s8)diff);
647jump[1] = JUMP(SLJIT_JUMP);
648
649JUMPHERE(jump[0]);
650
651/* MOVDQA xmm1, xmm2/m128 */
652/* instruction[0] = 0x66; */
653/* instruction[1] = 0x0f; */
654instruction[2] = 0x6f;
655instruction[3] = 0xc0 | (data2_ind << 3) | data1_ind;
656sljit_emit_op_custom(compiler, instruction, 4);
657
/* Byte-wise left shift of the whole register by diff bytes (opcode extension /7). */
658/* PSLLDQ xmm1, imm8 */
659/* instruction[0] = 0x66; */
660/* instruction[1] = 0x0f; */
661instruction[2] = 0x73;
662instruction[3] = 0xc0 | (7 << 3) | data2_ind;
663instruction[4] = diff;
664sljit_emit_op_custom(compiler, instruction, 5);
665
666JUMPHERE(jump[1]);
667
/* TMP2 = distance (0-15) of the real start position from the aligned address. */
668OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xf);
669
/* The steps of the two comparisons are interleaved. */
670for (i = 0; i < 4; i++)
671 {
672 fast_forward_char_pair_sse2_compare(compiler, compare2_type, i, data2_ind, cmp2a_ind, cmp2b_ind, tmp2_ind);
673 fast_forward_char_pair_sse2_compare(compiler, compare1_type, i, data1_ind, cmp1a_ind, cmp1b_ind, tmp1_ind);
674 }
675
/* A position survives only if both characters matched there. */
676/* PAND xmm1, xmm2/m128 */
677/* instruction[0] = 0x66; */
678/* instruction[1] = 0x0f; */
679instruction[2] = 0xdb;
680instruction[3] = 0xc0 | (data1_ind << 3) | data2_ind;
681sljit_emit_op_custom(compiler, instruction, 4);
682
/* The source operand 0 is XMM register 0, i.e. data1_ind. */
683/* PMOVMSKB reg, xmm */
684/* instruction[0] = 0x66; */
685/* instruction[1] = 0x0f; */
686instruction[2] = 0xd7;
687instruction[3] = 0xc0 | (tmp1_reg_ind << 3) | 0;
688sljit_emit_op_custom(compiler, instruction, 4);
689
690/* Ignore matches before the first STR_PTR. */
691OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
692OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);
693
694jump[0] = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);
695
/* No match in the first block: return to the aligned address for the loop. */
696OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
697
698/* Main loop. */
699start = LABEL();
700
701OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);
702add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
703
/* data1 is an aligned load; data2 is read diff bytes earlier. */
704load_from_mem_sse2(compiler, data1_ind, str_ptr_reg_ind, 0);
705load_from_mem_sse2(compiler, data2_ind, str_ptr_reg_ind, -(sljit_s8)diff);
706
707for (i = 0; i < 4; i++)
708 {
709 fast_forward_char_pair_sse2_compare(compiler, compare1_type, i, data1_ind, cmp1a_ind, cmp1b_ind, tmp2_ind);
710 fast_forward_char_pair_sse2_compare(compiler, compare2_type, i, data2_ind, cmp2a_ind, cmp2b_ind, tmp1_ind);
711 }
712
713/* PAND xmm1, xmm2/m128 */
714/* instruction[0] = 0x66; */
715/* instruction[1] = 0x0f; */
716instruction[2] = 0xdb;
717instruction[3] = 0xc0 | (data1_ind << 3) | data2_ind;
718sljit_emit_op_custom(compiler, instruction, 4);
719
720/* PMOVMSKB reg, xmm */
721/* instruction[0] = 0x66; */
722/* instruction[1] = 0x0f; */
723instruction[2] = 0xd7;
724instruction[3] = 0xc0 | (tmp1_reg_ind << 3) | 0;
725sljit_emit_op_custom(compiler, instruction, 4);
726
/* Keep looping while the block contains no match. */
727CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);
728
729JUMPHERE(jump[0]);
730
/* The index of the lowest set mask bit is the byte offset of the first match. */
731/* BSF r32, r/m32 */
732instruction[0] = 0x0f;
733instruction[1] = 0xbc;
734instruction[2] = 0xc0 | (tmp1_reg_ind << 3) | tmp1_reg_ind;
735sljit_emit_op_custom(compiler, instruction, 3);
736
737OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
738
739add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
740
741#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
742if (common->utf)
743 {
/* Inspect the code unit at the candidate match start (STR_PTR - offs1). */
744 OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offs1));
745
746 jump[0] = jump_if_utf_char_start(compiler, TMP1);
747
/* Not a character start: step one code unit forward and search again, or
fail if that reaches STR_END. */
748 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
749 CMPTO(SLJIT_LESS, STR_PTR, 0, STR_END, 0, restart);
750
751 add_jump(compiler, &common->failed_match, JUMP(SLJIT_JUMP));
752
753 JUMPHERE(jump[0]);
754 }
755#endif
756
/* Undo the initial adjustment so that STR_PTR addresses the match start. */
757OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));
758
759if (common->match_end_ptr != 0)
760 OP1(SLJIT_MOV, STR_END, 0, TMP3, 0);
761}
762
763#endif /* !_WIN64 */
764
765#undef SSE2_COMPARE_TYPE_INDEX
766
767#endif /* SLJIT_CONFIG_X86 */
768
769#if (defined SLJIT_CONFIG_ARM_64 && SLJIT_CONFIG_ARM_64 && (defined __ARM_NEON || defined __ARM_NEON__))
770
771#include <arm_neon.h>
772
/* Overlays four byte-sized fields on one unsigned int so that several small
values can be handed over as a single integer. Which bits of x each field
occupies depends on the byte order of the target. */
typedef union {
  unsigned int x;
  struct {
    unsigned char c1;
    unsigned char c2;
    unsigned char c3;
    unsigned char c4;
  } c;
} int_char;
777
778#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
779static SLJIT_INLINE int utf_continue(sljit_u8 *s)
780{
781#if PCRE2_CODE_UNIT_WIDTH == 8
782return (*s & 0xc0) == 0x80;
783#elif PCRE2_CODE_UNIT_WIDTH == 16
784return (*s & 0xfc00) == 0xdc00;
785#else
786#error "Unknown code width"
787#endif
788}
789#endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 */
790
/* Width-specific aliases used by the NEON code:
  VECTOR_FACTOR  number of code units in one 128 bit vector
  vect_t         the matching NEON vector type
  VLD1Q .. VANDQ the load / compare-equal / or / store / duplicate /
                 extract / and intrinsics for that lane width
  quad_word      one vector's worth of data, viewable either per code unit
                 (mem) or as two 64 bit words (dw)
Note that the final #else branch is taken for every width other than 8 and
16; unlike other places in this file there is no #error for unexpected
widths. */
791#if PCRE2_CODE_UNIT_WIDTH == 8
792# define VECTOR_FACTOR 16
793# define vect_t uint8x16_t
794# define VLD1Q(X) vld1q_u8((sljit_u8 *)(X))
795# define VCEQQ vceqq_u8
796# define VORRQ vorrq_u8
797# define VST1Q vst1q_u8
798# define VDUPQ vdupq_n_u8
799# define VEXTQ vextq_u8
800# define VANDQ vandq_u8
801typedef union {
802 uint8_t mem[16];
803 uint64_t dw[2];
804} quad_word;
805#elif PCRE2_CODE_UNIT_WIDTH == 16
806# define VECTOR_FACTOR 8
807# define vect_t uint16x8_t
808# define VLD1Q(X) vld1q_u16((sljit_u16 *)(X))
809# define VCEQQ vceqq_u16
810# define VORRQ vorrq_u16
811# define VST1Q vst1q_u16
812# define VDUPQ vdupq_n_u16
813# define VEXTQ vextq_u16
814# define VANDQ vandq_u16
815typedef union {
816 uint16_t mem[8];
817 uint64_t dw[2];
818} quad_word;
819#else
820# define VECTOR_FACTOR 4
821# define vect_t uint32x4_t
822# define VLD1Q(X) vld1q_u32((sljit_u32 *)(X))
823# define VCEQQ vceqq_u32
824# define VORRQ vorrq_u32
825# define VST1Q vst1q_u32
826# define VDUPQ vdupq_n_u32
827# define VEXTQ vextq_u32
828# define VANDQ vandq_u32
829typedef union {
830 uint32_t mem[4];
831 uint64_t dw[2];
832} quad_word;
833#endif
834
/* pcre2_jit_neon_inc.h is used as a template: it is included once per
selector macro (FFCS, FFCS_2, FFCS_MASK), and once more with FF_UTF also
defined when UTF support with 8 or 16 bit code units is compiled in.
Presumably each inclusion defines one of the helpers whose addresses are
taken in fast_forward_char_simd() below (ffcs, ffcs_utf, ffcs_2, ffcs_2_utf,
ffcs_mask, ffcs_mask_utf) - confirm in the included file. */
835#define FFCS
836#include "pcre2_jit_neon_inc.h"
837#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
838# define FF_UTF
839# include "pcre2_jit_neon_inc.h"
840# undef FF_UTF
841#endif
842#undef FFCS
843
844#define FFCS_2
845#include "pcre2_jit_neon_inc.h"
846#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
847# define FF_UTF
848# include "pcre2_jit_neon_inc.h"
849# undef FF_UTF
850#endif
851#undef FFCS_2
852
853#define FFCS_MASK
854#include "pcre2_jit_neon_inc.h"
855#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
856# define FF_UTF
857# include "pcre2_jit_neon_inc.h"
858# undef FF_UTF
859#endif
860#undef FFCS_MASK
861
862#define JIT_HAS_FAST_FORWARD_CHAR_SIMD 1
863
864static void fast_forward_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2, sljit_s32 offset)
865{
866DEFINE_COMPILER;
867int_char ic;
868struct sljit_jump *partial_quit;
869/* Save temporary registers. */
870OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS0, STR_PTR, 0);
871OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS1, TMP3, 0);
872
873/* Prepare function arguments */
874OP1(SLJIT_MOV, SLJIT_R0, 0, STR_END, 0);
875OP1(SLJIT_MOV, SLJIT_R1, 0, STR_PTR, 0);
876OP1(SLJIT_MOV, SLJIT_R2, 0, SLJIT_IMM, offset);
877
878if (char1 == char2)
879 {
880 ic.c.c1 = char1;
881 ic.c.c2 = char2;
882 OP1(SLJIT_MOV, SLJIT_R4, 0, SLJIT_IMM, ic.x);
883
884#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
885 if (common->utf && offset > 0)
Elliott Hughes4e19c8e2022-04-15 15:11:02 -0700886 sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
887 SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs_utf));
Elliott Hughes5b808042021-10-01 10:56:10 -0700888 else
Elliott Hughes4e19c8e2022-04-15 15:11:02 -0700889 sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
890 SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs));
Elliott Hughes5b808042021-10-01 10:56:10 -0700891#else
Elliott Hughes4e19c8e2022-04-15 15:11:02 -0700892 sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
893 SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs));
Elliott Hughes5b808042021-10-01 10:56:10 -0700894#endif
895 }
896else
897 {
898 PCRE2_UCHAR mask = char1 ^ char2;
899 if (is_powerof2(mask))
900 {
901 ic.c.c1 = char1 | mask;
902 ic.c.c2 = mask;
903 OP1(SLJIT_MOV, SLJIT_R4, 0, SLJIT_IMM, ic.x);
904
905#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
906 if (common->utf && offset > 0)
Elliott Hughes4e19c8e2022-04-15 15:11:02 -0700907 sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
908 SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs_mask_utf));
Elliott Hughes5b808042021-10-01 10:56:10 -0700909 else
Elliott Hughes4e19c8e2022-04-15 15:11:02 -0700910 sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
911 SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs_mask));
Elliott Hughes5b808042021-10-01 10:56:10 -0700912#else
Elliott Hughes4e19c8e2022-04-15 15:11:02 -0700913 sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
914 SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs_mask));
Elliott Hughes5b808042021-10-01 10:56:10 -0700915#endif
916 }
917 else
918 {
919 ic.c.c1 = char1;
920 ic.c.c2 = char2;
921 OP1(SLJIT_MOV, SLJIT_R4, 0, SLJIT_IMM, ic.x);
922
923#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
924 if (common->utf && offset > 0)
Elliott Hughes4e19c8e2022-04-15 15:11:02 -0700925 sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
926 SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs_2_utf));
Elliott Hughes5b808042021-10-01 10:56:10 -0700927 else
Elliott Hughes4e19c8e2022-04-15 15:11:02 -0700928 sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
929 SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs_2));
Elliott Hughes5b808042021-10-01 10:56:10 -0700930#else
Elliott Hughes4e19c8e2022-04-15 15:11:02 -0700931 sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
932 SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs_2));
Elliott Hughes5b808042021-10-01 10:56:10 -0700933#endif
934 }
935 }
936/* Restore registers. */
937OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(SLJIT_SP), LOCALS0);
938OP1(SLJIT_MOV, TMP3, 0, SLJIT_MEM1(SLJIT_SP), LOCALS1);
939
940/* Check return value. */
941partial_quit = CMP(SLJIT_EQUAL, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0);
942if (common->mode == PCRE2_JIT_COMPLETE)
943 add_jump(compiler, &common->failed_match, partial_quit);
944
945/* Fast forward STR_PTR to the result of memchr. */
946OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_RETURN_REG, 0);
947
948if (common->mode != PCRE2_JIT_COMPLETE)
949 JUMPHERE(partial_quit);
950}
951
/* Strategy used by fast_forward_char_pair_compare() when matching a data
vector against the reference vector(s). */
typedef enum {
  /* Equality test against cmp1 only. */
  compare_match1 = 0,
  /* OR cmp2 into the data first, then test against cmp1. */
  compare_match1i = 1,
  /* Test against cmp1 and cmp2 and OR the two results. */
  compare_match2 = 2
} compare_type;
957
958static inline vect_t fast_forward_char_pair_compare(compare_type ctype, vect_t dst, vect_t cmp1, vect_t cmp2)
959{
960if (ctype == compare_match2)
961 {
962 vect_t tmp = dst;
963 dst = VCEQQ(dst, cmp1);
964 tmp = VCEQQ(tmp, cmp2);
965 dst = VORRQ(dst, tmp);
966 return dst;
967 }
968
969if (ctype == compare_match1i)
970 dst = VORRQ(dst, cmp2);
971dst = VCEQQ(dst, cmp1);
972return dst;
973}
974
975static SLJIT_INLINE sljit_u32 max_fast_forward_char_pair_offset(void)
976{
977#if PCRE2_CODE_UNIT_WIDTH == 8
978return 15;
979#elif PCRE2_CODE_UNIT_WIDTH == 16
980return 7;
981#elif PCRE2_CODE_UNIT_WIDTH == 32
982return 3;
983#else
984#error "Unsupported unit width"
985#endif
986}
987
988/* ARM doesn't have a shift left across lanes. */
989static SLJIT_INLINE vect_t shift_left_n_lanes(vect_t a, sljit_u8 n)
990{
991vect_t zero = VDUPQ(0);
992SLJIT_ASSERT(0 < n && n < VECTOR_FACTOR);
993/* VEXTQ takes an immediate as last argument. */
994#define C(X) case X: return VEXTQ(zero, a, VECTOR_FACTOR - X);
995switch (n)
996 {
997 C(1); C(2); C(3);
998#if PCRE2_CODE_UNIT_WIDTH != 32
999 C(4); C(5); C(6); C(7);
1000# if PCRE2_CODE_UNIT_WIDTH != 16
1001 C(8); C(9); C(10); C(11); C(12); C(13); C(14); C(15);
1002# endif
1003#endif
1004 default:
1005 /* Based on the ASSERT(0 < n && n < VECTOR_FACTOR) above, this won't
1006 happen. The return is still here for compilers to not warn. */
1007 return a;
1008 }
1009}
1010
1011#define FFCPS
1012#define FFCPS_DIFF1
1013#define FFCPS_CHAR1A2A
1014
1015#define FFCPS_0
1016#include "pcre2_jit_neon_inc.h"
1017#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1018# define FF_UTF
1019# include "pcre2_jit_neon_inc.h"
1020# undef FF_UTF
1021#endif
1022#undef FFCPS_0
1023
1024#undef FFCPS_CHAR1A2A
1025
1026#define FFCPS_1
1027#include "pcre2_jit_neon_inc.h"
1028#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1029# define FF_UTF
1030# include "pcre2_jit_neon_inc.h"
1031# undef FF_UTF
1032#endif
1033#undef FFCPS_1
1034
1035#undef FFCPS_DIFF1
1036
1037#define FFCPS_DEFAULT
1038#include "pcre2_jit_neon_inc.h"
1039#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1040# define FF_UTF
1041# include "pcre2_jit_neon_inc.h"
1042# undef FF_UTF
1043#endif
1044#undef FFCPS
1045
1046#define JIT_HAS_FAST_FORWARD_CHAR_PAIR_SIMD 1
1047
1048static void fast_forward_char_pair_simd(compiler_common *common, sljit_s32 offs1,
1049 PCRE2_UCHAR char1a, PCRE2_UCHAR char1b, sljit_s32 offs2, PCRE2_UCHAR char2a, PCRE2_UCHAR char2b)
1050{
1051DEFINE_COMPILER;
1052sljit_u32 diff = IN_UCHARS(offs1 - offs2);
1053struct sljit_jump *partial_quit;
1054int_char ic;
1055SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE && offs1 > offs2);
1056SLJIT_ASSERT(diff <= IN_UCHARS(max_fast_forward_char_pair_offset()));
1057SLJIT_ASSERT(compiler->scratches == 5);
1058
1059/* Save temporary register STR_PTR. */
1060OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS0, STR_PTR, 0);
1061
1062/* Prepare arguments for the function call. */
1063if (common->match_end_ptr == 0)
1064 OP1(SLJIT_MOV, SLJIT_R0, 0, STR_END, 0);
1065else
1066 {
1067 OP1(SLJIT_MOV, SLJIT_R0, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr);
1068 OP2(SLJIT_ADD, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_IMM, IN_UCHARS(offs1 + 1));
1069
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07001070 OP2U(SLJIT_SUB | SLJIT_SET_LESS, STR_END, 0, SLJIT_R0, 0);
Elliott Hughes5b808042021-10-01 10:56:10 -07001071 CMOV(SLJIT_LESS, SLJIT_R0, STR_END, 0);
1072 }
1073
1074OP1(SLJIT_MOV, SLJIT_R1, 0, STR_PTR, 0);
1075OP1(SLJIT_MOV_S32, SLJIT_R2, 0, SLJIT_IMM, offs1);
1076OP1(SLJIT_MOV_S32, SLJIT_R3, 0, SLJIT_IMM, offs2);
1077ic.c.c1 = char1a;
1078ic.c.c2 = char1b;
1079ic.c.c3 = char2a;
1080ic.c.c4 = char2b;
1081OP1(SLJIT_MOV_U32, SLJIT_R4, 0, SLJIT_IMM, ic.x);
1082
1083if (diff == 1) {
1084 if (char1a == char1b && char2a == char2b) {
1085#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1086 if (common->utf)
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07001087 sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
1088 SLJIT_IMM, SLJIT_FUNC_ADDR(ffcps_0_utf));
Elliott Hughes5b808042021-10-01 10:56:10 -07001089 else
1090#endif
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07001091 sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
1092 SLJIT_IMM, SLJIT_FUNC_ADDR(ffcps_0));
Elliott Hughes5b808042021-10-01 10:56:10 -07001093 } else {
1094#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1095 if (common->utf)
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07001096 sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
1097 SLJIT_IMM, SLJIT_FUNC_ADDR(ffcps_1_utf));
Elliott Hughes5b808042021-10-01 10:56:10 -07001098 else
1099#endif
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07001100 sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
1101 SLJIT_IMM, SLJIT_FUNC_ADDR(ffcps_1));
Elliott Hughes5b808042021-10-01 10:56:10 -07001102 }
1103} else {
1104#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1105 if (common->utf)
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07001106 sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
1107 SLJIT_IMM, SLJIT_FUNC_ADDR(ffcps_default_utf));
Elliott Hughes5b808042021-10-01 10:56:10 -07001108 else
1109#endif
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07001110 sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
1111 SLJIT_IMM, SLJIT_FUNC_ADDR(ffcps_default));
Elliott Hughes5b808042021-10-01 10:56:10 -07001112}
1113
1114/* Restore STR_PTR register. */
1115OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(SLJIT_SP), LOCALS0);
1116
1117/* Check return value. */
1118partial_quit = CMP(SLJIT_EQUAL, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0);
1119add_jump(compiler, &common->failed_match, partial_quit);
1120
1121/* Fast forward STR_PTR to the result of memchr. */
1122OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_RETURN_REG, 0);
1123
1124JUMPHERE(partial_quit);
1125}
1126
1127#endif /* SLJIT_CONFIG_ARM_64 && SLJIT_CONFIG_ARM_64 */
1128
1129#if (defined SLJIT_CONFIG_S390X && SLJIT_CONFIG_S390X)
1130
1131#if PCRE2_CODE_UNIT_WIDTH == 8
1132#define VECTOR_ELEMENT_SIZE 0
1133#elif PCRE2_CODE_UNIT_WIDTH == 16
1134#define VECTOR_ELEMENT_SIZE 1
1135#elif PCRE2_CODE_UNIT_WIDTH == 32
1136#define VECTOR_ELEMENT_SIZE 2
1137#else
1138#error "Unsupported unit width"
1139#endif
1140
1141static void load_from_mem_vector(struct sljit_compiler *compiler, BOOL vlbb, sljit_s32 dst_vreg,
1142 sljit_s32 base_reg, sljit_s32 index_reg)
1143{
1144sljit_u16 instruction[3];
1145
1146instruction[0] = (sljit_u16)(0xe700 | (dst_vreg << 4) | index_reg);
1147instruction[1] = (sljit_u16)(base_reg << 12);
1148instruction[2] = (sljit_u16)((0x8 << 8) | (vlbb ? 0x07 : 0x06));
1149
1150sljit_emit_op_custom(compiler, instruction, 6);
1151}
1152
#if PCRE2_CODE_UNIT_WIDTH == 32

/* Fills every element of a vector register with chr. The work is split
into two steps so that callers can issue step 0 for all of their vectors
before step 1.

  compiler         the sljit compiler that receives the instructions
  step             0 or 1
  dst_vreg         index of the vector register to fill
  chr              the value to replicate
  tmp_general_reg  sljit register overwritten with chr when chr does not
                   take the immediate path

Values below 0x7fff are replicated by a single VREPI in step 0, and step 1
emits nothing. Other values are moved into tmp_general_reg and inserted
with VLVG in step 0, then replicated with VREP in step 1. */
static void replicate_imm_vector(struct sljit_compiler *compiler, int step, sljit_s32 dst_vreg,
  PCRE2_UCHAR chr, sljit_s32 tmp_general_reg)
{
sljit_u16 instruction[3];

SLJIT_ASSERT(step >= 0 && step <= 1);

if (chr < 0x7fff)
  {
  if (step == 1)
    return;

  /* VREPI */
  instruction[0] = (sljit_u16)(0xe700 | (dst_vreg << 4));
  instruction[1] = (sljit_u16)chr;
  instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45);
  sljit_emit_op_custom(compiler, instruction, 6);
  return;
  }

if (step == 0)
  {
  OP1(SLJIT_MOV, tmp_general_reg, 0, SLJIT_IMM, chr);

  /* VLVG */
  instruction[0] = (sljit_u16)(0xe700 | (dst_vreg << 4) | sljit_get_register_index(tmp_general_reg));
  instruction[1] = 0;
  instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x22);
  sljit_emit_op_custom(compiler, instruction, 6);
  return;
  }

/* VREP */
instruction[0] = (sljit_u16)(0xe700 | (dst_vreg << 4) | dst_vreg);
instruction[1] = 0;
instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xc << 8) | 0x4d);
sljit_emit_op_custom(compiler, instruction, 6);
}

#endif /* PCRE2_CODE_UNIT_WIDTH == 32 */
1195
1196static void fast_forward_char_pair_sse2_compare(struct sljit_compiler *compiler, vector_compare_type compare_type,
1197 int step, sljit_s32 dst_ind, sljit_s32 cmp1_ind, sljit_s32 cmp2_ind, sljit_s32 tmp_ind)
1198{
1199sljit_u16 instruction[3];
1200
1201SLJIT_ASSERT(step >= 0 && step <= 2);
1202
1203if (step == 1)
1204 {
1205 /* VCEQ */
1206 instruction[0] = (sljit_u16)(0xe700 | (dst_ind << 4) | dst_ind);
1207 instruction[1] = (sljit_u16)(cmp1_ind << 12);
1208 instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0xf8);
1209 sljit_emit_op_custom(compiler, instruction, 6);
1210 return;
1211 }
1212
1213if (compare_type != vector_compare_match2)
1214 {
1215 if (step == 0 && compare_type == vector_compare_match1i)
1216 {
1217 /* VO */
1218 instruction[0] = (sljit_u16)(0xe700 | (dst_ind << 4) | dst_ind);
1219 instruction[1] = (sljit_u16)(cmp2_ind << 12);
1220 instruction[2] = (sljit_u16)((0xe << 8) | 0x6a);
1221 sljit_emit_op_custom(compiler, instruction, 6);
1222 }
1223 return;
1224 }
1225
1226switch (step)
1227 {
1228 case 0:
1229 /* VCEQ */
1230 instruction[0] = (sljit_u16)(0xe700 | (tmp_ind << 4) | dst_ind);
1231 instruction[1] = (sljit_u16)(cmp2_ind << 12);
1232 instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0xf8);
1233 sljit_emit_op_custom(compiler, instruction, 6);
1234 return;
1235
1236 case 2:
1237 /* VO */
1238 instruction[0] = (sljit_u16)(0xe700 | (dst_ind << 4) | dst_ind);
1239 instruction[1] = (sljit_u16)(tmp_ind << 12);
1240 instruction[2] = (sljit_u16)((0xe << 8) | 0x6a);
1241 sljit_emit_op_custom(compiler, instruction, 6);
1242 return;
1243 }
1244}
1245
1246#define JIT_HAS_FAST_FORWARD_CHAR_SIMD 1
1247
1248static void fast_forward_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2, sljit_s32 offset)
1249{
1250DEFINE_COMPILER;
1251sljit_u16 instruction[3];
1252struct sljit_label *start;
1253#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1254struct sljit_label *restart;
1255#endif
1256struct sljit_jump *quit;
1257struct sljit_jump *partial_quit[2];
1258vector_compare_type compare_type = vector_compare_match1;
1259sljit_s32 tmp1_reg_ind = sljit_get_register_index(TMP1);
1260sljit_s32 str_ptr_reg_ind = sljit_get_register_index(STR_PTR);
1261sljit_s32 data_ind = 0;
1262sljit_s32 tmp_ind = 1;
1263sljit_s32 cmp1_ind = 2;
1264sljit_s32 cmp2_ind = 3;
1265sljit_s32 zero_ind = 4;
1266sljit_u32 bit = 0;
1267int i;
1268
1269SLJIT_UNUSED_ARG(offset);
1270
1271if (char1 != char2)
1272 {
1273 bit = char1 ^ char2;
1274 compare_type = vector_compare_match1i;
1275
1276 if (!is_powerof2(bit))
1277 {
1278 bit = 0;
1279 compare_type = vector_compare_match2;
1280 }
1281 }
1282
1283partial_quit[0] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
1284if (common->mode == PCRE2_JIT_COMPLETE)
1285 add_jump(compiler, &common->failed_match, partial_quit[0]);
1286
1287/* First part (unaligned start) */
1288
1289OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, 16);
1290
1291#if PCRE2_CODE_UNIT_WIDTH != 32
1292
1293/* VREPI */
1294instruction[0] = (sljit_u16)(0xe700 | (cmp1_ind << 4));
1295instruction[1] = (sljit_u16)(char1 | bit);
1296instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45);
1297sljit_emit_op_custom(compiler, instruction, 6);
1298
1299if (char1 != char2)
1300 {
1301 /* VREPI */
1302 instruction[0] = (sljit_u16)(0xe700 | (cmp2_ind << 4));
1303 instruction[1] = (sljit_u16)(bit != 0 ? bit : char2);
1304 /* instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45); */
1305 sljit_emit_op_custom(compiler, instruction, 6);
1306 }
1307
1308#else /* PCRE2_CODE_UNIT_WIDTH == 32 */
1309
1310for (int i = 0; i < 2; i++)
1311 {
1312 replicate_imm_vector(compiler, i, cmp1_ind, char1 | bit, TMP1);
1313
1314 if (char1 != char2)
1315 replicate_imm_vector(compiler, i, cmp2_ind, bit != 0 ? bit : char2, TMP1);
1316 }
1317
1318#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
1319
1320if (compare_type == vector_compare_match2)
1321 {
1322 /* VREPI */
1323 instruction[0] = (sljit_u16)(0xe700 | (zero_ind << 4));
1324 instruction[1] = 0;
1325 instruction[2] = (sljit_u16)((0x8 << 8) | 0x45);
1326 sljit_emit_op_custom(compiler, instruction, 6);
1327 }
1328
1329#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1330restart = LABEL();
1331#endif
1332
1333load_from_mem_vector(compiler, TRUE, data_ind, str_ptr_reg_ind, 0);
1334OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, ~15);
1335
1336if (compare_type != vector_compare_match2)
1337 {
1338 if (compare_type == vector_compare_match1i)
1339 fast_forward_char_pair_sse2_compare(compiler, compare_type, 0, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
1340
1341 /* VFEE */
1342 instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
1343 instruction[1] = (sljit_u16)((cmp1_ind << 12) | (1 << 4));
1344 instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0x80);
1345 sljit_emit_op_custom(compiler, instruction, 6);
1346 }
1347else
1348 {
1349 for (i = 0; i < 3; i++)
1350 fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
1351
1352 /* VFENE */
1353 instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
1354 instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));
1355 instruction[2] = (sljit_u16)((0xe << 8) | 0x81);
1356 sljit_emit_op_custom(compiler, instruction, 6);
1357 }
1358
1359/* VLGVB */
1360instruction[0] = (sljit_u16)(0xe700 | (tmp1_reg_ind << 4) | data_ind);
1361instruction[1] = 7;
1362instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);
1363sljit_emit_op_custom(compiler, instruction, 6);
1364
1365OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
1366quit = CMP(SLJIT_LESS, STR_PTR, 0, TMP2, 0);
1367
1368OP2(SLJIT_SUB, STR_PTR, 0, TMP2, 0, SLJIT_IMM, 16);
1369
1370/* Second part (aligned) */
1371start = LABEL();
1372
1373OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);
1374
1375partial_quit[1] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
1376if (common->mode == PCRE2_JIT_COMPLETE)
1377 add_jump(compiler, &common->failed_match, partial_quit[1]);
1378
1379load_from_mem_vector(compiler, TRUE, data_ind, str_ptr_reg_ind, 0);
1380
1381if (compare_type != vector_compare_match2)
1382 {
1383 if (compare_type == vector_compare_match1i)
1384 fast_forward_char_pair_sse2_compare(compiler, compare_type, 0, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
1385
1386 /* VFEE */
1387 instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
1388 instruction[1] = (sljit_u16)((cmp1_ind << 12) | (1 << 4));
1389 instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0x80);
1390 sljit_emit_op_custom(compiler, instruction, 6);
1391 }
1392else
1393 {
1394 for (i = 0; i < 3; i++)
1395 fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
1396
1397 /* VFENE */
1398 instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
1399 instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));
1400 instruction[2] = (sljit_u16)((0xe << 8) | 0x81);
1401 sljit_emit_op_custom(compiler, instruction, 6);
1402 }
1403
1404sljit_set_current_flags(compiler, SLJIT_SET_OVERFLOW);
1405JUMPTO(SLJIT_OVERFLOW, start);
1406
1407/* VLGVB */
1408instruction[0] = (sljit_u16)(0xe700 | (tmp1_reg_ind << 4) | data_ind);
1409instruction[1] = 7;
1410instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);
1411sljit_emit_op_custom(compiler, instruction, 6);
1412
1413OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
1414
1415JUMPHERE(quit);
1416
1417if (common->mode != PCRE2_JIT_COMPLETE)
1418 {
1419 JUMPHERE(partial_quit[0]);
1420 JUMPHERE(partial_quit[1]);
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07001421 OP2U(SLJIT_SUB | SLJIT_SET_GREATER, STR_PTR, 0, STR_END, 0);
Elliott Hughes5b808042021-10-01 10:56:10 -07001422 CMOV(SLJIT_GREATER, STR_PTR, STR_END, 0);
1423 }
1424else
1425 add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
1426
1427#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1428if (common->utf && offset > 0)
1429 {
1430 SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE);
1431
1432 OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offset));
1433
1434 quit = jump_if_utf_char_start(compiler, TMP1);
1435
1436 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
1437 add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
1438
1439 OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, 16);
1440 JUMPTO(SLJIT_JUMP, restart);
1441
1442 JUMPHERE(quit);
1443 }
1444#endif
1445}
1446
1447#define JIT_HAS_FAST_REQUESTED_CHAR_SIMD 1
1448
1449static jump_list *fast_requested_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2)
1450{
1451DEFINE_COMPILER;
1452sljit_u16 instruction[3];
1453struct sljit_label *start;
1454struct sljit_jump *quit;
1455jump_list *not_found = NULL;
1456vector_compare_type compare_type = vector_compare_match1;
1457sljit_s32 tmp1_reg_ind = sljit_get_register_index(TMP1);
1458sljit_s32 tmp3_reg_ind = sljit_get_register_index(TMP3);
1459sljit_s32 data_ind = 0;
1460sljit_s32 tmp_ind = 1;
1461sljit_s32 cmp1_ind = 2;
1462sljit_s32 cmp2_ind = 3;
1463sljit_s32 zero_ind = 4;
1464sljit_u32 bit = 0;
1465int i;
1466
1467if (char1 != char2)
1468 {
1469 bit = char1 ^ char2;
1470 compare_type = vector_compare_match1i;
1471
1472 if (!is_powerof2(bit))
1473 {
1474 bit = 0;
1475 compare_type = vector_compare_match2;
1476 }
1477 }
1478
1479add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));
1480
1481/* First part (unaligned start) */
1482
1483OP2(SLJIT_ADD, TMP2, 0, TMP1, 0, SLJIT_IMM, 16);
1484
1485#if PCRE2_CODE_UNIT_WIDTH != 32
1486
1487/* VREPI */
1488instruction[0] = (sljit_u16)(0xe700 | (cmp1_ind << 4));
1489instruction[1] = (sljit_u16)(char1 | bit);
1490instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45);
1491sljit_emit_op_custom(compiler, instruction, 6);
1492
1493if (char1 != char2)
1494 {
1495 /* VREPI */
1496 instruction[0] = (sljit_u16)(0xe700 | (cmp2_ind << 4));
1497 instruction[1] = (sljit_u16)(bit != 0 ? bit : char2);
1498 /* instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45); */
1499 sljit_emit_op_custom(compiler, instruction, 6);
1500 }
1501
1502#else /* PCRE2_CODE_UNIT_WIDTH == 32 */
1503
1504for (int i = 0; i < 2; i++)
1505 {
1506 replicate_imm_vector(compiler, i, cmp1_ind, char1 | bit, TMP3);
1507
1508 if (char1 != char2)
1509 replicate_imm_vector(compiler, i, cmp2_ind, bit != 0 ? bit : char2, TMP3);
1510 }
1511
1512#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
1513
1514if (compare_type == vector_compare_match2)
1515 {
1516 /* VREPI */
1517 instruction[0] = (sljit_u16)(0xe700 | (zero_ind << 4));
1518 instruction[1] = 0;
1519 instruction[2] = (sljit_u16)((0x8 << 8) | 0x45);
1520 sljit_emit_op_custom(compiler, instruction, 6);
1521 }
1522
1523load_from_mem_vector(compiler, TRUE, data_ind, tmp1_reg_ind, 0);
1524OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, ~15);
1525
1526if (compare_type != vector_compare_match2)
1527 {
1528 if (compare_type == vector_compare_match1i)
1529 fast_forward_char_pair_sse2_compare(compiler, compare_type, 0, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
1530
1531 /* VFEE */
1532 instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
1533 instruction[1] = (sljit_u16)((cmp1_ind << 12) | (1 << 4));
1534 instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0x80);
1535 sljit_emit_op_custom(compiler, instruction, 6);
1536 }
1537else
1538 {
1539 for (i = 0; i < 3; i++)
1540 fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
1541
1542 /* VFENE */
1543 instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
1544 instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));
1545 instruction[2] = (sljit_u16)((0xe << 8) | 0x81);
1546 sljit_emit_op_custom(compiler, instruction, 6);
1547 }
1548
1549/* VLGVB */
1550instruction[0] = (sljit_u16)(0xe700 | (tmp3_reg_ind << 4) | data_ind);
1551instruction[1] = 7;
1552instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);
1553sljit_emit_op_custom(compiler, instruction, 6);
1554
1555OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP3, 0);
1556quit = CMP(SLJIT_LESS, TMP1, 0, TMP2, 0);
1557
1558OP2(SLJIT_SUB, TMP1, 0, TMP2, 0, SLJIT_IMM, 16);
1559
1560/* Second part (aligned) */
1561start = LABEL();
1562
1563OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 16);
1564
1565add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));
1566
1567load_from_mem_vector(compiler, TRUE, data_ind, tmp1_reg_ind, 0);
1568
1569if (compare_type != vector_compare_match2)
1570 {
1571 if (compare_type == vector_compare_match1i)
1572 fast_forward_char_pair_sse2_compare(compiler, compare_type, 0, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
1573
1574 /* VFEE */
1575 instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
1576 instruction[1] = (sljit_u16)((cmp1_ind << 12) | (1 << 4));
1577 instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0x80);
1578 sljit_emit_op_custom(compiler, instruction, 6);
1579 }
1580else
1581 {
1582 for (i = 0; i < 3; i++)
1583 fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
1584
1585 /* VFENE */
1586 instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
1587 instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));
1588 instruction[2] = (sljit_u16)((0xe << 8) | 0x81);
1589 sljit_emit_op_custom(compiler, instruction, 6);
1590 }
1591
1592sljit_set_current_flags(compiler, SLJIT_SET_OVERFLOW);
1593JUMPTO(SLJIT_OVERFLOW, start);
1594
1595/* VLGVB */
1596instruction[0] = (sljit_u16)(0xe700 | (tmp3_reg_ind << 4) | data_ind);
1597instruction[1] = 7;
1598instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);
1599sljit_emit_op_custom(compiler, instruction, 6);
1600
1601OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP3, 0);
1602
1603JUMPHERE(quit);
1604add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));
1605
1606return not_found;
1607}
1608
1609#define JIT_HAS_FAST_FORWARD_CHAR_PAIR_SIMD 1
1610
1611static void fast_forward_char_pair_simd(compiler_common *common, sljit_s32 offs1,
1612 PCRE2_UCHAR char1a, PCRE2_UCHAR char1b, sljit_s32 offs2, PCRE2_UCHAR char2a, PCRE2_UCHAR char2b)
1613{
1614DEFINE_COMPILER;
1615sljit_u16 instruction[3];
1616struct sljit_label *start;
1617#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1618struct sljit_label *restart;
1619#endif
1620struct sljit_jump *quit;
1621struct sljit_jump *jump[2];
1622vector_compare_type compare1_type = vector_compare_match1;
1623vector_compare_type compare2_type = vector_compare_match1;
1624sljit_u32 bit1 = 0;
1625sljit_u32 bit2 = 0;
1626sljit_s32 diff = IN_UCHARS(offs2 - offs1);
1627sljit_s32 tmp1_reg_ind = sljit_get_register_index(TMP1);
1628sljit_s32 tmp2_reg_ind = sljit_get_register_index(TMP2);
1629sljit_s32 str_ptr_reg_ind = sljit_get_register_index(STR_PTR);
1630sljit_s32 data1_ind = 0;
1631sljit_s32 data2_ind = 1;
1632sljit_s32 tmp1_ind = 2;
1633sljit_s32 tmp2_ind = 3;
1634sljit_s32 cmp1a_ind = 4;
1635sljit_s32 cmp1b_ind = 5;
1636sljit_s32 cmp2a_ind = 6;
1637sljit_s32 cmp2b_ind = 7;
1638sljit_s32 zero_ind = 8;
1639int i;
1640
1641SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE && offs1 > offs2);
1642SLJIT_ASSERT(-diff <= (sljit_s32)IN_UCHARS(max_fast_forward_char_pair_offset()));
1643SLJIT_ASSERT(tmp1_reg_ind != 0 && tmp2_reg_ind != 0);
1644
1645if (char1a != char1b)
1646 {
1647 bit1 = char1a ^ char1b;
1648 compare1_type = vector_compare_match1i;
1649
1650 if (!is_powerof2(bit1))
1651 {
1652 bit1 = 0;
1653 compare1_type = vector_compare_match2;
1654 }
1655 }
1656
1657if (char2a != char2b)
1658 {
1659 bit2 = char2a ^ char2b;
1660 compare2_type = vector_compare_match1i;
1661
1662 if (!is_powerof2(bit2))
1663 {
1664 bit2 = 0;
1665 compare2_type = vector_compare_match2;
1666 }
1667 }
1668
1669/* Initialize. */
1670if (common->match_end_ptr != 0)
1671 {
1672 OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr);
1673 OP1(SLJIT_MOV, TMP3, 0, STR_END, 0);
1674 OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(offs1 + 1));
1675
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07001676 OP2U(SLJIT_SUB | SLJIT_SET_LESS, TMP1, 0, STR_END, 0);
Elliott Hughes5b808042021-10-01 10:56:10 -07001677 CMOV(SLJIT_LESS, STR_END, TMP1, 0);
1678 }
1679
1680OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));
1681add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
1682OP2(SLJIT_AND, TMP2, 0, STR_PTR, 0, SLJIT_IMM, ~15);
1683
1684#if PCRE2_CODE_UNIT_WIDTH != 32
1685
1686OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, -diff);
1687
1688/* VREPI */
1689instruction[0] = (sljit_u16)(0xe700 | (cmp1a_ind << 4));
1690instruction[1] = (sljit_u16)(char1a | bit1);
1691instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45);
1692sljit_emit_op_custom(compiler, instruction, 6);
1693
1694if (char1a != char1b)
1695 {
1696 /* VREPI */
1697 instruction[0] = (sljit_u16)(0xe700 | (cmp1b_ind << 4));
1698 instruction[1] = (sljit_u16)(bit1 != 0 ? bit1 : char1b);
1699 /* instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45); */
1700 sljit_emit_op_custom(compiler, instruction, 6);
1701 }
1702
1703/* VREPI */
1704instruction[0] = (sljit_u16)(0xe700 | (cmp2a_ind << 4));
1705instruction[1] = (sljit_u16)(char2a | bit2);
1706/* instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45); */
1707sljit_emit_op_custom(compiler, instruction, 6);
1708
1709if (char2a != char2b)
1710 {
1711 /* VREPI */
1712 instruction[0] = (sljit_u16)(0xe700 | (cmp2b_ind << 4));
1713 instruction[1] = (sljit_u16)(bit2 != 0 ? bit2 : char2b);
1714 /* instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45); */
1715 sljit_emit_op_custom(compiler, instruction, 6);
1716 }
1717
1718#else /* PCRE2_CODE_UNIT_WIDTH == 32 */
1719
1720for (int i = 0; i < 2; i++)
1721 {
1722 replicate_imm_vector(compiler, i, cmp1a_ind, char1a | bit1, TMP1);
1723
1724 if (char1a != char1b)
1725 replicate_imm_vector(compiler, i, cmp1b_ind, bit1 != 0 ? bit1 : char1b, TMP1);
1726
1727 replicate_imm_vector(compiler, i, cmp2a_ind, char2a | bit2, TMP1);
1728
1729 if (char2a != char2b)
1730 replicate_imm_vector(compiler, i, cmp2b_ind, bit2 != 0 ? bit2 : char2b, TMP1);
1731 }
1732
1733OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, -diff);
1734
1735#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
1736
1737/* VREPI */
1738instruction[0] = (sljit_u16)(0xe700 | (zero_ind << 4));
1739instruction[1] = 0;
1740instruction[2] = (sljit_u16)((0x8 << 8) | 0x45);
1741sljit_emit_op_custom(compiler, instruction, 6);
1742
1743#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1744restart = LABEL();
1745#endif
1746
1747jump[0] = CMP(SLJIT_LESS, TMP1, 0, TMP2, 0);
1748load_from_mem_vector(compiler, TRUE, data2_ind, tmp1_reg_ind, 0);
1749jump[1] = JUMP(SLJIT_JUMP);
1750JUMPHERE(jump[0]);
1751load_from_mem_vector(compiler, FALSE, data2_ind, tmp1_reg_ind, 0);
1752JUMPHERE(jump[1]);
1753
1754load_from_mem_vector(compiler, TRUE, data1_ind, str_ptr_reg_ind, 0);
1755OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, 16);
1756
1757for (i = 0; i < 3; i++)
1758 {
1759 fast_forward_char_pair_sse2_compare(compiler, compare1_type, i, data1_ind, cmp1a_ind, cmp1b_ind, tmp1_ind);
1760 fast_forward_char_pair_sse2_compare(compiler, compare2_type, i, data2_ind, cmp2a_ind, cmp2b_ind, tmp2_ind);
1761 }
1762
1763/* VN */
1764instruction[0] = (sljit_u16)(0xe700 | (data1_ind << 4) | data1_ind);
1765instruction[1] = (sljit_u16)(data2_ind << 12);
1766instruction[2] = (sljit_u16)((0xe << 8) | 0x68);
1767sljit_emit_op_custom(compiler, instruction, 6);
1768
1769/* VFENE */
1770instruction[0] = (sljit_u16)(0xe700 | (data1_ind << 4) | data1_ind);
1771instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));
1772instruction[2] = (sljit_u16)((0xe << 8) | 0x81);
1773sljit_emit_op_custom(compiler, instruction, 6);
1774
1775/* VLGVB */
1776instruction[0] = (sljit_u16)(0xe700 | (tmp1_reg_ind << 4) | data1_ind);
1777instruction[1] = 7;
1778instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);
1779sljit_emit_op_custom(compiler, instruction, 6);
1780
1781OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
1782quit = CMP(SLJIT_LESS, STR_PTR, 0, TMP2, 0);
1783
1784OP2(SLJIT_SUB, STR_PTR, 0, TMP2, 0, SLJIT_IMM, 16);
1785OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, diff);
1786
1787/* Main loop. */
1788start = LABEL();
1789
1790OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);
1791add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
1792
1793load_from_mem_vector(compiler, FALSE, data1_ind, str_ptr_reg_ind, 0);
1794load_from_mem_vector(compiler, FALSE, data2_ind, str_ptr_reg_ind, tmp1_reg_ind);
1795
1796for (i = 0; i < 3; i++)
1797 {
1798 fast_forward_char_pair_sse2_compare(compiler, compare1_type, i, data1_ind, cmp1a_ind, cmp1b_ind, tmp1_ind);
1799 fast_forward_char_pair_sse2_compare(compiler, compare2_type, i, data2_ind, cmp2a_ind, cmp2b_ind, tmp2_ind);
1800 }
1801
1802/* VN */
1803instruction[0] = (sljit_u16)(0xe700 | (data1_ind << 4) | data1_ind);
1804instruction[1] = (sljit_u16)(data2_ind << 12);
1805instruction[2] = (sljit_u16)((0xe << 8) | 0x68);
1806sljit_emit_op_custom(compiler, instruction, 6);
1807
1808/* VFENE */
1809instruction[0] = (sljit_u16)(0xe700 | (data1_ind << 4) | data1_ind);
1810instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));
1811instruction[2] = (sljit_u16)((0xe << 8) | 0x81);
1812sljit_emit_op_custom(compiler, instruction, 6);
1813
1814sljit_set_current_flags(compiler, SLJIT_SET_OVERFLOW);
1815JUMPTO(SLJIT_OVERFLOW, start);
1816
1817/* VLGVB */
1818instruction[0] = (sljit_u16)(0xe700 | (tmp2_reg_ind << 4) | data1_ind);
1819instruction[1] = 7;
1820instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);
1821sljit_emit_op_custom(compiler, instruction, 6);
1822
1823OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
1824
1825JUMPHERE(quit);
1826
1827add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
1828
1829#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1830if (common->utf)
1831 {
1832 SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE);
1833
1834 OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offs1));
1835
1836 quit = jump_if_utf_char_start(compiler, TMP1);
1837
1838 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
1839 add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
1840
1841 /* TMP1 contains diff. */
1842 OP2(SLJIT_AND, TMP2, 0, STR_PTR, 0, SLJIT_IMM, ~15);
1843 OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, -diff);
1844 JUMPTO(SLJIT_JUMP, restart);
1845
1846 JUMPHERE(quit);
1847 }
1848#endif
1849
1850OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));
1851
1852if (common->match_end_ptr != 0)
1853 OP1(SLJIT_MOV, STR_END, 0, TMP3, 0);
1854}
1855
1856#endif /* SLJIT_CONFIG_S390X */
1857
1858#endif /* !SUPPORT_VALGRIND */