blob: 419fd4900188fd926e63b3d0eba1f89b33b59c34 [file] [log] [blame]
Janis Danisevskis53e448c2016-03-31 13:35:25 +01001/*************************************************
2* Perl-Compatible Regular Expressions *
3*************************************************/
4
5/* PCRE is a library of functions to support regular expressions whose syntax
6and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Original API code Copyright (c) 1997-2012 University of Cambridge
Elliott Hughes4e19c8e2022-04-15 15:11:02 -070010 New API code Copyright (c) 2016-2022 University of Cambridge
Janis Danisevskis53e448c2016-03-31 13:35:25 +010011
12-----------------------------------------------------------------------------
13Redistribution and use in source and binary forms, with or without
14modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37POSSIBILITY OF SUCH DAMAGE.
38-----------------------------------------------------------------------------
39*/
40
41/* This module contains functions that scan a compiled pattern and change
42repeats into possessive repeats where possible. */
43
44
45#ifdef HAVE_CONFIG_H
46#include "config.h"
47#endif
48
49
50#include "pcre2_internal.h"
51
52
53/*************************************************
54* Tables for auto-possessification *
55*************************************************/
56
57/* This table is used to check whether auto-possessification is possible
58between adjacent character-type opcodes. The left-hand (repeated) opcode is
59used to select the row, and the right-hand opcode is use to select the column.
60A value of 1 means that auto-possessification is OK. For example, the second
61value in the first row means that \D+\d can be turned into \D++\d.
62
63The Unicode property types (\P and \p) have to be present to fill out the table
64because of what their opcode values are, but the table values should always be
65zero because property types are handled separately in the code. The last four
66columns apply to items that cannot be repeated, so there is no need to have
67rows for them. Note that OP_DIGIT etc. are generated only when PCRE_UCP is
68*not* set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
69
70#define APTROWS (LAST_AUTOTAB_LEFT_OP - FIRST_AUTOTAB_OP + 1)
71#define APTCOLS (LAST_AUTOTAB_RIGHT_OP - FIRST_AUTOTAB_OP + 1)
72
73static const uint8_t autoposstab[APTROWS][APTCOLS] = {
74/* \D \d \S \s \W \w . .+ \C \P \p \R \H \h \V \v \X \Z \z $ $M */
75 { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \D */
76 { 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \d */
77 { 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \S */
78 { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \s */
79 { 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \W */
80 { 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \w */
81 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* . */
82 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* .+ */
83 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \C */
84 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* \P */
85 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* \p */
86 { 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 }, /* \R */
87 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 }, /* \H */
88 { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0 }, /* \h */
89 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0 }, /* \V */
90 { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0 }, /* \v */
91 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 } /* \X */
92};
93
Janis Danisevskis8b979b22016-08-15 16:09:16 +010094#ifdef SUPPORT_UNICODE
Janis Danisevskis53e448c2016-03-31 13:35:25 +010095/* This table is used to check whether auto-possessification is possible
96between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP). The
97left-hand (repeated) opcode is used to select the row, and the right-hand
98opcode is used to select the column. The values are as follows:
99
100 0 Always return FALSE (never auto-possessify)
101 1 Character groups are distinct (possessify if both are OP_PROP)
102 2 Check character categories in the same group (general or particular)
103 3 TRUE if the two opcodes are not the same (PROP vs NOTPROP)
104
105 4 Check left general category vs right particular category
106 5 Check right general category vs left particular category
107
108 6 Left alphanum vs right general category
109 7 Left space vs right general category
110 8 Left word vs right general category
111
112 9 Right alphanum vs left general category
113 10 Right space vs left general category
114 11 Right word vs left general category
115
116 12 Left alphanum vs right particular category
117 13 Left space vs right particular category
118 14 Left word vs right particular category
119
120 15 Right alphanum vs left particular category
121 16 Right space vs left particular category
122 17 Right word vs left particular category
123*/
124
125static const uint8_t propposstab[PT_TABSIZE][PT_TABSIZE] = {
Elliott Hughes4e19c8e2022-04-15 15:11:02 -0700126/* ANY LAMP GC PC SC SCX ALNUM SPACE PXSPACE WORD CLIST UCNC BIDICL BOOL */
127 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_ANY */
128 { 0, 3, 0, 0, 0, 0, 3, 1, 1, 0, 0, 0, 0, 0 }, /* PT_LAMP */
129 { 0, 0, 2, 4, 0, 0, 9, 10, 10, 11, 0, 0, 0, 0 }, /* PT_GC */
130 { 0, 0, 5, 2, 0, 0, 15, 16, 16, 17, 0, 0, 0, 0 }, /* PT_PC */
131 { 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_SC */
132 { 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_SCX */
133 { 0, 3, 6, 12, 0, 0, 3, 1, 1, 0, 0, 0, 0, 0 }, /* PT_ALNUM */
134 { 0, 1, 7, 13, 0, 0, 1, 3, 3, 1, 0, 0, 0, 0 }, /* PT_SPACE */
135 { 0, 1, 7, 13, 0, 0, 1, 3, 3, 1, 0, 0, 0, 0 }, /* PT_PXSPACE */
136 { 0, 0, 8, 14, 0, 0, 0, 1, 1, 3, 0, 0, 0, 0 }, /* PT_WORD */
137 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_CLIST */
138 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0 }, /* PT_UCNC */
139 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_BIDICL */
140 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } /* PT_BOOL */
Janis Danisevskis53e448c2016-03-31 13:35:25 +0100141};
142
/* Table consulted when two adjacent Unicode property opcodes (OP_PROP and
OP_NOTPROP) are compared and one names a general category while the other
names a particular category. The general category selects the row and the
particular category selects the column. An entry is 1 when the particular
category is NOT a member of the general category, and 0 when it is. The
columns are laid out below in groups, one group per general category. */

static const uint8_t catposstab[7][30] = {
/*  Cc Cf Cn Co Cs | Ll Lm Lo Lt Lu | Mc Me Mn | Nd Nl No |
    Pc Pd Pe Pf Pi Po Ps | Sc Sk Sm So | Zl Zp Zs */

  /* C */
  { 0, 0, 0, 0, 0,   1, 1, 1, 1, 1,   1, 1, 1,   1, 1, 1,
    1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1,   1, 1, 1 },
  /* L */
  { 1, 1, 1, 1, 1,   0, 0, 0, 0, 0,   1, 1, 1,   1, 1, 1,
    1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1,   1, 1, 1 },
  /* M */
  { 1, 1, 1, 1, 1,   1, 1, 1, 1, 1,   0, 0, 0,   1, 1, 1,
    1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1,   1, 1, 1 },
  /* N */
  { 1, 1, 1, 1, 1,   1, 1, 1, 1, 1,   1, 1, 1,   0, 0, 0,
    1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1,   1, 1, 1 },
  /* P */
  { 1, 1, 1, 1, 1,   1, 1, 1, 1, 1,   1, 1, 1,   1, 1, 1,
    0, 0, 0, 0, 0, 0, 0,   1, 1, 1, 1,   1, 1, 1 },
  /* S */
  { 1, 1, 1, 1, 1,   1, 1, 1, 1, 1,   1, 1, 1,   1, 1, 1,
    1, 1, 1, 1, 1, 1, 1,   0, 0, 0, 0,   1, 1, 1 },
  /* Z */
  { 1, 1, 1, 1, 1,   1, 1, 1, 1, 1,   1, 1, 1,   1, 1, 1,
    1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1,   0, 0, 0 }
};
160
161/* This table is used when checking ALNUM, (PX)SPACE, SPACE, and WORD against
162a general or particular category. The properties in each row are those
163that apply to the character set in question. Duplication means that a little
164unnecessary work is done when checking, but this keeps things much simpler
165because they can all use the same code. For more details see the comment where
166this table is used.
167
168Note: SPACE and PXSPACE used to be different because Perl excluded VT from
169"space", but from Perl 5.18 it's included, so both categories are treated the
170same here. */
171
172static const uint8_t posspropstab[3][4] = {
173 { ucp_L, ucp_N, ucp_N, ucp_Nl }, /* ALNUM, 3rd and 4th values redundant */
174 { ucp_Z, ucp_Z, ucp_C, ucp_Cc }, /* SPACE and PXSPACE, 2nd value redundant */
175 { ucp_L, ucp_N, ucp_P, ucp_Po } /* WORD */
176};
Janis Danisevskis8b979b22016-08-15 16:09:16 +0100177#endif /* SUPPORT_UNICODE */
Janis Danisevskis53e448c2016-03-31 13:35:25 +0100178
179
180
181#ifdef SUPPORT_UNICODE
182/*************************************************
183* Check a character and a property *
184*************************************************/
185
186/* This function is called by compare_opcodes() when a property item is
187adjacent to a fixed character.
188
189Arguments:
190 c the character
191 ptype the property type
192 pdata the data for the type
193 negated TRUE if it's a negated property (\P or \p{^)
194
195Returns: TRUE if auto-possessifying is OK
196*/
197
198static BOOL
199check_char_prop(uint32_t c, unsigned int ptype, unsigned int pdata,
200 BOOL negated)
201{
Elliott Hughes4e19c8e2022-04-15 15:11:02 -0700202BOOL ok;
Janis Danisevskis53e448c2016-03-31 13:35:25 +0100203const uint32_t *p;
204const ucd_record *prop = GET_UCD(c);
205
206switch(ptype)
207 {
208 case PT_LAMP:
209 return (prop->chartype == ucp_Lu ||
210 prop->chartype == ucp_Ll ||
211 prop->chartype == ucp_Lt) == negated;
212
213 case PT_GC:
214 return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated;
215
216 case PT_PC:
217 return (pdata == prop->chartype) == negated;
218
219 case PT_SC:
220 return (pdata == prop->script) == negated;
221
Elliott Hughes4e19c8e2022-04-15 15:11:02 -0700222 case PT_SCX:
223 ok = (pdata == prop->script
224 || MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), pdata) != 0);
225 return ok == negated;
226
Janis Danisevskis53e448c2016-03-31 13:35:25 +0100227 /* These are specials */
228
229 case PT_ALNUM:
230 return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
231 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;
232
233 /* Perl space used to exclude VT, but from Perl 5.18 it is included, which
234 means that Perl space and POSIX space are now identical. PCRE was changed
235 at release 8.34. */
236
237 case PT_SPACE: /* Perl space */
238 case PT_PXSPACE: /* POSIX space */
239 switch(c)
240 {
241 HSPACE_CASES:
242 VSPACE_CASES:
243 return negated;
244
245 default:
246 return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == negated;
247 }
248 break; /* Control never reaches here */
249
250 case PT_WORD:
251 return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
252 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
253 c == CHAR_UNDERSCORE) == negated;
254
255 case PT_CLIST:
256 p = PRIV(ucd_caseless_sets) + prop->caseset;
257 for (;;)
258 {
259 if (c < *p) return !negated;
260 if (c == *p++) return negated;
261 }
262 break; /* Control never reaches here */
Elliott Hughes4e19c8e2022-04-15 15:11:02 -0700263
264 /* Haven't yet thought these through. */
265
266 case PT_BIDICL:
267 return FALSE;
268
269 case PT_BOOL:
270 return FALSE;
Janis Danisevskis53e448c2016-03-31 13:35:25 +0100271 }
272
273return FALSE;
274}
275#endif /* SUPPORT_UNICODE */
276
277
278
279/*************************************************
280* Base opcode of repeated opcodes *
281*************************************************/
282
283/* Returns the base opcode for repeated single character type opcodes. If the
284opcode is not a repeated character type, it returns with the original value.
285
286Arguments: c opcode
287Returns: base opcode for the type
288*/
289
290static PCRE2_UCHAR
291get_repeat_base(PCRE2_UCHAR c)
292{
293return (c > OP_TYPEPOSUPTO)? c :
294 (c >= OP_TYPESTAR)? OP_TYPESTAR :
295 (c >= OP_NOTSTARI)? OP_NOTSTARI :
296 (c >= OP_NOTSTAR)? OP_NOTSTAR :
297 (c >= OP_STARI)? OP_STARI :
298 OP_STAR;
299}
300
301
302/*************************************************
303* Fill the character property list *
304*************************************************/
305
306/* Checks whether the code points to an opcode that can take part in auto-
307possessification, and if so, fills a list with its properties.
308
309Arguments:
310 code points to start of expression
311 utf TRUE if in UTF mode
Elliott Hughes2dbd7d22020-06-03 14:32:37 -0700312 ucp TRUE if in UCP mode
Janis Danisevskis53e448c2016-03-31 13:35:25 +0100313 fcc points to the case-flipping table
314 list points to output list
315 list[0] will be filled with the opcode
316 list[1] will be non-zero if this opcode
317 can match an empty character string
318 list[2..7] depends on the opcode
319
320Returns: points to the start of the next opcode if *code is accepted
321 NULL if *code is not accepted
322*/
323
324static PCRE2_SPTR
Elliott Hughes2dbd7d22020-06-03 14:32:37 -0700325get_chr_property_list(PCRE2_SPTR code, BOOL utf, BOOL ucp, const uint8_t *fcc,
Janis Danisevskis53e448c2016-03-31 13:35:25 +0100326 uint32_t *list)
327{
328PCRE2_UCHAR c = *code;
329PCRE2_UCHAR base;
330PCRE2_SPTR end;
331uint32_t chr;
332
333#ifdef SUPPORT_UNICODE
334uint32_t *clist_dest;
335const uint32_t *clist_src;
336#else
Elliott Hughes2dbd7d22020-06-03 14:32:37 -0700337(void)utf; /* Suppress "unused parameter" compiler warnings */
338(void)ucp;
Janis Danisevskis53e448c2016-03-31 13:35:25 +0100339#endif
340
341list[0] = c;
342list[1] = FALSE;
343code++;
344
345if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
346 {
347 base = get_repeat_base(c);
348 c -= (base - OP_STAR);
349
350 if (c == OP_UPTO || c == OP_MINUPTO || c == OP_EXACT || c == OP_POSUPTO)
351 code += IMM2_SIZE;
352
353 list[1] = (c != OP_PLUS && c != OP_MINPLUS && c != OP_EXACT &&
354 c != OP_POSPLUS);
355
356 switch(base)
357 {
358 case OP_STAR:
359 list[0] = OP_CHAR;
360 break;
361
362 case OP_STARI:
363 list[0] = OP_CHARI;
364 break;
365
366 case OP_NOTSTAR:
367 list[0] = OP_NOT;
368 break;
369
370 case OP_NOTSTARI:
371 list[0] = OP_NOTI;
372 break;
373
374 case OP_TYPESTAR:
375 list[0] = *code;
376 code++;
377 break;
378 }
379 c = list[0];
380 }
381
382switch(c)
383 {
384 case OP_NOT_DIGIT:
385 case OP_DIGIT:
386 case OP_NOT_WHITESPACE:
387 case OP_WHITESPACE:
388 case OP_NOT_WORDCHAR:
389 case OP_WORDCHAR:
390 case OP_ANY:
391 case OP_ALLANY:
392 case OP_ANYNL:
393 case OP_NOT_HSPACE:
394 case OP_HSPACE:
395 case OP_NOT_VSPACE:
396 case OP_VSPACE:
397 case OP_EXTUNI:
398 case OP_EODN:
399 case OP_EOD:
400 case OP_DOLL:
401 case OP_DOLLM:
402 return code;
403
404 case OP_CHAR:
405 case OP_NOT:
406 GETCHARINCTEST(chr, code);
407 list[2] = chr;
408 list[3] = NOTACHAR;
409 return code;
410
411 case OP_CHARI:
412 case OP_NOTI:
413 list[0] = (c == OP_CHARI) ? OP_CHAR : OP_NOT;
414 GETCHARINCTEST(chr, code);
415 list[2] = chr;
416
417#ifdef SUPPORT_UNICODE
Elliott Hughes2dbd7d22020-06-03 14:32:37 -0700418 if (chr < 128 || (chr < 256 && !utf && !ucp))
Janis Danisevskis53e448c2016-03-31 13:35:25 +0100419 list[3] = fcc[chr];
420 else
421 list[3] = UCD_OTHERCASE(chr);
422#elif defined SUPPORT_WIDE_CHARS
423 list[3] = (chr < 256) ? fcc[chr] : chr;
424#else
425 list[3] = fcc[chr];
426#endif
427
428 /* The othercase might be the same value. */
429
430 if (chr == list[3])
431 list[3] = NOTACHAR;
432 else
433 list[4] = NOTACHAR;
434 return code;
435
436#ifdef SUPPORT_UNICODE
437 case OP_PROP:
438 case OP_NOTPROP:
439 if (code[0] != PT_CLIST)
440 {
441 list[2] = code[0];
442 list[3] = code[1];
443 return code + 2;
444 }
445
446 /* Convert only if we have enough space. */
447
448 clist_src = PRIV(ucd_caseless_sets) + code[1];
449 clist_dest = list + 2;
450 code += 2;
451
452 do {
453 if (clist_dest >= list + 8)
454 {
455 /* Early return if there is not enough space. This should never
456 happen, since all clists are shorter than 5 character now. */
457 list[2] = code[0];
458 list[3] = code[1];
459 return code;
460 }
461 *clist_dest++ = *clist_src;
462 }
463 while(*clist_src++ != NOTACHAR);
464
465 /* All characters are stored. The terminating NOTACHAR is copied from the
466 clist itself. */
467
468 list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT;
469 return code;
470#endif
471
472 case OP_NCLASS:
473 case OP_CLASS:
474#ifdef SUPPORT_WIDE_CHARS
475 case OP_XCLASS:
476 if (c == OP_XCLASS)
477 end = code + GET(code, 0) - 1;
478 else
479#endif
480 end = code + 32 / sizeof(PCRE2_UCHAR);
481
482 switch(*end)
483 {
484 case OP_CRSTAR:
485 case OP_CRMINSTAR:
486 case OP_CRQUERY:
487 case OP_CRMINQUERY:
488 case OP_CRPOSSTAR:
489 case OP_CRPOSQUERY:
490 list[1] = TRUE;
491 end++;
492 break;
493
494 case OP_CRPLUS:
495 case OP_CRMINPLUS:
496 case OP_CRPOSPLUS:
497 end++;
498 break;
499
500 case OP_CRRANGE:
501 case OP_CRMINRANGE:
502 case OP_CRPOSRANGE:
503 list[1] = (GET2(end, 1) == 0);
504 end += 1 + 2 * IMM2_SIZE;
505 break;
506 }
507 list[2] = (uint32_t)(end - code);
508 return end;
509 }
Elliott Hughes378b1752021-06-08 13:42:40 -0700510
Janis Danisevskis53e448c2016-03-31 13:35:25 +0100511return NULL; /* Opcode not accepted */
512}
513
514
515
516/*************************************************
517* Scan further character sets for match *
518*************************************************/
519
520/* Checks whether the base and the current opcode have a common character, in
521which case the base cannot be possessified.
522
523Arguments:
524 code points to the byte code
525 utf TRUE in UTF mode
Elliott Hughes2dbd7d22020-06-03 14:32:37 -0700526 ucp TRUE in UCP mode
Janis Danisevskis53e448c2016-03-31 13:35:25 +0100527 cb compile data block
528 base_list the data list of the base opcode
Elliott Hughes653c2102019-01-09 15:41:36 -0800529 base_end the end of the base opcode
Janis Danisevskis53e448c2016-03-31 13:35:25 +0100530 rec_limit points to recursion depth counter
531
532Returns: TRUE if the auto-possessification is possible
533*/
534
535static BOOL
Elliott Hughes2dbd7d22020-06-03 14:32:37 -0700536compare_opcodes(PCRE2_SPTR code, BOOL utf, BOOL ucp, const compile_block *cb,
Janis Danisevskis53e448c2016-03-31 13:35:25 +0100537 const uint32_t *base_list, PCRE2_SPTR base_end, int *rec_limit)
538{
539PCRE2_UCHAR c;
540uint32_t list[8];
541const uint32_t *chr_ptr;
542const uint32_t *ochr_ptr;
543const uint32_t *list_ptr;
544PCRE2_SPTR next_code;
545#ifdef SUPPORT_WIDE_CHARS
546PCRE2_SPTR xclass_flags;
547#endif
548const uint8_t *class_bitset;
549const uint8_t *set1, *set2, *set_end;
550uint32_t chr;
551BOOL accepted, invert_bits;
552BOOL entered_a_group = FALSE;
553
554if (--(*rec_limit) <= 0) return FALSE; /* Recursion has gone too deep */
555
/* Note: base_list[1] records whether the current opcode has a greedy
(represented by a non-zero value) quantifier. This is different from
other character type lists, which store there whether the character iterator
can match an empty string (also represented by a non-zero value). */
560
561for(;;)
562 {
563 /* All operations move the code pointer forward.
564 Therefore infinite recursions are not possible. */
565
566 c = *code;
567
568 /* Skip over callouts */
569
570 if (c == OP_CALLOUT)
571 {
572 code += PRIV(OP_lengths)[c];
573 continue;
574 }
575
576 if (c == OP_CALLOUT_STR)
577 {
578 code += GET(code, 1 + 2*LINK_SIZE);
579 continue;
580 }
581
Elliott Hughes9bc971b2018-07-27 13:23:14 -0700582 /* At the end of a branch, skip to the end of the group. */
583
Janis Danisevskis53e448c2016-03-31 13:35:25 +0100584 if (c == OP_ALT)
585 {
586 do code += GET(code, 1); while (*code == OP_ALT);
587 c = *code;
588 }
589
Elliott Hughes9bc971b2018-07-27 13:23:14 -0700590 /* Inspect the next opcode. */
591
Janis Danisevskis53e448c2016-03-31 13:35:25 +0100592 switch(c)
593 {
Elliott Hughes9bc971b2018-07-27 13:23:14 -0700594 /* We can always possessify a greedy iterator at the end of the pattern,
595 which is reached after skipping over the final OP_KET. A non-greedy
596 iterator must never be possessified. */
Janis Danisevskis53e448c2016-03-31 13:35:25 +0100597
Elliott Hughes9bc971b2018-07-27 13:23:14 -0700598 case OP_END:
Janis Danisevskis53e448c2016-03-31 13:35:25 +0100599 return base_list[1] != 0;
600
Elliott Hughes9bc971b2018-07-27 13:23:14 -0700601 /* When an iterator is at the end of certain kinds of group we can inspect
602 what follows the group by skipping over the closing ket. Note that this
603 does not apply to OP_KETRMAX or OP_KETRMIN because what follows any given
604 iteration is variable (could be another iteration or could be the next
605 item). As these two opcodes are not listed in the next switch, they will
606 end up as the next code to inspect, and return FALSE by virtue of being
607 unsupported. */
608
Janis Danisevskis53e448c2016-03-31 13:35:25 +0100609 case OP_KET:
Elliott Hughes9bc971b2018-07-27 13:23:14 -0700610 case OP_KETRPOS:
611 /* The non-greedy case cannot be converted to a possessive form. */
Janis Danisevskis53e448c2016-03-31 13:35:25 +0100612
613 if (base_list[1] == 0) return FALSE;
614
Elliott Hughes9bc971b2018-07-27 13:23:14 -0700615 /* If the bracket is capturing it might be referenced by an OP_RECURSE
616 so its last iterator can never be possessified if the pattern contains
617 recursions. (This could be improved by keeping a list of group numbers that
618 are called by recursion.) */
619
Janis Danisevskis53e448c2016-03-31 13:35:25 +0100620 switch(*(code - GET(code, 1)))
621 {
Elliott Hughes9bc971b2018-07-27 13:23:14 -0700622 case OP_CBRA:
623 case OP_SCBRA:
624 case OP_CBRAPOS:
625 case OP_SCBRAPOS:
626 if (cb->had_recurse) return FALSE;
627 break;
628
Elliott Hughes0c26e192019-08-07 12:24:46 -0700629 /* A script run might have to backtrack if the iterated item can match
630 characters from more than one script. So give up unless repeating an
631 explicit character. */
632
633 case OP_SCRIPT_RUN:
634 if (base_list[0] != OP_CHAR && base_list[0] != OP_CHARI)
635 return FALSE;
636 break;
637
Elliott Hughes9bc971b2018-07-27 13:23:14 -0700638 /* Atomic sub-patterns and assertions can always auto-possessify their
639 last iterator. However, if the group was entered as a result of checking
640 a previous iterator, this is not possible. */
641
Janis Danisevskis53e448c2016-03-31 13:35:25 +0100642 case OP_ASSERT:
643 case OP_ASSERT_NOT:
644 case OP_ASSERTBACK:
645 case OP_ASSERTBACK_NOT:
646 case OP_ONCE:
Janis Danisevskis53e448c2016-03-31 13:35:25 +0100647 return !entered_a_group;
Elliott Hughes2dbd7d22020-06-03 14:32:37 -0700648
649 /* Non-atomic assertions - don't possessify last iterator. This needs
650 more thought. */
651
652 case OP_ASSERT_NA:
653 case OP_ASSERTBACK_NA:
654 return FALSE;
Janis Danisevskis53e448c2016-03-31 13:35:25 +0100655 }
656
Elliott Hughes9bc971b2018-07-27 13:23:14 -0700657 /* Skip over the bracket and inspect what comes next. */
658
Janis Danisevskis53e448c2016-03-31 13:35:25 +0100659 code += PRIV(OP_lengths)[c];
660 continue;
661
Elliott Hughes9bc971b2018-07-27 13:23:14 -0700662 /* Handle cases where the next item is a group. */
663
Janis Danisevskis53e448c2016-03-31 13:35:25 +0100664 case OP_ONCE:
Janis Danisevskis53e448c2016-03-31 13:35:25 +0100665 case OP_BRA:
666 case OP_CBRA:
667 next_code = code + GET(code, 1);
668 code += PRIV(OP_lengths)[c];
669
Elliott Hughes9bc971b2018-07-27 13:23:14 -0700670 /* Check each branch. We have to recurse a level for all but the last
671 branch. */
672
Janis Danisevskis53e448c2016-03-31 13:35:25 +0100673 while (*next_code == OP_ALT)
674 {
Elliott Hughes2dbd7d22020-06-03 14:32:37 -0700675 if (!compare_opcodes(code, utf, ucp, cb, base_list, base_end, rec_limit))
Janis Danisevskis53e448c2016-03-31 13:35:25 +0100676 return FALSE;
677 code = next_code + 1 + LINK_SIZE;
678 next_code += GET(next_code, 1);
679 }
680
681 entered_a_group = TRUE;
682 continue;
683
684 case OP_BRAZERO:
685 case OP_BRAMINZERO:
686
687 next_code = code + 1;
Elliott Hughes9bc971b2018-07-27 13:23:14 -0700688 if (*next_code != OP_BRA && *next_code != OP_CBRA &&
689 *next_code != OP_ONCE) return FALSE;
Janis Danisevskis53e448c2016-03-31 13:35:25 +0100690
691 do next_code += GET(next_code, 1); while (*next_code == OP_ALT);
692
693 /* The bracket content will be checked by the OP_BRA/OP_CBRA case above. */
694
695 next_code += 1 + LINK_SIZE;
Elliott Hughes2dbd7d22020-06-03 14:32:37 -0700696 if (!compare_opcodes(next_code, utf, ucp, cb, base_list, base_end,
697 rec_limit))
Janis Danisevskis53e448c2016-03-31 13:35:25 +0100698 return FALSE;
699
700 code += PRIV(OP_lengths)[c];
701 continue;
702
Elliott Hughes9bc971b2018-07-27 13:23:14 -0700703 /* The next opcode does not need special handling; fall through and use it
704 to see if the base can be possessified. */
705
Janis Danisevskis53e448c2016-03-31 13:35:25 +0100706 default:
707 break;
708 }
709
Elliott Hughes9bc971b2018-07-27 13:23:14 -0700710 /* We now have the next appropriate opcode to compare with the base. Check
711 for a supported opcode, and load its properties. */
Janis Danisevskis53e448c2016-03-31 13:35:25 +0100712
Elliott Hughes2dbd7d22020-06-03 14:32:37 -0700713 code = get_chr_property_list(code, utf, ucp, cb->fcc, list);
Janis Danisevskis53e448c2016-03-31 13:35:25 +0100714 if (code == NULL) return FALSE; /* Unsupported */
715
716 /* If either opcode is a small character list, set pointers for comparing
717 characters from that list with another list, or with a property. */
718
719 if (base_list[0] == OP_CHAR)
720 {
721 chr_ptr = base_list + 2;
722 list_ptr = list;
723 }
724 else if (list[0] == OP_CHAR)
725 {
726 chr_ptr = list + 2;
727 list_ptr = base_list;
728 }
729
730 /* Character bitsets can also be compared to certain opcodes. */
731
732 else if (base_list[0] == OP_CLASS || list[0] == OP_CLASS
733#if PCRE2_CODE_UNIT_WIDTH == 8
734 /* In 8 bit, non-UTF mode, OP_CLASS and OP_NCLASS are the same. */
735 || (!utf && (base_list[0] == OP_NCLASS || list[0] == OP_NCLASS))
736#endif
737 )
738 {
739#if PCRE2_CODE_UNIT_WIDTH == 8
740 if (base_list[0] == OP_CLASS || (!utf && base_list[0] == OP_NCLASS))
741#else
742 if (base_list[0] == OP_CLASS)
743#endif
744 {
745 set1 = (uint8_t *)(base_end - base_list[2]);
746 list_ptr = list;
747 }
748 else
749 {
750 set1 = (uint8_t *)(code - list[2]);
751 list_ptr = base_list;
752 }
753
754 invert_bits = FALSE;
755 switch(list_ptr[0])
756 {
757 case OP_CLASS:
758 case OP_NCLASS:
759 set2 = (uint8_t *)
760 ((list_ptr == list ? code : base_end) - list_ptr[2]);
761 break;
762
763#ifdef SUPPORT_WIDE_CHARS
764 case OP_XCLASS:
765 xclass_flags = (list_ptr == list ? code : base_end) - list_ptr[2] + LINK_SIZE;
766 if ((*xclass_flags & XCL_HASPROP) != 0) return FALSE;
767 if ((*xclass_flags & XCL_MAP) == 0)
768 {
769 /* No bits are set for characters < 256. */
Elliott Hughes653c2102019-01-09 15:41:36 -0800770 if (list[1] == 0) return (*xclass_flags & XCL_NOT) == 0;
Janis Danisevskis53e448c2016-03-31 13:35:25 +0100771 /* Might be an empty repeat. */
772 continue;
773 }
774 set2 = (uint8_t *)(xclass_flags + 1);
775 break;
776#endif
777
778 case OP_NOT_DIGIT:
779 invert_bits = TRUE;
780 /* Fall through */
781 case OP_DIGIT:
782 set2 = (uint8_t *)(cb->cbits + cbit_digit);
783 break;
784
785 case OP_NOT_WHITESPACE:
786 invert_bits = TRUE;
787 /* Fall through */
788 case OP_WHITESPACE:
789 set2 = (uint8_t *)(cb->cbits + cbit_space);
790 break;
791
792 case OP_NOT_WORDCHAR:
793 invert_bits = TRUE;
794 /* Fall through */
795 case OP_WORDCHAR:
796 set2 = (uint8_t *)(cb->cbits + cbit_word);
797 break;
798
799 default:
800 return FALSE;
801 }
802
803 /* Because the bit sets are unaligned bytes, we need to perform byte
804 comparison here. */
805
806 set_end = set1 + 32;
807 if (invert_bits)
808 {
809 do
810 {
811 if ((*set1++ & ~(*set2++)) != 0) return FALSE;
812 }
813 while (set1 < set_end);
814 }
815 else
816 {
817 do
818 {
819 if ((*set1++ & *set2++) != 0) return FALSE;
820 }
821 while (set1 < set_end);
822 }
823
824 if (list[1] == 0) return TRUE;
825 /* Might be an empty repeat. */
826 continue;
827 }
828
829 /* Some property combinations also acceptable. Unicode property opcodes are
830 processed specially; the rest can be handled with a lookup table. */
831
832 else
833 {
834 uint32_t leftop, rightop;
835
836 leftop = base_list[0];
837 rightop = list[0];
838
839#ifdef SUPPORT_UNICODE
840 accepted = FALSE; /* Always set in non-unicode case. */
841 if (leftop == OP_PROP || leftop == OP_NOTPROP)
842 {
843 if (rightop == OP_EOD)
844 accepted = TRUE;
845 else if (rightop == OP_PROP || rightop == OP_NOTPROP)
846 {
847 int n;
848 const uint8_t *p;
849 BOOL same = leftop == rightop;
850 BOOL lisprop = leftop == OP_PROP;
851 BOOL risprop = rightop == OP_PROP;
852 BOOL bothprop = lisprop && risprop;
853
854 /* There's a table that specifies how each combination is to be
855 processed:
856 0 Always return FALSE (never auto-possessify)
857 1 Character groups are distinct (possessify if both are OP_PROP)
858 2 Check character categories in the same group (general or particular)
859 3 Return TRUE if the two opcodes are not the same
860 ... see comments below
861 */
862
863 n = propposstab[base_list[2]][list[2]];
864 switch(n)
865 {
866 case 0: break;
867 case 1: accepted = bothprop; break;
868 case 2: accepted = (base_list[3] == list[3]) != same; break;
869 case 3: accepted = !same; break;
870
871 case 4: /* Left general category, right particular category */
872 accepted = risprop && catposstab[base_list[3]][list[3]] == same;
873 break;
874
875 case 5: /* Right general category, left particular category */
876 accepted = lisprop && catposstab[list[3]][base_list[3]] == same;
877 break;
878
879 /* This code is logically tricky. Think hard before fiddling with it.
880 The posspropstab table has four entries per row. Each row relates to
881 one of PCRE's special properties such as ALNUM or SPACE or WORD.
882 Only WORD actually needs all four entries, but using repeats for the
883 others means they can all use the same code below.
884
885 The first two entries in each row are Unicode general categories, and
886 apply always, because all the characters they include are part of the
887 PCRE character set. The third and fourth entries are a general and a
888 particular category, respectively, that include one or more relevant
889 characters. One or the other is used, depending on whether the check
890 is for a general or a particular category. However, in both cases the
891 category contains more characters than the specials that are defined
892 for the property being tested against. Therefore, it cannot be used
893 in a NOTPROP case.
894
895 Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.
896 Underscore is covered by ucp_P or ucp_Po. */
897
898 case 6: /* Left alphanum vs right general category */
899 case 7: /* Left space vs right general category */
900 case 8: /* Left word vs right general category */
901 p = posspropstab[n-6];
902 accepted = risprop && lisprop ==
903 (list[3] != p[0] &&
904 list[3] != p[1] &&
905 (list[3] != p[2] || !lisprop));
906 break;
907
908 case 9: /* Right alphanum vs left general category */
909 case 10: /* Right space vs left general category */
910 case 11: /* Right word vs left general category */
911 p = posspropstab[n-9];
912 accepted = lisprop && risprop ==
913 (base_list[3] != p[0] &&
914 base_list[3] != p[1] &&
915 (base_list[3] != p[2] || !risprop));
916 break;
917
918 case 12: /* Left alphanum vs right particular category */
919 case 13: /* Left space vs right particular category */
920 case 14: /* Left word vs right particular category */
921 p = posspropstab[n-12];
922 accepted = risprop && lisprop ==
923 (catposstab[p[0]][list[3]] &&
924 catposstab[p[1]][list[3]] &&
925 (list[3] != p[3] || !lisprop));
926 break;
927
928 case 15: /* Right alphanum vs left particular category */
929 case 16: /* Right space vs left particular category */
930 case 17: /* Right word vs left particular category */
931 p = posspropstab[n-15];
932 accepted = lisprop && risprop ==
933 (catposstab[p[0]][base_list[3]] &&
934 catposstab[p[1]][base_list[3]] &&
935 (base_list[3] != p[3] || !risprop));
936 break;
937 }
938 }
939 }
940
941 else
942#endif /* SUPPORT_UNICODE */
943
944 accepted = leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP &&
945 rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP &&
946 autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP];
947
948 if (!accepted) return FALSE;
949
950 if (list[1] == 0) return TRUE;
951 /* Might be an empty repeat. */
952 continue;
953 }
954
955 /* Control reaches here only if one of the items is a small character list.
956 All characters are checked against the other side. */
957
958 do
959 {
960 chr = *chr_ptr;
961
962 switch(list_ptr[0])
963 {
964 case OP_CHAR:
965 ochr_ptr = list_ptr + 2;
966 do
967 {
968 if (chr == *ochr_ptr) return FALSE;
969 ochr_ptr++;
970 }
971 while(*ochr_ptr != NOTACHAR);
972 break;
973
974 case OP_NOT:
975 ochr_ptr = list_ptr + 2;
976 do
977 {
978 if (chr == *ochr_ptr)
979 break;
980 ochr_ptr++;
981 }
982 while(*ochr_ptr != NOTACHAR);
983 if (*ochr_ptr == NOTACHAR) return FALSE; /* Not found */
984 break;
985
986 /* Note that OP_DIGIT etc. are generated only when PCRE2_UCP is *not*
987 set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
988
989 case OP_DIGIT:
990 if (chr < 256 && (cb->ctypes[chr] & ctype_digit) != 0) return FALSE;
991 break;
992
993 case OP_NOT_DIGIT:
994 if (chr > 255 || (cb->ctypes[chr] & ctype_digit) == 0) return FALSE;
995 break;
996
997 case OP_WHITESPACE:
998 if (chr < 256 && (cb->ctypes[chr] & ctype_space) != 0) return FALSE;
999 break;
1000
1001 case OP_NOT_WHITESPACE:
1002 if (chr > 255 || (cb->ctypes[chr] & ctype_space) == 0) return FALSE;
1003 break;
1004
1005 case OP_WORDCHAR:
1006 if (chr < 255 && (cb->ctypes[chr] & ctype_word) != 0) return FALSE;
1007 break;
1008
1009 case OP_NOT_WORDCHAR:
1010 if (chr > 255 || (cb->ctypes[chr] & ctype_word) == 0) return FALSE;
1011 break;
1012
1013 case OP_HSPACE:
1014 switch(chr)
1015 {
1016 HSPACE_CASES: return FALSE;
1017 default: break;
1018 }
1019 break;
1020
1021 case OP_NOT_HSPACE:
1022 switch(chr)
1023 {
1024 HSPACE_CASES: break;
1025 default: return FALSE;
1026 }
1027 break;
1028
1029 case OP_ANYNL:
1030 case OP_VSPACE:
1031 switch(chr)
1032 {
1033 VSPACE_CASES: return FALSE;
1034 default: break;
1035 }
1036 break;
1037
1038 case OP_NOT_VSPACE:
1039 switch(chr)
1040 {
1041 VSPACE_CASES: break;
1042 default: return FALSE;
1043 }
1044 break;
1045
1046 case OP_DOLL:
1047 case OP_EODN:
1048 switch (chr)
1049 {
1050 case CHAR_CR:
1051 case CHAR_LF:
1052 case CHAR_VT:
1053 case CHAR_FF:
1054 case CHAR_NEL:
1055#ifndef EBCDIC
1056 case 0x2028:
1057 case 0x2029:
1058#endif /* Not EBCDIC */
1059 return FALSE;
1060 }
1061 break;
1062
1063 case OP_EOD: /* Can always possessify before \z */
1064 break;
1065
1066#ifdef SUPPORT_UNICODE
1067 case OP_PROP:
1068 case OP_NOTPROP:
1069 if (!check_char_prop(chr, list_ptr[2], list_ptr[3],
1070 list_ptr[0] == OP_NOTPROP))
1071 return FALSE;
1072 break;
1073#endif
1074
1075 case OP_NCLASS:
1076 if (chr > 255) return FALSE;
1077 /* Fall through */
1078
1079 case OP_CLASS:
1080 if (chr > 255) break;
1081 class_bitset = (uint8_t *)
1082 ((list_ptr == list ? code : base_end) - list_ptr[2]);
Elliott Hughes0c26e192019-08-07 12:24:46 -07001083 if ((class_bitset[chr >> 3] & (1u << (chr & 7))) != 0) return FALSE;
Janis Danisevskis53e448c2016-03-31 13:35:25 +01001084 break;
1085
1086#ifdef SUPPORT_WIDE_CHARS
1087 case OP_XCLASS:
1088 if (PRIV(xclass)(chr, (list_ptr == list ? code : base_end) -
1089 list_ptr[2] + LINK_SIZE, utf)) return FALSE;
1090 break;
1091#endif
1092
1093 default:
1094 return FALSE;
1095 }
1096
1097 chr_ptr++;
1098 }
1099 while(*chr_ptr != NOTACHAR);
1100
1101 /* At least one character must be matched from this opcode. */
1102
1103 if (list[1] == 0) return TRUE;
1104 }
1105
1106/* Control never reaches here. There used to be a fail-save return FALSE; here,
1107but some compilers complain about an unreachable statement. */
1108}
1109
1110
1111
1112/*************************************************
1113* Scan compiled regex for auto-possession *
1114*************************************************/
1115
1116/* Replaces single character iterations with their possessive alternatives
1117if appropriate. This function modifies the compiled opcode! Hitting a
Elliott Hughes9bc971b2018-07-27 13:23:14 -07001118non-existent opcode may indicate a bug in PCRE2, but it can also be caused if a
1119bad UTF string was compiled with PCRE2_NO_UTF_CHECK. The rec_limit catches
1120overly complicated or large patterns. In these cases, the check just stops,
1121leaving the remainder of the pattern unpossessified.
Janis Danisevskis53e448c2016-03-31 13:35:25 +01001122
1123Arguments:
1124 code points to start of the byte code
Janis Danisevskis53e448c2016-03-31 13:35:25 +01001125 cb compile data block
1126
1127Returns: 0 for success
1128 -1 if a non-existant opcode is encountered
1129*/
1130
1131int
Elliott Hughes2dbd7d22020-06-03 14:32:37 -07001132PRIV(auto_possessify)(PCRE2_UCHAR *code, const compile_block *cb)
Janis Danisevskis53e448c2016-03-31 13:35:25 +01001133{
Elliott Hughes9bc971b2018-07-27 13:23:14 -07001134PCRE2_UCHAR c;
Janis Danisevskis53e448c2016-03-31 13:35:25 +01001135PCRE2_SPTR end;
1136PCRE2_UCHAR *repeat_opcode;
1137uint32_t list[8];
Elliott Hughes9bc971b2018-07-27 13:23:14 -07001138int rec_limit = 1000; /* Was 10,000 but clang+ASAN uses a lot of stack. */
Elliott Hughes2dbd7d22020-06-03 14:32:37 -07001139BOOL utf = (cb->external_options & PCRE2_UTF) != 0;
1140BOOL ucp = (cb->external_options & PCRE2_UCP) != 0;
Janis Danisevskis53e448c2016-03-31 13:35:25 +01001141
1142for (;;)
1143 {
1144 c = *code;
1145
Elliott Hughes9bc971b2018-07-27 13:23:14 -07001146 if (c >= OP_TABLE_LENGTH) return -1; /* Something gone wrong */
Janis Danisevskis53e448c2016-03-31 13:35:25 +01001147
1148 if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
1149 {
1150 c -= get_repeat_base(c) - OP_STAR;
1151 end = (c <= OP_MINUPTO) ?
Elliott Hughes2dbd7d22020-06-03 14:32:37 -07001152 get_chr_property_list(code, utf, ucp, cb->fcc, list) : NULL;
Janis Danisevskis53e448c2016-03-31 13:35:25 +01001153 list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;
1154
Elliott Hughes2dbd7d22020-06-03 14:32:37 -07001155 if (end != NULL && compare_opcodes(end, utf, ucp, cb, list, end,
1156 &rec_limit))
Janis Danisevskis53e448c2016-03-31 13:35:25 +01001157 {
1158 switch(c)
1159 {
1160 case OP_STAR:
1161 *code += OP_POSSTAR - OP_STAR;
1162 break;
1163
1164 case OP_MINSTAR:
1165 *code += OP_POSSTAR - OP_MINSTAR;
1166 break;
1167
1168 case OP_PLUS:
1169 *code += OP_POSPLUS - OP_PLUS;
1170 break;
1171
1172 case OP_MINPLUS:
1173 *code += OP_POSPLUS - OP_MINPLUS;
1174 break;
1175
1176 case OP_QUERY:
1177 *code += OP_POSQUERY - OP_QUERY;
1178 break;
1179
1180 case OP_MINQUERY:
1181 *code += OP_POSQUERY - OP_MINQUERY;
1182 break;
1183
1184 case OP_UPTO:
1185 *code += OP_POSUPTO - OP_UPTO;
1186 break;
1187
1188 case OP_MINUPTO:
1189 *code += OP_POSUPTO - OP_MINUPTO;
1190 break;
1191 }
1192 }
1193 c = *code;
1194 }
1195 else if (c == OP_CLASS || c == OP_NCLASS || c == OP_XCLASS)
1196 {
1197#ifdef SUPPORT_WIDE_CHARS
1198 if (c == OP_XCLASS)
1199 repeat_opcode = code + GET(code, 1);
1200 else
1201#endif
1202 repeat_opcode = code + 1 + (32 / sizeof(PCRE2_UCHAR));
1203
1204 c = *repeat_opcode;
1205 if (c >= OP_CRSTAR && c <= OP_CRMINRANGE)
1206 {
Elliott Hughes378b1752021-06-08 13:42:40 -07001207 /* The return from get_chr_property_list() will never be NULL when
1208 *code (aka c) is one of the three class opcodes. However, gcc with
1209 -fanalyzer notes that a NULL return is possible, and grumbles. Hence we
1210 put in a check. */
Janis Danisevskis53e448c2016-03-31 13:35:25 +01001211
Elliott Hughes378b1752021-06-08 13:42:40 -07001212 end = get_chr_property_list(code, utf, ucp, cb->fcc, list);
Janis Danisevskis53e448c2016-03-31 13:35:25 +01001213 list[1] = (c & 1) == 0;
1214
Elliott Hughes378b1752021-06-08 13:42:40 -07001215 if (end != NULL &&
1216 compare_opcodes(end, utf, ucp, cb, list, end, &rec_limit))
Janis Danisevskis53e448c2016-03-31 13:35:25 +01001217 {
1218 switch (c)
1219 {
1220 case OP_CRSTAR:
1221 case OP_CRMINSTAR:
1222 *repeat_opcode = OP_CRPOSSTAR;
1223 break;
1224
1225 case OP_CRPLUS:
1226 case OP_CRMINPLUS:
1227 *repeat_opcode = OP_CRPOSPLUS;
1228 break;
1229
1230 case OP_CRQUERY:
1231 case OP_CRMINQUERY:
1232 *repeat_opcode = OP_CRPOSQUERY;
1233 break;
1234
1235 case OP_CRRANGE:
1236 case OP_CRMINRANGE:
1237 *repeat_opcode = OP_CRPOSRANGE;
1238 break;
1239 }
1240 }
1241 }
1242 c = *code;
1243 }
1244
1245 switch(c)
1246 {
1247 case OP_END:
1248 return 0;
1249
1250 case OP_TYPESTAR:
1251 case OP_TYPEMINSTAR:
1252 case OP_TYPEPLUS:
1253 case OP_TYPEMINPLUS:
1254 case OP_TYPEQUERY:
1255 case OP_TYPEMINQUERY:
1256 case OP_TYPEPOSSTAR:
1257 case OP_TYPEPOSPLUS:
1258 case OP_TYPEPOSQUERY:
1259 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1260 break;
1261
1262 case OP_TYPEUPTO:
1263 case OP_TYPEMINUPTO:
1264 case OP_TYPEEXACT:
1265 case OP_TYPEPOSUPTO:
1266 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
1267 code += 2;
1268 break;
1269
1270 case OP_CALLOUT_STR:
1271 code += GET(code, 1 + 2*LINK_SIZE);
1272 break;
1273
1274#ifdef SUPPORT_WIDE_CHARS
1275 case OP_XCLASS:
1276 code += GET(code, 1);
1277 break;
1278#endif
1279
1280 case OP_MARK:
Elliott Hughes653c2102019-01-09 15:41:36 -08001281 case OP_COMMIT_ARG:
Janis Danisevskis53e448c2016-03-31 13:35:25 +01001282 case OP_PRUNE_ARG:
1283 case OP_SKIP_ARG:
1284 case OP_THEN_ARG:
1285 code += code[1];
1286 break;
1287 }
1288
1289 /* Add in the fixed length from the table */
1290
1291 code += PRIV(OP_lengths)[c];
1292
1293 /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may be
1294 followed by a multi-byte character. The length in the table is a minimum, so
1295 we have to arrange to skip the extra code units. */
1296
1297#ifdef MAYBE_UTF_MULTI
1298 if (utf) switch(c)
1299 {
1300 case OP_CHAR:
1301 case OP_CHARI:
1302 case OP_NOT:
1303 case OP_NOTI:
1304 case OP_STAR:
1305 case OP_MINSTAR:
1306 case OP_PLUS:
1307 case OP_MINPLUS:
1308 case OP_QUERY:
1309 case OP_MINQUERY:
1310 case OP_UPTO:
1311 case OP_MINUPTO:
1312 case OP_EXACT:
1313 case OP_POSSTAR:
1314 case OP_POSPLUS:
1315 case OP_POSQUERY:
1316 case OP_POSUPTO:
1317 case OP_STARI:
1318 case OP_MINSTARI:
1319 case OP_PLUSI:
1320 case OP_MINPLUSI:
1321 case OP_QUERYI:
1322 case OP_MINQUERYI:
1323 case OP_UPTOI:
1324 case OP_MINUPTOI:
1325 case OP_EXACTI:
1326 case OP_POSSTARI:
1327 case OP_POSPLUSI:
1328 case OP_POSQUERYI:
1329 case OP_POSUPTOI:
1330 case OP_NOTSTAR:
1331 case OP_NOTMINSTAR:
1332 case OP_NOTPLUS:
1333 case OP_NOTMINPLUS:
1334 case OP_NOTQUERY:
1335 case OP_NOTMINQUERY:
1336 case OP_NOTUPTO:
1337 case OP_NOTMINUPTO:
1338 case OP_NOTEXACT:
1339 case OP_NOTPOSSTAR:
1340 case OP_NOTPOSPLUS:
1341 case OP_NOTPOSQUERY:
1342 case OP_NOTPOSUPTO:
1343 case OP_NOTSTARI:
1344 case OP_NOTMINSTARI:
1345 case OP_NOTPLUSI:
1346 case OP_NOTMINPLUSI:
1347 case OP_NOTQUERYI:
1348 case OP_NOTMINQUERYI:
1349 case OP_NOTUPTOI:
1350 case OP_NOTMINUPTOI:
1351 case OP_NOTEXACTI:
1352 case OP_NOTPOSSTARI:
1353 case OP_NOTPOSPLUSI:
1354 case OP_NOTPOSQUERYI:
1355 case OP_NOTPOSUPTOI:
1356 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
1357 break;
1358 }
1359#else
1360 (void)(utf); /* Keep compiler happy by referencing function argument */
1361#endif /* SUPPORT_WIDE_CHARS */
1362 }
1363}
1364
1365/* End of pcre2_auto_possess.c */