blob: de259c9c40573d352b9769b16a1a09c813ab6ee6 [file] [log] [blame]
Elliott Hughes5b808042021-10-01 10:56:10 -07001/*************************************************
2* Perl-Compatible Regular Expressions *
3*************************************************/
4
5/* PCRE is a library of functions to support regular expressions whose syntax
6and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Original API code Copyright (c) 1997-2012 University of Cambridge
Elliott Hughes4e19c8e2022-04-15 15:11:02 -070010 New API code Copyright (c) 2016-2022 University of Cambridge
Elliott Hughes5b808042021-10-01 10:56:10 -070011
12-----------------------------------------------------------------------------
13Redistribution and use in source and binary forms, with or without
14modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37POSSIBILITY OF SUCH DAMAGE.
38-----------------------------------------------------------------------------
39*/
40
41
42#ifdef HAVE_CONFIG_H
43#include "config.h"
44#endif
45
46#define NLBLOCK cb /* Block containing newline information */
47#define PSSTART start_pattern /* Field containing processed string start */
48#define PSEND end_pattern /* Field containing processed string end */
49
50#include "pcre2_internal.h"
51
52/* In rare error cases debugging might require calling pcre2_printint(). */
53
54#if 0
55#ifdef EBCDIC
56#define PRINTABLE(c) ((c) >= 64 && (c) < 255)
57#else
58#define PRINTABLE(c) ((c) >= 32 && (c) < 127)
59#endif
60#include "pcre2_printint.c"
61#define DEBUG_CALL_PRINTINT
62#endif
63
64/* Other debugging code can be enabled by these defines. */
65
66/* #define DEBUG_SHOW_CAPTURES */
67/* #define DEBUG_SHOW_PARSED */
68
69/* There are a few things that vary with different code unit sizes. Handle them
70by defining macros in order to minimize #if usage. */
71
/* Per-width definitions. STRING_UTFn_RIGHTPAR expands to the "(*UTFn)" name
string followed by its length. XDIGIT(c) looks a code unit up in xdigitab; in
the 16-bit and 32-bit libraries a code unit can exceed 255, in which case it is
reported as 0xff (not a hex digit) without touching the table. */

#if PCRE2_CODE_UNIT_WIDTH == 8
#define STRING_UTFn_RIGHTPAR STRING_UTF8_RIGHTPAR, 5
#define XDIGIT(c)            xdigitab[c]

#elif PCRE2_CODE_UNIT_WIDTH == 16
#define STRING_UTFn_RIGHTPAR STRING_UTF16_RIGHTPAR, 6
#define XDIGIT(c)            (MAX_255(c)? xdigitab[c] : 0xff)

#else  /* 32-bit */
#define STRING_UTFn_RIGHTPAR STRING_UTF32_RIGHTPAR, 6
#define XDIGIT(c)            (MAX_255(c)? xdigitab[c] : 0xff)
#endif
86
87/* Macros to store and retrieve a PCRE2_SIZE value in the parsed pattern, which
88consists of uint32_t elements. Assume that if uint32_t can't hold it, two of
89them will be able to (i.e. assume a 64-bit world). */
90
/* Each macro takes an lvalue s of type PCRE2_SIZE (or a value, for PUTOFFSET)
and a uint32_t pointer lvalue p into the parsed pattern:

  PUTOFFSET      store s at p, advance p past it
  GETOFFSET      load s from p, advance p past it
  GETPLUSOFFSET  load s from the element(s) after p, leave p on the last one
  READPLUSOFFSET load s from the element(s) after p, leave p unchanged
  SKIPOFFSET     advance p by one stored offset
  SIZEOFFSET     number of uint32_t elements one offset occupies

All arguments are parenthesized so that expressions such as "a | b" or
"base + n" can be passed safely, and the two-element forms are wrapped in
do { } while (0) so that each use is a single statement. Note that s and p are
evaluated more than once in the two-element forms. */

#if PCRE2_SIZE_MAX <= UINT32_MAX
#define PUTOFFSET(s,p)      (*(p)++ = (s))
#define GETOFFSET(s,p)      ((s) = *(p)++)
#define GETPLUSOFFSET(s,p)  ((s) = *(++(p)))
#define READPLUSOFFSET(s,p) ((s) = (p)[1])
#define SKIPOFFSET(p)       ((p)++)
#define SIZEOFFSET 1
#else
#define PUTOFFSET(s,p) \
  do { *(p)++ = (uint32_t)((s) >> 32); \
       *(p)++ = (uint32_t)((s) & 0xffffffff); } while (0)
#define GETOFFSET(s,p) \
  do { (s) = ((PCRE2_SIZE)(p)[0] << 32) | (PCRE2_SIZE)(p)[1]; \
       (p) += 2; } while (0)
#define GETPLUSOFFSET(s,p) \
  do { (s) = ((PCRE2_SIZE)(p)[1] << 32) | (PCRE2_SIZE)(p)[2]; \
       (p) += 2; } while (0)
#define READPLUSOFFSET(s,p) \
  do { (s) = ((PCRE2_SIZE)(p)[1] << 32) | (PCRE2_SIZE)(p)[2]; } while (0)
#define SKIPOFFSET(p)       ((p) += 2)
#define SIZEOFFSET 2
#endif
110
111/* Macros for manipulating elements of the parsed pattern vector. */
112
/* META_CODE(x) keeps the top 16 bits of a parsed-pattern element (the item
code), META_DATA(x) keeps the low 16 bits (the item's data), and META_DIFF(x,y)
gives the distance between two item codes in units of one code. Arguments are
fully parenthesized so that compound expressions can be passed. */

#define META_CODE(x)   ((x) & 0xffff0000u)
#define META_DATA(x)   ((x) & 0x0000ffffu)
#define META_DIFF(x,y) (((x)-(y))>>16)
116
/* Function declarations (prototypes) to allow mutual recursion */
118
119#ifdef SUPPORT_UNICODE
120static unsigned int
121 add_list_to_class_internal(uint8_t *, PCRE2_UCHAR **, uint32_t,
122 compile_block *, const uint32_t *, unsigned int);
123#endif
124
125static int
126 compile_regex(uint32_t, PCRE2_UCHAR **, uint32_t **, int *, uint32_t,
Elliott Hughes4e19c8e2022-04-15 15:11:02 -0700127 uint32_t *, uint32_t *, uint32_t *, uint32_t *, branch_chain *,
Elliott Hughes5b808042021-10-01 10:56:10 -0700128 compile_block *, PCRE2_SIZE *);
129
130static int
131 get_branchlength(uint32_t **, int *, int *, parsed_recurse_check *,
132 compile_block *);
133
134static BOOL
135 set_lookbehind_lengths(uint32_t **, int *, int *, parsed_recurse_check *,
136 compile_block *);
137
138static int
139 check_lookbehinds(uint32_t *, uint32_t **, parsed_recurse_check *,
140 compile_block *, int *);
141
142
143/*************************************************
144* Code parameters and static tables *
145*************************************************/
146
147#define MAX_GROUP_NUMBER 65535u
148#define MAX_REPEAT_COUNT 65535u
149#define REPEAT_UNLIMITED (MAX_REPEAT_COUNT+1)
150
151/* COMPILE_WORK_SIZE specifies the size of stack workspace, which is used in
152different ways in the different pattern scans. The parsing and group-
153identifying pre-scan uses it to handle nesting, and needs it to be 16-bit
154aligned for this. Having defined the size in code units, we set up
155C16_WORK_SIZE as the number of elements in the 16-bit vector.
156
157During the first compiling phase, when determining how much memory is required,
158the regex is partly compiled into this space, but the compiled parts are
159discarded as soon as they can be, so that hopefully there will never be an
160overrun. The code does, however, check for an overrun, which can occur for
161pathological patterns. The size of the workspace depends on LINK_SIZE because
162the length of compiled items varies with this.
163
164In the real compile phase, this workspace is not currently used. */
165
166#define COMPILE_WORK_SIZE (3000*LINK_SIZE) /* Size in code units */
167
168#define C16_WORK_SIZE \
169 ((COMPILE_WORK_SIZE * sizeof(PCRE2_UCHAR))/sizeof(uint16_t))
170
171/* A uint32_t vector is used for caching information about the size of
172capturing groups, to improve performance. A default is created on the stack of
173this size. */
174
175#define GROUPINFO_DEFAULT_SIZE 256
176
177/* The overrun tests check for a slightly smaller size so that they detect the
178overrun before it actually does run off the end of the data block. */
179
180#define WORK_SIZE_SAFETY_MARGIN (100)
181
182/* This value determines the size of the initial vector that is used for
183remembering named groups during the pre-compile. It is allocated on the stack,
184but if it is too small, it is expanded, in a similar way to the workspace. The
185value is the number of slots in the list. */
186
187#define NAMED_GROUP_LIST_SIZE 20
188
189/* The pre-compiling pass over the pattern creates a parsed pattern in a vector
190of uint32_t. For short patterns this lives on the stack, with this size. Heap
191memory is used for longer patterns. */
192
193#define PARSED_PATTERN_DEFAULT_SIZE 1024
194
195/* Maximum length value to check against when making sure that the variable
196that holds the compiled pattern length does not overflow. We make it a bit less
197than INT_MAX to allow for adding in group terminating code units, so that we
198don't have to check them every time. */
199
200#define OFLOW_MAX (INT_MAX - 20)
201
202/* Code values for parsed patterns, which are stored in a vector of 32-bit
203unsigned ints. Values less than META_END are literal data values. The coding
204for identifying the item is in the top 16-bits, leaving 16 bits for the
205additional data that some of them need. The META_CODE, META_DATA, and META_DIFF
206macros are used to manipulate parsed pattern elements.
207
208NOTE: When these definitions are changed, the table of extra lengths for each
209code (meta_extra_lengths, just below) must be updated to remain in step. */
210
211#define META_END 0x80000000u /* End of pattern */
212
213#define META_ALT 0x80010000u /* alternation */
214#define META_ATOMIC 0x80020000u /* atomic group */
215#define META_BACKREF 0x80030000u /* Back ref */
216#define META_BACKREF_BYNAME 0x80040000u /* \k'name' */
217#define META_BIGVALUE 0x80050000u /* Next is a literal > META_END */
218#define META_CALLOUT_NUMBER 0x80060000u /* (?C with numerical argument */
219#define META_CALLOUT_STRING 0x80070000u /* (?C with string argument */
220#define META_CAPTURE 0x80080000u /* Capturing parenthesis */
221#define META_CIRCUMFLEX 0x80090000u /* ^ metacharacter */
222#define META_CLASS 0x800a0000u /* start non-empty class */
223#define META_CLASS_EMPTY 0x800b0000u /* empty class */
224#define META_CLASS_EMPTY_NOT 0x800c0000u /* negative empty class */
225#define META_CLASS_END 0x800d0000u /* end of non-empty class */
226#define META_CLASS_NOT 0x800e0000u /* start non-empty negative class */
227#define META_COND_ASSERT 0x800f0000u /* (?(?assertion)... */
228#define META_COND_DEFINE 0x80100000u /* (?(DEFINE)... */
229#define META_COND_NAME 0x80110000u /* (?(<name>)... */
230#define META_COND_NUMBER 0x80120000u /* (?(digits)... */
231#define META_COND_RNAME 0x80130000u /* (?(R&name)... */
232#define META_COND_RNUMBER 0x80140000u /* (?(Rdigits)... */
233#define META_COND_VERSION 0x80150000u /* (?(VERSION<op>x.y)... */
234#define META_DOLLAR 0x80160000u /* $ metacharacter */
235#define META_DOT 0x80170000u /* . metacharacter */
236#define META_ESCAPE 0x80180000u /* \d and friends */
237#define META_KET 0x80190000u /* closing parenthesis */
238#define META_NOCAPTURE 0x801a0000u /* no capture parens */
239#define META_OPTIONS 0x801b0000u /* (?i) and friends */
240#define META_POSIX 0x801c0000u /* POSIX class item */
241#define META_POSIX_NEG 0x801d0000u /* negative POSIX class item */
242#define META_RANGE_ESCAPED 0x801e0000u /* range with at least one escape */
243#define META_RANGE_LITERAL 0x801f0000u /* range defined literally */
244#define META_RECURSE 0x80200000u /* Recursion */
245#define META_RECURSE_BYNAME 0x80210000u /* (?&name) */
246#define META_SCRIPT_RUN 0x80220000u /* (*script_run:...) */
247
248/* These must be kept together to make it easy to check that an assertion
249is present where expected in a conditional group. */
250
251#define META_LOOKAHEAD 0x80230000u /* (?= */
252#define META_LOOKAHEADNOT 0x80240000u /* (?! */
253#define META_LOOKBEHIND 0x80250000u /* (?<= */
254#define META_LOOKBEHINDNOT 0x80260000u /* (?<! */
255
256/* These cannot be conditions */
257
258#define META_LOOKAHEAD_NA 0x80270000u /* (*napla: */
259#define META_LOOKBEHIND_NA 0x80280000u /* (*naplb: */
260
261/* These must be kept in this order, with consecutive values, and the _ARG
262versions of COMMIT, PRUNE, SKIP, and THEN immediately after their non-argument
263versions. */
264
265#define META_MARK 0x80290000u /* (*MARK) */
266#define META_ACCEPT 0x802a0000u /* (*ACCEPT) */
267#define META_FAIL 0x802b0000u /* (*FAIL) */
268#define META_COMMIT 0x802c0000u /* These */
269#define META_COMMIT_ARG 0x802d0000u /* pairs */
270#define META_PRUNE 0x802e0000u /* must */
271#define META_PRUNE_ARG 0x802f0000u /* be */
272#define META_SKIP 0x80300000u /* kept */
273#define META_SKIP_ARG 0x80310000u /* in */
274#define META_THEN 0x80320000u /* this */
275#define META_THEN_ARG 0x80330000u /* order */
276
277/* These must be kept in groups of adjacent 3 values, and all together. */
278
279#define META_ASTERISK 0x80340000u /* * */
280#define META_ASTERISK_PLUS 0x80350000u /* *+ */
281#define META_ASTERISK_QUERY 0x80360000u /* *? */
282#define META_PLUS 0x80370000u /* + */
283#define META_PLUS_PLUS 0x80380000u /* ++ */
284#define META_PLUS_QUERY 0x80390000u /* +? */
285#define META_QUERY 0x803a0000u /* ? */
286#define META_QUERY_PLUS 0x803b0000u /* ?+ */
287#define META_QUERY_QUERY 0x803c0000u /* ?? */
288#define META_MINMAX 0x803d0000u /* {n,m} repeat */
289#define META_MINMAX_PLUS 0x803e0000u /* {n,m}+ repeat */
290#define META_MINMAX_QUERY 0x803f0000u /* {n,m}? repeat */
291
292#define META_FIRST_QUANTIFIER META_ASTERISK
293#define META_LAST_QUANTIFIER META_MINMAX_QUERY
294
/* This is a special "meta code" that is used only to distinguish (*asr: from
(*sr: in the table of alphabetic assertions. It is never stored in the parsed
pattern because (*asr: is turned into (*sr:(*atomic: at that stage. There is
therefore no need for it to have a length entry, so use a high value. */
299
300#define META_ATOMIC_SCRIPT_RUN 0x8fff0000u
301
302/* Table of extra lengths for each of the meta codes. Must be kept in step with
303the definitions above. For some items these values are a basic length to which
304a variable amount has to be added. */
305
306static unsigned char meta_extra_lengths[] = {
307 0, /* META_END */
308 0, /* META_ALT */
309 0, /* META_ATOMIC */
310 0, /* META_BACKREF - more if group is >= 10 */
311 1+SIZEOFFSET, /* META_BACKREF_BYNAME */
312 1, /* META_BIGVALUE */
313 3, /* META_CALLOUT_NUMBER */
314 3+SIZEOFFSET, /* META_CALLOUT_STRING */
315 0, /* META_CAPTURE */
316 0, /* META_CIRCUMFLEX */
317 0, /* META_CLASS */
318 0, /* META_CLASS_EMPTY */
319 0, /* META_CLASS_EMPTY_NOT */
320 0, /* META_CLASS_END */
321 0, /* META_CLASS_NOT */
322 0, /* META_COND_ASSERT */
323 SIZEOFFSET, /* META_COND_DEFINE */
324 1+SIZEOFFSET, /* META_COND_NAME */
325 1+SIZEOFFSET, /* META_COND_NUMBER */
326 1+SIZEOFFSET, /* META_COND_RNAME */
327 1+SIZEOFFSET, /* META_COND_RNUMBER */
328 3, /* META_COND_VERSION */
329 0, /* META_DOLLAR */
330 0, /* META_DOT */
331 0, /* META_ESCAPE - more for ESC_P, ESC_p, ESC_g, ESC_k */
332 0, /* META_KET */
333 0, /* META_NOCAPTURE */
334 1, /* META_OPTIONS */
335 1, /* META_POSIX */
336 1, /* META_POSIX_NEG */
337 0, /* META_RANGE_ESCAPED */
338 0, /* META_RANGE_LITERAL */
339 SIZEOFFSET, /* META_RECURSE */
340 1+SIZEOFFSET, /* META_RECURSE_BYNAME */
341 0, /* META_SCRIPT_RUN */
342 0, /* META_LOOKAHEAD */
343 0, /* META_LOOKAHEADNOT */
344 SIZEOFFSET, /* META_LOOKBEHIND */
345 SIZEOFFSET, /* META_LOOKBEHINDNOT */
346 0, /* META_LOOKAHEAD_NA */
347 SIZEOFFSET, /* META_LOOKBEHIND_NA */
348 1, /* META_MARK - plus the string length */
349 0, /* META_ACCEPT */
350 0, /* META_FAIL */
351 0, /* META_COMMIT */
352 1, /* META_COMMIT_ARG - plus the string length */
353 0, /* META_PRUNE */
354 1, /* META_PRUNE_ARG - plus the string length */
355 0, /* META_SKIP */
356 1, /* META_SKIP_ARG - plus the string length */
357 0, /* META_THEN */
358 1, /* META_THEN_ARG - plus the string length */
359 0, /* META_ASTERISK */
360 0, /* META_ASTERISK_PLUS */
361 0, /* META_ASTERISK_QUERY */
362 0, /* META_PLUS */
363 0, /* META_PLUS_PLUS */
364 0, /* META_PLUS_QUERY */
365 0, /* META_QUERY */
366 0, /* META_QUERY_PLUS */
367 0, /* META_QUERY_QUERY */
368 2, /* META_MINMAX */
369 2, /* META_MINMAX_PLUS */
370 2 /* META_MINMAX_QUERY */
371};
372
373/* Types for skipping parts of a parsed pattern. */
374
375enum { PSKIP_ALT, PSKIP_CLASS, PSKIP_KET };
376
/* Set bit number b in the class bitmap a (an array of uint8_t, eight bits per
element). The obvious compound-assignment form

  #define SETBIT(a,b) a[(b)/8] |= (uint8_t)(1u << ((b)&7))

draws a -Wconversion warning from gcc 5.3.0, so the element is read, OR-ed and
written back with an explicit cast instead; a compiler that merges identical
subexpressions should generate the same code. Both arguments are parenthesized
so that an expression such as "base + n" may be passed for the map, and both
are evaluated more than once. */

#define SETBIT(a,b) ((a)[(b)/8] = (uint8_t)((a)[(b)/8] | (1u << ((b)&7))))
387
Elliott Hughes4e19c8e2022-04-15 15:11:02 -0700388/* Values and flags for the unsigned xxcuflags variables that accompany xxcu
389variables, which are concerned with first and required code units. A value
390greater than or equal to REQ_NONE means "no code unit set"; otherwise the
391matching xxcu variable is set, and the low valued bits are relevant. */
Elliott Hughes5b808042021-10-01 10:56:10 -0700392
Elliott Hughes4e19c8e2022-04-15 15:11:02 -0700393#define REQ_UNSET 0xffffffffu /* Not yet found anything */
394#define REQ_NONE 0xfffffffeu /* Found not fixed character */
395#define REQ_CASELESS 0x00000001u /* Code unit in xxcu is caseless */
396#define REQ_VARY 0x00000002u /* Code unit is followed by non-literal */
Elliott Hughes5b808042021-10-01 10:56:10 -0700397
398/* These flags are used in the groupinfo vector. */
399
400#define GI_SET_FIXED_LENGTH 0x80000000u
401#define GI_NOT_FIXED_LENGTH 0x40000000u
402#define GI_FIXED_LENGTH_MASK 0x0000ffffu
403
/* Decimal digit test. It relies only on CHAR_0 through CHAR_9 being
contiguous, so it is valid for ASCII/Unicode and for EBCDIC alike, and a good
compiler can reduce it to a subtraction and one unsigned comparison. The
argument is evaluated twice. */

#define IS_DIGIT(x) (!((x) < CHAR_0 || (x) > CHAR_9))
409
/* Table to identify hex digits. The tables in chartables are dependent on the
locale, and may mark arbitrary characters as digits. We want to recognize only
0-9, a-f, and A-F as hex digits, which is why we have a private table here. It
costs 256 bytes, but it is a lot faster than doing character value tests (at
least in some simple cases I timed), and in some applications one wants PCRE2
to compile efficiently as well as match efficiently. The value in the table is
the binary hex digit value, or 0xff for non-hex digits. */
417
418/* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
419UTF-8 mode. */
420
#ifndef EBCDIC

/* ASCII layout: sixteen entries per row, each row labelled with the range of
character codes it covers. Only '0'-'9', 'A'-'F' and 'a'-'f' have a value
other than 0xff. */

static const uint8_t xdigitab[] =
  {
  /* 0x00-0x0f */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x10-0x1f */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x20-0x2f  space to '/' */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x30-0x3f  '0'-'9', then ':' to '?' */
  0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x40-0x4f  '@', 'A'-'F', then 'G' to 'O' */
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x50-0x5f  'P' to '_' */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x60-0x6f  '`', 'a'-'f', then 'g' to 'o' */
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x70-0x7f  'p' to 127 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x80-0x8f */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x90-0x9f */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xa0-0xaf */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xb0-0xbf */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xc0-0xcf */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xd0-0xdf */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xe0-0xef */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xf0-0xff */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
  };

#else

/* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode.
Same row layout as above; here 'a'-'f' start at 0x81, 'A'-'F' at 0xc1 and
'0'-'9' at 0xf0. */

static const uint8_t xdigitab[] =
  {
  /* 0x00-0x0f */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x10-0x1f */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x20-0x2f */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x30-0x3f */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x40-0x4f */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x50-0x5f */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x60-0x6f */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x70-0x7f */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x80-0x8f  'a'-'f' at 0x81-0x86 */
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0x90-0x9f */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xa0-0xaf */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xb0-0xbf */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xc0-0xcf  'A'-'F' at 0xc1-0xc6 */
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xd0-0xdf */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xe0-0xef */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
  /* 0xf0-0xff  '0'-'9' at 0xf0-0xf9 */
  0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff
  };
#endif  /* EBCDIC */
496
497
498/* Table for handling alphanumeric escaped characters. Positive returns are
499simple data values; negative values are for special things like \d and so on.
500Zero means further processing is needed (for things like \x), or the escape is
501invalid. */
502
503/* This is the "normal" table for ASCII systems or for EBCDIC systems running
504in UTF-8 mode. It runs from '0' to 'z'. */
505
506#ifndef EBCDIC
507#define ESCAPES_FIRST CHAR_0
508#define ESCAPES_LAST CHAR_z
509#define UPPER_CASE(c) (c-32)
510
511static const short int escapes[] = {
512 0, 0,
513 0, 0,
514 0, 0,
515 0, 0,
516 0, 0,
517 CHAR_COLON, CHAR_SEMICOLON,
518 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN,
519 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK,
520 CHAR_COMMERCIAL_AT, -ESC_A,
521 -ESC_B, -ESC_C,
522 -ESC_D, -ESC_E,
523 0, -ESC_G,
524 -ESC_H, 0,
525 0, -ESC_K,
526 0, 0,
527 -ESC_N, 0,
528 -ESC_P, -ESC_Q,
529 -ESC_R, -ESC_S,
530 0, 0,
531 -ESC_V, -ESC_W,
532 -ESC_X, 0,
533 -ESC_Z, CHAR_LEFT_SQUARE_BRACKET,
534 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
535 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE,
536 CHAR_GRAVE_ACCENT, CHAR_BEL,
537 -ESC_b, 0,
538 -ESC_d, CHAR_ESC,
539 CHAR_FF, 0,
540 -ESC_h, 0,
541 0, -ESC_k,
542 0, 0,
543 CHAR_LF, 0,
544 -ESC_p, 0,
545 CHAR_CR, -ESC_s,
546 CHAR_HT, 0,
547 -ESC_v, -ESC_w,
548 0, 0,
549 -ESC_z
550};
551
552#else
553
554/* This is the "abnormal" table for EBCDIC systems without UTF-8 support.
555It runs from 'a' to '9'. For some minimal testing of EBCDIC features, the code
556is sometimes compiled on an ASCII system. In this case, we must not use CHAR_a
557because it is defined as 'a', which of course picks up the ASCII value. */
558
559#if 'a' == 0x81 /* Check for a real EBCDIC environment */
560#define ESCAPES_FIRST CHAR_a
561#define ESCAPES_LAST CHAR_9
562#define UPPER_CASE(c) (c+64)
563#else /* Testing in an ASCII environment */
564#define ESCAPES_FIRST ((unsigned char)'\x81') /* EBCDIC 'a' */
565#define ESCAPES_LAST ((unsigned char)'\xf9') /* EBCDIC '9' */
566#define UPPER_CASE(c) (c-32)
567#endif
568
569static const short int escapes[] = {
570/* 80 */ CHAR_BEL, -ESC_b, 0, -ESC_d, CHAR_ESC, CHAR_FF, 0,
571/* 88 */ -ESC_h, 0, 0, '{', 0, 0, 0, 0,
572/* 90 */ 0, 0, -ESC_k, 0, 0, CHAR_LF, 0, -ESC_p,
573/* 98 */ 0, CHAR_CR, 0, '}', 0, 0, 0, 0,
574/* A0 */ 0, '~', -ESC_s, CHAR_HT, 0, -ESC_v, -ESC_w, 0,
575/* A8 */ 0, -ESC_z, 0, 0, 0, '[', 0, 0,
576/* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
577/* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
578/* C0 */ '{', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G,
579/* C8 */ -ESC_H, 0, 0, 0, 0, 0, 0, 0,
580/* D0 */ '}', 0, -ESC_K, 0, 0, -ESC_N, 0, -ESC_P,
581/* D8 */ -ESC_Q, -ESC_R, 0, 0, 0, 0, 0, 0,
582/* E0 */ '\\', 0, -ESC_S, 0, 0, -ESC_V, -ESC_W, -ESC_X,
583/* E8 */ 0, -ESC_Z, 0, 0, 0, 0, 0, 0,
584/* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
585/* F8 */ 0, 0
586};
587
588/* We also need a table of characters that may follow \c in an EBCDIC
589environment for characters 0-31. */
590
591static unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_";
592
593#endif /* EBCDIC */
594
595
596/* Table of special "verbs" like (*PRUNE). This is a short table, so it is
597searched linearly. Put all the names into a single string, in order to reduce
598the number of relocations when a shared library is dynamically linked. The
599string is built from string macros so that it works in UTF-8 mode on EBCDIC
600platforms. */
601
602typedef struct verbitem {
603 unsigned int len; /* Length of verb name */
604 uint32_t meta; /* Base META_ code */
605 int has_arg; /* Argument requirement */
606} verbitem;
607
608static const char verbnames[] =
609 "\0" /* Empty name is a shorthand for MARK */
610 STRING_MARK0
611 STRING_ACCEPT0
612 STRING_F0
613 STRING_FAIL0
614 STRING_COMMIT0
615 STRING_PRUNE0
616 STRING_SKIP0
617 STRING_THEN;
618
619static const verbitem verbs[] = {
620 { 0, META_MARK, +1 }, /* > 0 => must have an argument */
621 { 4, META_MARK, +1 },
622 { 6, META_ACCEPT, -1 }, /* < 0 => Optional argument, convert to pre-MARK */
623 { 1, META_FAIL, -1 },
624 { 4, META_FAIL, -1 },
625 { 6, META_COMMIT, 0 },
626 { 5, META_PRUNE, 0 }, /* Optional argument; bump META code if found */
627 { 4, META_SKIP, 0 },
628 { 4, META_THEN, 0 }
629};
630
631static const int verbcount = sizeof(verbs)/sizeof(verbitem);
632
633/* Verb opcodes, indexed by their META code offset from META_MARK. */
634
635static const uint32_t verbops[] = {
636 OP_MARK, OP_ACCEPT, OP_FAIL, OP_COMMIT, OP_COMMIT_ARG, OP_PRUNE,
637 OP_PRUNE_ARG, OP_SKIP, OP_SKIP_ARG, OP_THEN, OP_THEN_ARG };
638
639/* Table of "alpha assertions" like (*pla:...), similar to the (*VERB) table. */
640
641typedef struct alasitem {
642 unsigned int len; /* Length of name */
643 uint32_t meta; /* Base META_ code */
644} alasitem;
645
646static const char alasnames[] =
647 STRING_pla0
648 STRING_plb0
649 STRING_napla0
650 STRING_naplb0
651 STRING_nla0
652 STRING_nlb0
653 STRING_positive_lookahead0
654 STRING_positive_lookbehind0
655 STRING_non_atomic_positive_lookahead0
656 STRING_non_atomic_positive_lookbehind0
657 STRING_negative_lookahead0
658 STRING_negative_lookbehind0
659 STRING_atomic0
660 STRING_sr0
661 STRING_asr0
662 STRING_script_run0
663 STRING_atomic_script_run;
664
665static const alasitem alasmeta[] = {
666 { 3, META_LOOKAHEAD },
667 { 3, META_LOOKBEHIND },
668 { 5, META_LOOKAHEAD_NA },
669 { 5, META_LOOKBEHIND_NA },
670 { 3, META_LOOKAHEADNOT },
671 { 3, META_LOOKBEHINDNOT },
672 { 18, META_LOOKAHEAD },
673 { 19, META_LOOKBEHIND },
674 { 29, META_LOOKAHEAD_NA },
675 { 30, META_LOOKBEHIND_NA },
676 { 18, META_LOOKAHEADNOT },
677 { 19, META_LOOKBEHINDNOT },
678 { 6, META_ATOMIC },
679 { 2, META_SCRIPT_RUN }, /* sr = script run */
680 { 3, META_ATOMIC_SCRIPT_RUN }, /* asr = atomic script run */
681 { 10, META_SCRIPT_RUN }, /* script run */
682 { 17, META_ATOMIC_SCRIPT_RUN } /* atomic script run */
683};
684
685static const int alascount = sizeof(alasmeta)/sizeof(alasitem);
686
687/* Offsets from OP_STAR for case-independent and negative repeat opcodes. */
688
689static uint32_t chartypeoffset[] = {
690 OP_STAR - OP_STAR, OP_STARI - OP_STAR,
691 OP_NOTSTAR - OP_STAR, OP_NOTSTARI - OP_STAR };
692
693/* Tables of names of POSIX character classes and their lengths. The names are
694now all in a single string, to reduce the number of relocations when a shared
695library is dynamically loaded. The list of lengths is terminated by a zero
696length entry. The first three must be alpha, lower, upper, as this is assumed
697for handling case independence. The indices for graph, print, and punct are
698needed, so identify them. */
699
700static const char posix_names[] =
701 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
702 STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
703 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
704 STRING_word0 STRING_xdigit;
705
/* Length of each POSIX class name, in the same order as the names, with a
zero entry marking the end of the list. */

static const uint8_t posix_name_lengths[] = {
  5,    /*  0 alpha  */
  5,    /*  1 lower  */
  5,    /*  2 upper  */
  5,    /*  3 alnum  */
  5,    /*  4 ascii  */
  5,    /*  5 blank  */
  5,    /*  6 cntrl  */
  5,    /*  7 digit  */
  5,    /*  8 graph  */
  5,    /*  9 print  */
  5,    /* 10 punct  */
  5,    /* 11 space  */
  4,    /* 12 word   */
  6,    /* 13 xdigit */
  0     /* terminator */
};

/* Positions of graph, print and punct in the lists above. */

#define PC_GRAPH   8
#define PC_PRINT   9
#define PC_PUNCT  10
712
713/* Table of class bit maps for each POSIX class. Each class is formed from a
714base map, with an optional addition or removal of another map. Then, for some
715classes, there is some additional tweaking: for [:blank:] the vertical space
716characters are removed, and for [:alpha:] and [:alnum:] the underscore
717character is removed. The triples in the table consist of the base map offset,
718second map offset or -1 if no second map, and a non-negative value for map
719addition or a negative value for map subtraction (if there are two maps). The
720absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
721remove vertical space characters, 2 => remove underscore. */
722
723static const int posix_class_maps[] = {
724 cbit_word, cbit_digit, -2, /* alpha */
725 cbit_lower, -1, 0, /* lower */
726 cbit_upper, -1, 0, /* upper */
727 cbit_word, -1, 2, /* alnum - word without underscore */
728 cbit_print, cbit_cntrl, 0, /* ascii */
729 cbit_space, -1, 1, /* blank - a GNU extension */
730 cbit_cntrl, -1, 0, /* cntrl */
731 cbit_digit, -1, 0, /* digit */
732 cbit_graph, -1, 0, /* graph */
733 cbit_print, -1, 0, /* print */
734 cbit_punct, -1, 0, /* punct */
735 cbit_space, -1, 0, /* space */
736 cbit_word, -1, 0, /* word - a Perl extension */
737 cbit_xdigit,-1, 0 /* xdigit */
738};
739
#ifdef SUPPORT_UNICODE

/* The POSIX class Unicode property substitutes that are used in UCP mode must
be in the order of the POSIX class names, defined above. There are two values
per class; a first value of -1 marks a class that has no property substitute. */

static int posix_substitutes[] = {
  PT_GC,      ucp_L,     /* alpha */
  PT_PC,      ucp_Ll,    /* lower */
  PT_PC,      ucp_Lu,    /* upper */
  PT_ALNUM,   0,         /* alnum */
  -1,         0,         /* ascii, treat as non-UCP */
  -1,         1,         /* blank, treat as \h */
  PT_PC,      ucp_Cc,    /* cntrl */
  PT_PC,      ucp_Nd,    /* digit */
  PT_PXGRAPH, 0,         /* graph */
  PT_PXPRINT, 0,         /* print */
  PT_PXPUNCT, 0,         /* punct */
  PT_PXSPACE, 0,         /* space: Xps is POSIX space, but from 8.34 Perl and
                            POSIX space are the same */
  PT_WORD,    0,         /* word */
  -1,         0          /* xdigit, treat as non-UCP */
};

/* Number of classes in the table (two entries each). The divisor uses the
array's own element size rather than naming a type, so that it stays correct
whatever the element type is. */

#define POSIX_SUBSIZE \
  (sizeof(posix_substitutes) / (2*sizeof(posix_substitutes[0])))
#endif  /* SUPPORT_UNICODE */
763
764/* Masks for checking option settings. When PCRE2_LITERAL is set, only a subset
765are allowed. */
766
/* Options that remain valid when PCRE2_LITERAL is set. */

#define PUBLIC_LITERAL_COMPILE_OPTIONS \
  (PCRE2_ANCHORED          | \
   PCRE2_AUTO_CALLOUT      | \
   PCRE2_CASELESS          | \
   PCRE2_ENDANCHORED       | \
   PCRE2_FIRSTLINE         | \
   PCRE2_LITERAL           | \
   PCRE2_MATCH_INVALID_UTF | \
   PCRE2_NO_START_OPTIMIZE | \
   PCRE2_NO_UTF_CHECK      | \
   PCRE2_USE_OFFSET_LIMIT  | \
   PCRE2_UTF)

/* The full set of options: the literal subset plus everything else. */

#define PUBLIC_COMPILE_OPTIONS \
  (PUBLIC_LITERAL_COMPILE_OPTIONS | \
   PCRE2_ALLOW_EMPTY_CLASS        | \
   PCRE2_ALT_BSUX                 | \
   PCRE2_ALT_CIRCUMFLEX           | \
   PCRE2_ALT_VERBNAMES            | \
   PCRE2_DOLLAR_ENDONLY           | \
   PCRE2_DOTALL                   | \
   PCRE2_DUPNAMES                 | \
   PCRE2_EXTENDED                 | \
   PCRE2_EXTENDED_MORE            | \
   PCRE2_MATCH_UNSET_BACKREF      | \
   PCRE2_MULTILINE                | \
   PCRE2_NEVER_BACKSLASH_C        | \
   PCRE2_NEVER_UCP                | \
   PCRE2_NEVER_UTF                | \
   PCRE2_NO_AUTO_CAPTURE          | \
   PCRE2_NO_AUTO_POSSESS          | \
   PCRE2_NO_DOTSTAR_ANCHOR        | \
   PCRE2_UCP                      | \
   PCRE2_UNGREEDY)

/* The same split for the extra options. */

#define PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS \
  (PCRE2_EXTRA_MATCH_LINE | \
   PCRE2_EXTRA_MATCH_WORD)

#define PUBLIC_COMPILE_EXTRA_OPTIONS \
  (PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS | \
   PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES  | \
   PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL    | \
   PCRE2_EXTRA_ESCAPED_CR_IS_LF         | \
   PCRE2_EXTRA_ALT_BSUX                 | \
   PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK)
789
790/* Compile time error code numbers. They are given names so that they can more
791easily be tracked. When a new number is added, the tables called eint1 and
792eint2 in pcre2posix.c may need to be updated, and a new error text must be
793added to compile_error_texts in pcre2_error.c. Also, the error codes in
794pcre2.h.in must be updated - their values are exactly 100 greater than these
795values. */
796
797enum { ERR0 = COMPILE_ERROR_BASE,
798 ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9, ERR10,
799 ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19, ERR20,
800 ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29, ERR30,
801 ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39, ERR40,
802 ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, ERR50,
803 ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60,
804 ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70,
805 ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80,
806 ERR81, ERR82, ERR83, ERR84, ERR85, ERR86, ERR87, ERR88, ERR89, ERR90,
807 ERR91, ERR92, ERR93, ERR94, ERR95, ERR96, ERR97, ERR98, ERR99 };
808
809/* This is a table of start-of-pattern options such as (*UTF) and settings such
810as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward
811compatibility, (*UTFn) is supported in the relevant libraries, but (*UTF) is
812generic and always supported. */
813
/* The kinds of start-of-pattern item. The values are spelled out, but are the
same consecutive values, starting at zero, that the compiler would assign. */

enum {
  PSO_OPT  = 0,   /* Value is an option bit */
  PSO_FLG  = 1,   /* Value is a flag bit */
  PSO_NL   = 2,   /* Value is a newline type */
  PSO_BSR  = 3,   /* Value is a \R type */
  PSO_LIMH = 4,   /* Read integer value for heap limit */
  PSO_LIMM = 5,   /* Read integer value for match limit */
  PSO_LIMD = 6    /* Read integer value for depth limit */
};
821
/* One entry in the table of start-of-pattern items. The member order matters
because the table is initialized positionally. */

struct pso {
  const uint8_t *name;    /* Text of the item */
  uint16_t length;        /* Length of that text */
  uint16_t type;          /* One of the PSO_xxx values */
  uint32_t value;         /* Interpreted according to the type */
};

typedef struct pso pso;
828
829/* NB: STRING_UTFn_RIGHTPAR contains the length as well */
830
831static pso pso_list[] = {
832 { (uint8_t *)STRING_UTFn_RIGHTPAR, PSO_OPT, PCRE2_UTF },
833 { (uint8_t *)STRING_UTF_RIGHTPAR, 4, PSO_OPT, PCRE2_UTF },
834 { (uint8_t *)STRING_UCP_RIGHTPAR, 4, PSO_OPT, PCRE2_UCP },
835 { (uint8_t *)STRING_NOTEMPTY_RIGHTPAR, 9, PSO_FLG, PCRE2_NOTEMPTY_SET },
836 { (uint8_t *)STRING_NOTEMPTY_ATSTART_RIGHTPAR, 17, PSO_FLG, PCRE2_NE_ATST_SET },
837 { (uint8_t *)STRING_NO_AUTO_POSSESS_RIGHTPAR, 16, PSO_OPT, PCRE2_NO_AUTO_POSSESS },
838 { (uint8_t *)STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR, 18, PSO_OPT, PCRE2_NO_DOTSTAR_ANCHOR },
839 { (uint8_t *)STRING_NO_JIT_RIGHTPAR, 7, PSO_FLG, PCRE2_NOJIT },
840 { (uint8_t *)STRING_NO_START_OPT_RIGHTPAR, 13, PSO_OPT, PCRE2_NO_START_OPTIMIZE },
841 { (uint8_t *)STRING_LIMIT_HEAP_EQ, 11, PSO_LIMH, 0 },
842 { (uint8_t *)STRING_LIMIT_MATCH_EQ, 12, PSO_LIMM, 0 },
843 { (uint8_t *)STRING_LIMIT_DEPTH_EQ, 12, PSO_LIMD, 0 },
844 { (uint8_t *)STRING_LIMIT_RECURSION_EQ, 16, PSO_LIMD, 0 },
845 { (uint8_t *)STRING_CR_RIGHTPAR, 3, PSO_NL, PCRE2_NEWLINE_CR },
846 { (uint8_t *)STRING_LF_RIGHTPAR, 3, PSO_NL, PCRE2_NEWLINE_LF },
847 { (uint8_t *)STRING_CRLF_RIGHTPAR, 5, PSO_NL, PCRE2_NEWLINE_CRLF },
848 { (uint8_t *)STRING_ANY_RIGHTPAR, 4, PSO_NL, PCRE2_NEWLINE_ANY },
849 { (uint8_t *)STRING_NUL_RIGHTPAR, 4, PSO_NL, PCRE2_NEWLINE_NUL },
850 { (uint8_t *)STRING_ANYCRLF_RIGHTPAR, 8, PSO_NL, PCRE2_NEWLINE_ANYCRLF },
851 { (uint8_t *)STRING_BSR_ANYCRLF_RIGHTPAR, 12, PSO_BSR, PCRE2_BSR_ANYCRLF },
852 { (uint8_t *)STRING_BSR_UNICODE_RIGHTPAR, 12, PSO_BSR, PCRE2_BSR_UNICODE }
853};
854
855/* This table is used when converting repeating opcodes into possessified
856versions as a result of an explicit possessive quantifier such as ++. A zero
857value means there is no possessified version - in those cases the item in
858question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
859because all relevant opcodes are less than that. */
860
861static const uint8_t opcode_possessify[] = {
862 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 15 */
863 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16 - 31 */
864
865 0, /* NOTI */
866 OP_POSSTAR, 0, /* STAR, MINSTAR */
867 OP_POSPLUS, 0, /* PLUS, MINPLUS */
868 OP_POSQUERY, 0, /* QUERY, MINQUERY */
869 OP_POSUPTO, 0, /* UPTO, MINUPTO */
870 0, /* EXACT */
871 0, 0, 0, 0, /* POS{STAR,PLUS,QUERY,UPTO} */
872
873 OP_POSSTARI, 0, /* STARI, MINSTARI */
874 OP_POSPLUSI, 0, /* PLUSI, MINPLUSI */
875 OP_POSQUERYI, 0, /* QUERYI, MINQUERYI */
876 OP_POSUPTOI, 0, /* UPTOI, MINUPTOI */
877 0, /* EXACTI */
878 0, 0, 0, 0, /* POS{STARI,PLUSI,QUERYI,UPTOI} */
879
880 OP_NOTPOSSTAR, 0, /* NOTSTAR, NOTMINSTAR */
881 OP_NOTPOSPLUS, 0, /* NOTPLUS, NOTMINPLUS */
882 OP_NOTPOSQUERY, 0, /* NOTQUERY, NOTMINQUERY */
883 OP_NOTPOSUPTO, 0, /* NOTUPTO, NOTMINUPTO */
884 0, /* NOTEXACT */
885 0, 0, 0, 0, /* NOTPOS{STAR,PLUS,QUERY,UPTO} */
886
887 OP_NOTPOSSTARI, 0, /* NOTSTARI, NOTMINSTARI */
888 OP_NOTPOSPLUSI, 0, /* NOTPLUSI, NOTMINPLUSI */
889 OP_NOTPOSQUERYI, 0, /* NOTQUERYI, NOTMINQUERYI */
890 OP_NOTPOSUPTOI, 0, /* NOTUPTOI, NOTMINUPTOI */
891 0, /* NOTEXACTI */
892 0, 0, 0, 0, /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */
893
894 OP_TYPEPOSSTAR, 0, /* TYPESTAR, TYPEMINSTAR */
895 OP_TYPEPOSPLUS, 0, /* TYPEPLUS, TYPEMINPLUS */
896 OP_TYPEPOSQUERY, 0, /* TYPEQUERY, TYPEMINQUERY */
897 OP_TYPEPOSUPTO, 0, /* TYPEUPTO, TYPEMINUPTO */
898 0, /* TYPEEXACT */
899 0, 0, 0, 0, /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */
900
901 OP_CRPOSSTAR, 0, /* CRSTAR, CRMINSTAR */
902 OP_CRPOSPLUS, 0, /* CRPLUS, CRMINPLUS */
903 OP_CRPOSQUERY, 0, /* CRQUERY, CRMINQUERY */
904 OP_CRPOSRANGE, 0, /* CRRANGE, CRMINRANGE */
905 0, 0, 0, 0, /* CRPOS{STAR,PLUS,QUERY,RANGE} */
906
907 0, 0, 0, /* CLASS, NCLASS, XCLASS */
908 0, 0, /* REF, REFI */
909 0, 0, /* DNREF, DNREFI */
910 0, 0 /* RECURSE, CALLOUT */
911};
912
913
914#ifdef DEBUG_SHOW_PARSED
915/*************************************************
916* Show the parsed pattern for debugging *
917*************************************************/
918
919/* For debugging the pre-scan, this code, which outputs the parsed data vector,
920can be enabled. */
921
922static void show_parsed(compile_block *cb)
923{
924uint32_t *pptr = cb->parsed_pattern;
925
926for (;;)
927 {
928 int max, min;
929 PCRE2_SIZE offset;
930 uint32_t i;
931 uint32_t length;
932 uint32_t meta_arg = META_DATA(*pptr);
933
934 fprintf(stderr, "+++ %02d %.8x ", (int)(pptr - cb->parsed_pattern), *pptr);
935
936 if (*pptr < META_END)
937 {
938 if (*pptr > 32 && *pptr < 128) fprintf(stderr, "%c", *pptr);
939 pptr++;
940 }
941
942 else switch (META_CODE(*pptr++))
943 {
944 default:
945 fprintf(stderr, "**** OOPS - unknown META value - giving up ****\n");
946 return;
947
948 case META_END:
949 fprintf(stderr, "META_END\n");
950 return;
951
952 case META_CAPTURE:
953 fprintf(stderr, "META_CAPTURE %d", meta_arg);
954 break;
955
956 case META_RECURSE:
957 GETOFFSET(offset, pptr);
958 fprintf(stderr, "META_RECURSE %d %zd", meta_arg, offset);
959 break;
960
961 case META_BACKREF:
962 if (meta_arg < 10)
963 offset = cb->small_ref_offset[meta_arg];
964 else
965 GETOFFSET(offset, pptr);
966 fprintf(stderr, "META_BACKREF %d %zd", meta_arg, offset);
967 break;
968
969 case META_ESCAPE:
970 if (meta_arg == ESC_P || meta_arg == ESC_p)
971 {
972 uint32_t ptype = *pptr >> 16;
973 uint32_t pvalue = *pptr++ & 0xffff;
974 fprintf(stderr, "META \\%c %d %d", (meta_arg == ESC_P)? 'P':'p',
975 ptype, pvalue);
976 }
977 else
978 {
979 uint32_t cc;
980 /* There's just one escape we might have here that isn't negated in the
981 escapes table. */
982 if (meta_arg == ESC_g) cc = CHAR_g;
983 else for (cc = ESCAPES_FIRST; cc <= ESCAPES_LAST; cc++)
984 {
985 if (meta_arg == (uint32_t)(-escapes[cc - ESCAPES_FIRST])) break;
986 }
987 if (cc > ESCAPES_LAST) cc = CHAR_QUESTION_MARK;
988 fprintf(stderr, "META \\%c", cc);
989 }
990 break;
991
992 case META_MINMAX:
993 min = *pptr++;
994 max = *pptr++;
995 if (max != REPEAT_UNLIMITED)
996 fprintf(stderr, "META {%d,%d}", min, max);
997 else
998 fprintf(stderr, "META {%d,}", min);
999 break;
1000
1001 case META_MINMAX_QUERY:
1002 min = *pptr++;
1003 max = *pptr++;
1004 if (max != REPEAT_UNLIMITED)
1005 fprintf(stderr, "META {%d,%d}?", min, max);
1006 else
1007 fprintf(stderr, "META {%d,}?", min);
1008 break;
1009
1010 case META_MINMAX_PLUS:
1011 min = *pptr++;
1012 max = *pptr++;
1013 if (max != REPEAT_UNLIMITED)
1014 fprintf(stderr, "META {%d,%d}+", min, max);
1015 else
1016 fprintf(stderr, "META {%d,}+", min);
1017 break;
1018
1019 case META_BIGVALUE: fprintf(stderr, "META_BIGVALUE %.8x", *pptr++); break;
1020 case META_CIRCUMFLEX: fprintf(stderr, "META_CIRCUMFLEX"); break;
1021 case META_COND_ASSERT: fprintf(stderr, "META_COND_ASSERT"); break;
1022 case META_DOLLAR: fprintf(stderr, "META_DOLLAR"); break;
1023 case META_DOT: fprintf(stderr, "META_DOT"); break;
1024 case META_ASTERISK: fprintf(stderr, "META *"); break;
1025 case META_ASTERISK_QUERY: fprintf(stderr, "META *?"); break;
1026 case META_ASTERISK_PLUS: fprintf(stderr, "META *+"); break;
1027 case META_PLUS: fprintf(stderr, "META +"); break;
1028 case META_PLUS_QUERY: fprintf(stderr, "META +?"); break;
1029 case META_PLUS_PLUS: fprintf(stderr, "META ++"); break;
1030 case META_QUERY: fprintf(stderr, "META ?"); break;
1031 case META_QUERY_QUERY: fprintf(stderr, "META ??"); break;
1032 case META_QUERY_PLUS: fprintf(stderr, "META ?+"); break;
1033
1034 case META_ATOMIC: fprintf(stderr, "META (?>"); break;
1035 case META_NOCAPTURE: fprintf(stderr, "META (?:"); break;
1036 case META_LOOKAHEAD: fprintf(stderr, "META (?="); break;
1037 case META_LOOKAHEADNOT: fprintf(stderr, "META (?!"); break;
1038 case META_LOOKAHEAD_NA: fprintf(stderr, "META (*napla:"); break;
1039 case META_SCRIPT_RUN: fprintf(stderr, "META (*sr:"); break;
1040 case META_KET: fprintf(stderr, "META )"); break;
1041 case META_ALT: fprintf(stderr, "META | %d", meta_arg); break;
1042
1043 case META_CLASS: fprintf(stderr, "META ["); break;
1044 case META_CLASS_NOT: fprintf(stderr, "META [^"); break;
1045 case META_CLASS_END: fprintf(stderr, "META ]"); break;
1046 case META_CLASS_EMPTY: fprintf(stderr, "META []"); break;
1047 case META_CLASS_EMPTY_NOT: fprintf(stderr, "META [^]"); break;
1048
1049 case META_RANGE_LITERAL: fprintf(stderr, "META - (literal)"); break;
1050 case META_RANGE_ESCAPED: fprintf(stderr, "META - (escaped)"); break;
1051
1052 case META_POSIX: fprintf(stderr, "META_POSIX %d", *pptr++); break;
1053 case META_POSIX_NEG: fprintf(stderr, "META_POSIX_NEG %d", *pptr++); break;
1054
1055 case META_ACCEPT: fprintf(stderr, "META (*ACCEPT)"); break;
1056 case META_FAIL: fprintf(stderr, "META (*FAIL)"); break;
1057 case META_COMMIT: fprintf(stderr, "META (*COMMIT)"); break;
1058 case META_PRUNE: fprintf(stderr, "META (*PRUNE)"); break;
1059 case META_SKIP: fprintf(stderr, "META (*SKIP)"); break;
1060 case META_THEN: fprintf(stderr, "META (*THEN)"); break;
1061
1062 case META_OPTIONS: fprintf(stderr, "META_OPTIONS 0x%02x", *pptr++); break;
1063
1064 case META_LOOKBEHIND:
1065 fprintf(stderr, "META (?<= %d offset=", meta_arg);
1066 GETOFFSET(offset, pptr);
1067 fprintf(stderr, "%zd", offset);
1068 break;
1069
1070 case META_LOOKBEHIND_NA:
1071 fprintf(stderr, "META (*naplb: %d offset=", meta_arg);
1072 GETOFFSET(offset, pptr);
1073 fprintf(stderr, "%zd", offset);
1074 break;
1075
1076 case META_LOOKBEHINDNOT:
1077 fprintf(stderr, "META (?<! %d offset=", meta_arg);
1078 GETOFFSET(offset, pptr);
1079 fprintf(stderr, "%zd", offset);
1080 break;
1081
1082 case META_CALLOUT_NUMBER:
1083 fprintf(stderr, "META (?C%d) next=%d/%d", pptr[2], pptr[0],
1084 pptr[1]);
1085 pptr += 3;
1086 break;
1087
1088 case META_CALLOUT_STRING:
1089 {
1090 uint32_t patoffset = *pptr++; /* Offset of next pattern item */
1091 uint32_t patlength = *pptr++; /* Length of next pattern item */
1092 fprintf(stderr, "META (?Cstring) length=%d offset=", *pptr++);
1093 GETOFFSET(offset, pptr);
1094 fprintf(stderr, "%zd next=%d/%d", offset, patoffset, patlength);
1095 }
1096 break;
1097
1098 case META_RECURSE_BYNAME:
1099 fprintf(stderr, "META (?(&name) length=%d offset=", *pptr++);
1100 GETOFFSET(offset, pptr);
1101 fprintf(stderr, "%zd", offset);
1102 break;
1103
1104 case META_BACKREF_BYNAME:
1105 fprintf(stderr, "META_BACKREF_BYNAME length=%d offset=", *pptr++);
1106 GETOFFSET(offset, pptr);
1107 fprintf(stderr, "%zd", offset);
1108 break;
1109
1110 case META_COND_NUMBER:
1111 fprintf(stderr, "META_COND_NUMBER %d offset=", pptr[SIZEOFFSET]);
1112 GETOFFSET(offset, pptr);
1113 fprintf(stderr, "%zd", offset);
1114 pptr++;
1115 break;
1116
1117 case META_COND_DEFINE:
1118 fprintf(stderr, "META (?(DEFINE) offset=");
1119 GETOFFSET(offset, pptr);
1120 fprintf(stderr, "%zd", offset);
1121 break;
1122
1123 case META_COND_VERSION:
1124 fprintf(stderr, "META (?(VERSION%s", (*pptr++ == 0)? "=" : ">=");
1125 fprintf(stderr, "%d.", *pptr++);
1126 fprintf(stderr, "%d)", *pptr++);
1127 break;
1128
1129 case META_COND_NAME:
1130 fprintf(stderr, "META (?(<name>) length=%d offset=", *pptr++);
1131 GETOFFSET(offset, pptr);
1132 fprintf(stderr, "%zd", offset);
1133 break;
1134
1135 case META_COND_RNAME:
1136 fprintf(stderr, "META (?(R&name) length=%d offset=", *pptr++);
1137 GETOFFSET(offset, pptr);
1138 fprintf(stderr, "%zd", offset);
1139 break;
1140
1141 /* This is kept as a name, because it might be. */
1142
1143 case META_COND_RNUMBER:
1144 fprintf(stderr, "META (?(Rnumber) length=%d offset=", *pptr++);
1145 GETOFFSET(offset, pptr);
1146 fprintf(stderr, "%zd", offset);
1147 break;
1148
1149 case META_MARK:
1150 fprintf(stderr, "META (*MARK:");
1151 goto SHOWARG;
1152
1153 case META_COMMIT_ARG:
1154 fprintf(stderr, "META (*COMMIT:");
1155 goto SHOWARG;
1156
1157 case META_PRUNE_ARG:
1158 fprintf(stderr, "META (*PRUNE:");
1159 goto SHOWARG;
1160
1161 case META_SKIP_ARG:
1162 fprintf(stderr, "META (*SKIP:");
1163 goto SHOWARG;
1164
1165 case META_THEN_ARG:
1166 fprintf(stderr, "META (*THEN:");
1167 SHOWARG:
1168 length = *pptr++;
1169 for (i = 0; i < length; i++)
1170 {
1171 uint32_t cc = *pptr++;
1172 if (cc > 32 && cc < 128) fprintf(stderr, "%c", cc);
1173 else fprintf(stderr, "\\x{%x}", cc);
1174 }
1175 fprintf(stderr, ") length=%u", length);
1176 break;
1177 }
1178 fprintf(stderr, "\n");
1179 }
1180return;
1181}
1182#endif /* DEBUG_SHOW_PARSED */
1183
1184
1185
1186/*************************************************
1187* Copy compiled code *
1188*************************************************/
1189
1190/* Compiled JIT code cannot be copied, so the new compiled block has no
1191associated JIT data. */
1192
1193PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
1194pcre2_code_copy(const pcre2_code *code)
1195{
1196PCRE2_SIZE* ref_count;
1197pcre2_code *newcode;
1198
1199if (code == NULL) return NULL;
1200newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
1201if (newcode == NULL) return NULL;
1202memcpy(newcode, code, code->blocksize);
1203newcode->executable_jit = NULL;
1204
1205/* If the code is one that has been deserialized, increment the reference count
1206in the decoded tables. */
1207
1208if ((code->flags & PCRE2_DEREF_TABLES) != 0)
1209 {
1210 ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);
1211 (*ref_count)++;
1212 }
1213
1214return newcode;
1215}
1216
1217
1218
1219/*************************************************
1220* Copy compiled code and character tables *
1221*************************************************/
1222
1223/* Compiled JIT code cannot be copied, so the new compiled block has no
1224associated JIT data. This version of code_copy also makes a separate copy of
1225the character tables. */
1226
1227PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
1228pcre2_code_copy_with_tables(const pcre2_code *code)
1229{
1230PCRE2_SIZE* ref_count;
1231pcre2_code *newcode;
1232uint8_t *newtables;
1233
1234if (code == NULL) return NULL;
1235newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
1236if (newcode == NULL) return NULL;
1237memcpy(newcode, code, code->blocksize);
1238newcode->executable_jit = NULL;
1239
1240newtables = code->memctl.malloc(TABLES_LENGTH + sizeof(PCRE2_SIZE),
1241 code->memctl.memory_data);
1242if (newtables == NULL)
1243 {
1244 code->memctl.free((void *)newcode, code->memctl.memory_data);
1245 return NULL;
1246 }
1247memcpy(newtables, code->tables, TABLES_LENGTH);
1248ref_count = (PCRE2_SIZE *)(newtables + TABLES_LENGTH);
1249*ref_count = 1;
1250
1251newcode->tables = newtables;
1252newcode->flags |= PCRE2_DEREF_TABLES;
1253return newcode;
1254}
1255
1256
1257
1258/*************************************************
1259* Free compiled code *
1260*************************************************/
1261
1262PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
1263pcre2_code_free(pcre2_code *code)
1264{
1265PCRE2_SIZE* ref_count;
1266
1267if (code != NULL)
1268 {
1269 if (code->executable_jit != NULL)
1270 PRIV(jit_free)(code->executable_jit, &code->memctl);
1271
1272 if ((code->flags & PCRE2_DEREF_TABLES) != 0)
1273 {
1274 /* Decoded tables belong to the codes after deserialization, and they must
1275 be freed when there are no more references to them. The *ref_count should
1276 always be > 0. */
1277
1278 ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);
1279 if (*ref_count > 0)
1280 {
1281 (*ref_count)--;
1282 if (*ref_count == 0)
1283 code->memctl.free((void *)code->tables, code->memctl.memory_data);
1284 }
1285 }
1286
1287 code->memctl.free(code, code->memctl.memory_data);
1288 }
1289}
1290
1291
1292
1293/*************************************************
1294* Read a number, possibly signed *
1295*************************************************/
1296
1297/* This function is used to read numbers in the pattern. The initial pointer
1298must be the sign or first digit of the number. When relative values (introduced
1299by + or -) are allowed, they are relative group numbers, and the result must be
1300greater than zero.
1301
1302Arguments:
1303 ptrptr points to the character pointer variable
1304 ptrend points to the end of the input string
1305 allow_sign if < 0, sign not allowed; if >= 0, sign is relative to this
1306 max_value the largest number allowed
1307 max_error the error to give for an over-large number
1308 intptr where to put the result
1309 errcodeptr where to put an error code
1310
1311Returns: TRUE - a number was read
1312 FALSE - errorcode == 0 => no number was found
1313 errorcode != 0 => an error occurred
1314*/
1315
1316static BOOL
1317read_number(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, int32_t allow_sign,
1318 uint32_t max_value, uint32_t max_error, int *intptr, int *errorcodeptr)
1319{
1320int sign = 0;
1321uint32_t n = 0;
1322PCRE2_SPTR ptr = *ptrptr;
1323BOOL yield = FALSE;
1324
1325*errorcodeptr = 0;
1326
1327if (allow_sign >= 0 && ptr < ptrend)
1328 {
1329 if (*ptr == CHAR_PLUS)
1330 {
1331 sign = +1;
1332 max_value -= allow_sign;
1333 ptr++;
1334 }
1335 else if (*ptr == CHAR_MINUS)
1336 {
1337 sign = -1;
1338 ptr++;
1339 }
1340 }
1341
1342if (ptr >= ptrend || !IS_DIGIT(*ptr)) return FALSE;
1343while (ptr < ptrend && IS_DIGIT(*ptr))
1344 {
1345 n = n * 10 + *ptr++ - CHAR_0;
1346 if (n > max_value)
1347 {
1348 *errorcodeptr = max_error;
1349 goto EXIT;
1350 }
1351 }
1352
1353if (allow_sign >= 0 && sign != 0)
1354 {
1355 if (n == 0)
1356 {
1357 *errorcodeptr = ERR26; /* +0 and -0 are not allowed */
1358 goto EXIT;
1359 }
1360
1361 if (sign > 0) n += allow_sign;
1362 else if ((int)n > allow_sign)
1363 {
1364 *errorcodeptr = ERR15; /* Non-existent subpattern */
1365 goto EXIT;
1366 }
1367 else n = allow_sign + 1 - n;
1368 }
1369
1370yield = TRUE;
1371
1372EXIT:
1373*intptr = n;
1374*ptrptr = ptr;
1375return yield;
1376}
1377
1378
1379
1380/*************************************************
1381* Read repeat counts *
1382*************************************************/
1383
1384/* Read an item of the form {n,m} and return the values if non-NULL pointers
1385are supplied. Repeat counts must be less than 65536 (MAX_REPEAT_COUNT); a
1386larger value is used for "unlimited". We have to use signed arguments for
1387read_number() because it is capable of returning a signed value.
1388
1389Arguments:
1390 ptrptr points to pointer to character after'{'
1391 ptrend pointer to end of input
1392 minp if not NULL, pointer to int for min
1393 maxp if not NULL, pointer to int for max (-1 if no max)
1394 returned as -1 if no max
1395 errorcodeptr points to error code variable
1396
1397Returns: FALSE if not a repeat quantifier, errorcode set zero
1398 FALSE on error, with errorcode set non-zero
1399 TRUE on success, with pointer updated to point after '}'
1400*/
1401
1402static BOOL
1403read_repeat_counts(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *minp,
1404 uint32_t *maxp, int *errorcodeptr)
1405{
1406PCRE2_SPTR p;
1407BOOL yield = FALSE;
1408BOOL had_comma = FALSE;
1409int32_t min = 0;
1410int32_t max = REPEAT_UNLIMITED; /* This value is larger than MAX_REPEAT_COUNT */
1411
1412/* Check the syntax */
1413
1414*errorcodeptr = 0;
1415for (p = *ptrptr;; p++)
1416 {
1417 uint32_t c;
1418 if (p >= ptrend) return FALSE;
1419 c = *p;
1420 if (IS_DIGIT(c)) continue;
1421 if (c == CHAR_RIGHT_CURLY_BRACKET) break;
1422 if (c == CHAR_COMMA)
1423 {
1424 if (had_comma) return FALSE;
1425 had_comma = TRUE;
1426 }
1427 else return FALSE;
1428 }
1429
1430/* The only error from read_number() is for a number that is too big. */
1431
1432p = *ptrptr;
1433if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &min, errorcodeptr))
1434 goto EXIT;
1435
1436if (*p == CHAR_RIGHT_CURLY_BRACKET)
1437 {
1438 p++;
1439 max = min;
1440 }
1441else
1442 {
1443 if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1444 {
1445 if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &max,
1446 errorcodeptr))
1447 goto EXIT;
1448 if (max < min)
1449 {
1450 *errorcodeptr = ERR4;
1451 goto EXIT;
1452 }
1453 }
1454 p++;
1455 }
1456
1457yield = TRUE;
1458if (minp != NULL) *minp = (uint32_t)min;
1459if (maxp != NULL) *maxp = (uint32_t)max;
1460
1461/* Update the pattern pointer */
1462
1463EXIT:
1464*ptrptr = p;
1465return yield;
1466}
1467
1468
1469
1470/*************************************************
1471* Handle escapes *
1472*************************************************/
1473
1474/* This function is called when a \ has been encountered. It either returns a
1475positive value for a simple escape such as \d, or 0 for a data character, which
1476is placed in chptr. A backreference to group n is returned as negative n. On
1477entry, ptr is pointing at the character after \. On exit, it points after the
1478final code unit of the escape sequence.
1479
1480This function is also called from pcre2_substitute() to handle escape sequences
1481in replacement strings. In this case, the cb argument is NULL, and in the case
1482of escapes that have further processing, only sequences that define a data
1483character are recognised. The isclass argument is not relevant; the options
1484argument is the final value of the compiled pattern's options.
1485
1486Arguments:
1487 ptrptr points to the input position pointer
1488 ptrend points to the end of the input
1489 chptr points to a returned data character
1490 errorcodeptr points to the errorcode variable (containing zero)
  options        the current options bits
  extra_options  the current extra options bits
1492 isclass TRUE if inside a character class
1493 cb compile data block or NULL when called from pcre2_substitute()
1494
1495Returns: zero => a data character
1496 positive => a special escape sequence
1497 negative => a numerical back reference
1498 on error, errorcodeptr is set non-zero
1499*/
1500
1501int
1502PRIV(check_escape)(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *chptr,
1503 int *errorcodeptr, uint32_t options, uint32_t extra_options, BOOL isclass,
1504 compile_block *cb)
1505{
1506BOOL utf = (options & PCRE2_UTF) != 0;
1507PCRE2_SPTR ptr = *ptrptr;
1508uint32_t c, cc;
1509int escape = 0;
1510int i;
1511
1512/* If backslash is at the end of the string, it's an error. */
1513
1514if (ptr >= ptrend)
1515 {
1516 *errorcodeptr = ERR1;
1517 return 0;
1518 }
1519
1520GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
1521*errorcodeptr = 0; /* Be optimistic */
1522
1523/* Non-alphanumerics are literals, so we just leave the value in c. An initial
1524value test saves a memory lookup for code points outside the alphanumeric
1525range. */
1526
1527if (c < ESCAPES_FIRST || c > ESCAPES_LAST) {} /* Definitely literal */
1528
1529/* Otherwise, do a table lookup. Non-zero values need little processing here. A
1530positive value is a literal value for something like \n. A negative value is
1531the negation of one of the ESC_ macros that is passed back for handling by the
1532calling function. Some extra checking is needed for \N because only \N{U+dddd}
1533is supported. If the value is zero, further processing is handled below. */
1534
1535else if ((i = escapes[c - ESCAPES_FIRST]) != 0)
1536 {
1537 if (i > 0)
1538 {
1539 c = (uint32_t)i;
1540 if (c == CHAR_CR && (extra_options & PCRE2_EXTRA_ESCAPED_CR_IS_LF) != 0)
1541 c = CHAR_LF;
1542 }
1543 else /* Negative table entry */
1544 {
1545 escape = -i; /* Else return a special escape */
1546 if (cb != NULL && (escape == ESC_P || escape == ESC_p || escape == ESC_X))
1547 cb->external_flags |= PCRE2_HASBKPORX; /* Note \P, \p, or \X */
1548
1549 /* Perl supports \N{name} for character names and \N{U+dddd} for numerical
1550 Unicode code points, as well as plain \N for "not newline". PCRE does not
1551 support \N{name}. However, it does support quantification such as \N{2,3},
1552 so if \N{ is not followed by U+dddd we check for a quantifier. */
1553
1554 if (escape == ESC_N && ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
1555 {
1556 PCRE2_SPTR p = ptr + 1;
1557
1558 /* \N{U+ can be handled by the \x{ code. However, this construction is
1559 not valid in EBCDIC environments because it specifies a Unicode
1560 character, not a codepoint in the local code. For example \N{U+0041}
1561 must be "A" in all environments. Also, in Perl, \N{U+ forces Unicode
1562 casing semantics for the entire pattern, so allow it only in UTF (i.e.
1563 Unicode) mode. */
1564
1565 if (ptrend - p > 1 && *p == CHAR_U && p[1] == CHAR_PLUS)
1566 {
1567#ifdef EBCDIC
1568 *errorcodeptr = ERR93;
1569#else
1570 if (utf)
1571 {
1572 ptr = p + 1;
1573 escape = 0; /* Not a fancy escape after all */
1574 goto COME_FROM_NU;
1575 }
1576 else *errorcodeptr = ERR93;
1577#endif
1578 }
1579
1580 /* Give an error if what follows is not a quantifier, but don't override
1581 an error set by the quantifier reader (e.g. number overflow). */
1582
1583 else
1584 {
1585 if (!read_repeat_counts(&p, ptrend, NULL, NULL, errorcodeptr) &&
1586 *errorcodeptr == 0)
1587 *errorcodeptr = ERR37;
1588 }
1589 }
1590 }
1591 }
1592
1593/* Escapes that need further processing, including those that are unknown, have
1594a zero entry in the lookup table. When called from pcre2_substitute(), only \c,
1595\o, and \x are recognized (\u and \U can never appear as they are used for case
1596forcing). */
1597
1598else
1599 {
1600 int s;
1601 PCRE2_SPTR oldptr;
1602 BOOL overflow;
1603 BOOL alt_bsux =
1604 ((options & PCRE2_ALT_BSUX) | (extra_options & PCRE2_EXTRA_ALT_BSUX)) != 0;
1605
1606 /* Filter calls from pcre2_substitute(). */
1607
1608 if (cb == NULL)
1609 {
1610 if (c != CHAR_c && c != CHAR_o && c != CHAR_x)
1611 {
1612 *errorcodeptr = ERR3;
1613 return 0;
1614 }
1615 alt_bsux = FALSE; /* Do not modify \x handling */
1616 }
1617
1618 switch (c)
1619 {
1620 /* A number of Perl escapes are not handled by PCRE. We give an explicit
1621 error. */
1622
1623 case CHAR_F:
1624 case CHAR_l:
1625 case CHAR_L:
1626 *errorcodeptr = ERR37;
1627 break;
1628
1629 /* \u is unrecognized when neither PCRE2_ALT_BSUX nor PCRE2_EXTRA_ALT_BSUX
1630 is set. Otherwise, \u must be followed by exactly four hex digits or, if
1631 PCRE2_EXTRA_ALT_BSUX is set, by any number of hex digits in braces.
1632 Otherwise it is a lowercase u letter. This gives some compatibility with
1633 ECMAScript (aka JavaScript). */
1634
1635 case CHAR_u:
1636 if (!alt_bsux) *errorcodeptr = ERR37; else
1637 {
1638 uint32_t xc;
1639
1640 if (ptr >= ptrend) break;
1641 if (*ptr == CHAR_LEFT_CURLY_BRACKET &&
1642 (extra_options & PCRE2_EXTRA_ALT_BSUX) != 0)
1643 {
1644 PCRE2_SPTR hptr = ptr + 1;
1645 cc = 0;
1646
1647 while (hptr < ptrend && (xc = XDIGIT(*hptr)) != 0xff)
1648 {
1649 if ((cc & 0xf0000000) != 0) /* Test for 32-bit overflow */
1650 {
1651 *errorcodeptr = ERR77;
1652 ptr = hptr; /* Show where */
1653 break; /* *hptr != } will cause another break below */
1654 }
1655 cc = (cc << 4) | xc;
1656 hptr++;
1657 }
1658
1659 if (hptr == ptr + 1 || /* No hex digits */
1660 hptr >= ptrend || /* Hit end of input */
1661 *hptr != CHAR_RIGHT_CURLY_BRACKET) /* No } terminator */
1662 break; /* Hex escape not recognized */
1663
1664 c = cc; /* Accept the code point */
1665 ptr = hptr + 1;
1666 }
1667
1668 else /* Must be exactly 4 hex digits */
1669 {
1670 if (ptrend - ptr < 4) break; /* Less than 4 chars */
1671 if ((cc = XDIGIT(ptr[0])) == 0xff) break; /* Not a hex digit */
1672 if ((xc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */
1673 cc = (cc << 4) | xc;
1674 if ((xc = XDIGIT(ptr[2])) == 0xff) break; /* Not a hex digit */
1675 cc = (cc << 4) | xc;
1676 if ((xc = XDIGIT(ptr[3])) == 0xff) break; /* Not a hex digit */
1677 c = (cc << 4) | xc;
1678 ptr += 4;
1679 }
1680
1681 if (utf)
1682 {
1683 if (c > 0x10ffffU) *errorcodeptr = ERR77;
1684 else
1685 if (c >= 0xd800 && c <= 0xdfff &&
1686 (extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
1687 *errorcodeptr = ERR73;
1688 }
1689 else if (c > MAX_NON_UTF_CHAR) *errorcodeptr = ERR77;
1690 }
1691 break;
1692
1693 /* \U is unrecognized unless PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set,
1694 in which case it is an upper case letter. */
1695
1696 case CHAR_U:
1697 if (!alt_bsux) *errorcodeptr = ERR37;
1698 break;
1699
1700 /* In a character class, \g is just a literal "g". Outside a character
1701 class, \g must be followed by one of a number of specific things:
1702
1703 (1) A number, either plain or braced. If positive, it is an absolute
1704 backreference. If negative, it is a relative backreference. This is a Perl
1705 5.10 feature.
1706
1707 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
1708 is part of Perl's movement towards a unified syntax for back references. As
1709 this is synonymous with \k{name}, we fudge it up by pretending it really
1710 was \k{name}.
1711
1712 (3) For Oniguruma compatibility we also support \g followed by a name or a
1713 number either in angle brackets or in single quotes. However, these are
1714 (possibly recursive) subroutine calls, _not_ backreferences. We return
1715 the ESC_g code.
1716
1717 Summary: Return a negative number for a numerical back reference, ESC_k for
1718 a named back reference, and ESC_g for a named or numbered subroutine call.
1719 */
1720
1721 case CHAR_g:
1722 if (isclass) break;
1723
1724 if (ptr >= ptrend)
1725 {
1726 *errorcodeptr = ERR57;
1727 break;
1728 }
1729
1730 if (*ptr == CHAR_LESS_THAN_SIGN || *ptr == CHAR_APOSTROPHE)
1731 {
1732 escape = ESC_g;
1733 break;
1734 }
1735
1736 /* If there is a brace delimiter, try to read a numerical reference. If
1737 there isn't one, assume we have a name and treat it as \k. */
1738
1739 if (*ptr == CHAR_LEFT_CURLY_BRACKET)
1740 {
1741 PCRE2_SPTR p = ptr + 1;
1742 if (!read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &s,
1743 errorcodeptr))
1744 {
1745 if (*errorcodeptr == 0) escape = ESC_k; /* No number found */
1746 break;
1747 }
1748 if (p >= ptrend || *p != CHAR_RIGHT_CURLY_BRACKET)
1749 {
1750 *errorcodeptr = ERR57;
1751 break;
1752 }
1753 ptr = p + 1;
1754 }
1755
1756 /* Read an undelimited number */
1757
1758 else
1759 {
1760 if (!read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &s,
1761 errorcodeptr))
1762 {
1763 if (*errorcodeptr == 0) *errorcodeptr = ERR57; /* No number found */
1764 break;
1765 }
1766 }
1767
1768 if (s <= 0)
1769 {
1770 *errorcodeptr = ERR15;
1771 break;
1772 }
1773
1774 escape = -s;
1775 break;
1776
1777 /* The handling of escape sequences consisting of a string of digits
1778 starting with one that is not zero is not straightforward. Perl has changed
1779 over the years. Nowadays \g{} for backreferences and \o{} for octal are
1780 recommended to avoid the ambiguities in the old syntax.
1781
1782 Outside a character class, the digits are read as a decimal number. If the
1783 number is less than 10, or if there are that many previous extracting left
1784 brackets, it is a back reference. Otherwise, up to three octal digits are
1785 read to form an escaped character code. Thus \123 is likely to be octal 123
1786 (cf \0123, which is octal 012 followed by the literal 3).
1787
1788 Inside a character class, \ followed by a digit is always either a literal
1789 8 or 9 or an octal number. */
1790
1791 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1792 case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
1793
1794 if (!isclass)
1795 {
1796 oldptr = ptr;
1797 ptr--; /* Back to the digit */
1798
1799 /* As we know we are at a digit, the only possible error from
1800 read_number() is a number that is too large to be a group number. In this
      case we fall through and handle this as not a group reference. If we have
1802 read a small enough number, check for a back reference.
1803
1804 \1 to \9 are always back references. \8x and \9x are too; \1x to \7x
1805 are octal escapes if there are not that many previous captures. */
1806
1807 if (read_number(&ptr, ptrend, -1, INT_MAX/10 - 1, 0, &s, errorcodeptr) &&
1808 (s < 10 || oldptr[-1] >= CHAR_8 || s <= (int)cb->bracount))
1809 {
1810 if (s > (int)MAX_GROUP_NUMBER) *errorcodeptr = ERR61;
1811 else escape = -s; /* Indicates a back reference */
1812 break;
1813 }
1814
1815 ptr = oldptr; /* Put the pointer back and fall through */
1816 }
1817
1818 /* Handle a digit following \ when the number is not a back reference, or
1819 we are within a character class. If the first digit is 8 or 9, Perl used to
1820 generate a binary zero and then treat the digit as a following literal. At
1821 least by Perl 5.18 this changed so as not to insert the binary zero. */
1822
1823 if (c >= CHAR_8) break;
1824
1825 /* Fall through */
1826
1827 /* \0 always starts an octal number, but we may drop through to here with a
1828 larger first octal digit. The original code used just to take the least
1829 significant 8 bits of octal numbers (I think this is what early Perls used
1830 to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
1831 but no more than 3 octal digits. */
1832
1833 case CHAR_0:
1834 c -= CHAR_0;
1835 while(i++ < 2 && ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)
1836 c = c * 8 + *ptr++ - CHAR_0;
1837#if PCRE2_CODE_UNIT_WIDTH == 8
1838 if (!utf && c > 0xff) *errorcodeptr = ERR51;
1839#endif
1840 break;
1841
1842 /* \o is a relatively new Perl feature, supporting a more general way of
1843 specifying character codes in octal. The only supported form is \o{ddd}. */
1844
1845 case CHAR_o:
1846 if (ptr >= ptrend || *ptr++ != CHAR_LEFT_CURLY_BRACKET)
1847 {
1848 ptr--;
1849 *errorcodeptr = ERR55;
1850 }
1851 else if (ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)
1852 *errorcodeptr = ERR78;
1853 else
1854 {
1855 c = 0;
1856 overflow = FALSE;
1857 while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)
1858 {
1859 cc = *ptr++;
1860 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
1861#if PCRE2_CODE_UNIT_WIDTH == 32
1862 if (c >= 0x20000000l) { overflow = TRUE; break; }
1863#endif
1864 c = (c << 3) + (cc - CHAR_0);
1865#if PCRE2_CODE_UNIT_WIDTH == 8
1866 if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1867#elif PCRE2_CODE_UNIT_WIDTH == 16
1868 if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1869#elif PCRE2_CODE_UNIT_WIDTH == 32
1870 if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1871#endif
1872 }
1873 if (overflow)
1874 {
1875 while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
1876 *errorcodeptr = ERR34;
1877 }
1878 else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
1879 {
1880 if (utf && c >= 0xd800 && c <= 0xdfff &&
1881 (extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
1882 {
1883 ptr--;
1884 *errorcodeptr = ERR73;
1885 }
1886 }
1887 else
1888 {
1889 ptr--;
1890 *errorcodeptr = ERR64;
1891 }
1892 }
1893 break;
1894
1895 /* When PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set, \x must be followed
1896 by two hexadecimal digits. Otherwise it is a lowercase x letter. */
1897
1898 case CHAR_x:
1899 if (alt_bsux)
1900 {
1901 uint32_t xc;
1902 if (ptrend - ptr < 2) break; /* Less than 2 characters */
1903 if ((cc = XDIGIT(ptr[0])) == 0xff) break; /* Not a hex digit */
1904 if ((xc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */
1905 c = (cc << 4) | xc;
1906 ptr += 2;
1907 }
1908
1909 /* Handle \x in Perl's style. \x{ddd} is a character code which can be
1910 greater than 0xff in UTF-8 or non-8bit mode, but only if the ddd are hex
1911 digits. If not, { used to be treated as a data character. However, Perl
1912 seems to read hex digits up to the first non-such, and ignore the rest, so
1913 that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
1914 now gives an error. */
1915
1916 else
1917 {
1918 if (ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
1919 {
1920#ifndef EBCDIC
1921 COME_FROM_NU:
1922#endif
1923 if (++ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)
1924 {
1925 *errorcodeptr = ERR78;
1926 break;
1927 }
1928 c = 0;
1929 overflow = FALSE;
1930
1931 while (ptr < ptrend && (cc = XDIGIT(*ptr)) != 0xff)
1932 {
1933 ptr++;
1934 if (c == 0 && cc == 0) continue; /* Leading zeroes */
1935#if PCRE2_CODE_UNIT_WIDTH == 32
1936 if (c >= 0x10000000l) { overflow = TRUE; break; }
1937#endif
1938 c = (c << 4) | cc;
1939 if ((utf && c > 0x10ffffU) || (!utf && c > MAX_NON_UTF_CHAR))
1940 {
1941 overflow = TRUE;
1942 break;
1943 }
1944 }
1945
1946 if (overflow)
1947 {
1948 while (ptr < ptrend && XDIGIT(*ptr) != 0xff) ptr++;
1949 *errorcodeptr = ERR34;
1950 }
1951 else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
1952 {
1953 if (utf && c >= 0xd800 && c <= 0xdfff &&
1954 (extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
1955 {
1956 ptr--;
1957 *errorcodeptr = ERR73;
1958 }
1959 }
1960
1961 /* If the sequence of hex digits does not end with '}', give an error.
1962 We used just to recognize this construct and fall through to the normal
1963 \x handling, but nowadays Perl gives an error, which seems much more
1964 sensible, so we do too. */
1965
1966 else
1967 {
1968 ptr--;
1969 *errorcodeptr = ERR67;
1970 }
1971 } /* End of \x{} processing */
1972
      /* Read up to two hex digits after \x */
1974
1975 else
1976 {
1977 c = 0;
1978 if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break; /* Not a hex digit */
1979 ptr++;
1980 c = cc;
1981 if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break; /* Not a hex digit */
1982 ptr++;
1983 c = (c << 4) | cc;
1984 } /* End of \xdd handling */
1985 } /* End of Perl-style \x handling */
1986 break;
1987
1988 /* The handling of \c is different in ASCII and EBCDIC environments. In an
1989 ASCII (or Unicode) environment, an error is given if the character
1990 following \c is not a printable ASCII character. Otherwise, the following
1991 character is upper-cased if it is a letter, and after that the 0x40 bit is
1992 flipped. The result is the value of the escape.
1993
1994 In an EBCDIC environment the handling of \c is compatible with the
1995 specification in the perlebcdic document. The following character must be
1996 a letter or one of small number of special characters. These provide a
1997 means of defining the character values 0-31.
1998
1999 For testing the EBCDIC handling of \c in an ASCII environment, recognize
2000 the EBCDIC value of 'c' explicitly. */
2001
2002#if defined EBCDIC && 'a' != 0x81
2003 case 0x83:
2004#else
2005 case CHAR_c:
2006#endif
2007 if (ptr >= ptrend)
2008 {
2009 *errorcodeptr = ERR2;
2010 break;
2011 }
2012 c = *ptr;
2013 if (c >= CHAR_a && c <= CHAR_z) c = UPPER_CASE(c);
2014
2015 /* Handle \c in an ASCII/Unicode environment. */
2016
2017#ifndef EBCDIC /* ASCII/UTF-8 coding */
2018 if (c < 32 || c > 126) /* Excludes all non-printable ASCII */
2019 {
2020 *errorcodeptr = ERR68;
2021 break;
2022 }
2023 c ^= 0x40;
2024
2025 /* Handle \c in an EBCDIC environment. The special case \c? is converted to
2026 255 (0xff) or 95 (0x5f) if other characters suggest we are using the
2027 POSIX-BC encoding. (This is the way Perl indicates that it handles \c?.)
2028 The other valid sequences correspond to a list of specific characters. */
2029
2030#else
2031 if (c == CHAR_QUESTION_MARK)
2032 c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff;
2033 else
2034 {
2035 for (i = 0; i < 32; i++)
2036 {
2037 if (c == ebcdic_escape_c[i]) break;
2038 }
2039 if (i < 32) c = i; else *errorcodeptr = ERR68;
2040 }
2041#endif /* EBCDIC */
2042
2043 ptr++;
2044 break;
2045
2046 /* Any other alphanumeric following \ is an error. Perl gives an error only
2047 if in warning mode, but PCRE doesn't have a warning mode. */
2048
2049 default:
2050 *errorcodeptr = ERR3;
2051 *ptrptr = ptr - 1; /* Point to the character at fault */
2052 return 0;
2053 }
2054 }
2055
2056/* Set the pointer to the next character before returning. */
2057
2058*ptrptr = ptr;
2059*chptr = c;
2060return escape;
2061}
2062
2063
2064
2065#ifdef SUPPORT_UNICODE
2066/*************************************************
2067* Handle \P and \p *
2068*************************************************/
2069
2070/* This function is called after \P or \p has been encountered, provided that
2071PCRE2 is compiled with support for UTF and Unicode properties. On entry, the
2072contents of ptrptr are pointing after the P or p. On exit, it is left pointing
2073after the final code unit of the escape sequence.
2074
2075Arguments:
2076 ptrptr the pattern position pointer
2077 negptr a boolean that is set TRUE for negation else FALSE
2078 ptypeptr an unsigned int that is set to the type value
2079 pdataptr an unsigned int that is set to the detailed property value
2080 errorcodeptr the error code variable
2081 cb the compile data
2082
2083Returns: TRUE if the type value was found, or FALSE for an invalid type
2084*/
2085
2086static BOOL
2087get_ucp(PCRE2_SPTR *ptrptr, BOOL *negptr, uint16_t *ptypeptr,
2088 uint16_t *pdataptr, int *errorcodeptr, compile_block *cb)
2089{
2090PCRE2_UCHAR c;
2091PCRE2_SIZE i, bot, top;
2092PCRE2_SPTR ptr = *ptrptr;
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07002093PCRE2_UCHAR name[50];
2094PCRE2_UCHAR *vptr = NULL;
2095uint16_t ptscript = PT_NOTSCRIPT;
Elliott Hughes5b808042021-10-01 10:56:10 -07002096
2097if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2098c = *ptr++;
2099*negptr = FALSE;
2100
2101/* \P or \p can be followed by a name in {}, optionally preceded by ^ for
2102negation. */
2103
2104if (c == CHAR_LEFT_CURLY_BRACKET)
2105 {
2106 if (ptr >= cb->end_pattern) goto ERROR_RETURN;
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07002107
Elliott Hughes5b808042021-10-01 10:56:10 -07002108 if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
2109 {
2110 *negptr = TRUE;
2111 ptr++;
2112 }
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07002113
Elliott Hughes5b808042021-10-01 10:56:10 -07002114 for (i = 0; i < (int)(sizeof(name) / sizeof(PCRE2_UCHAR)) - 1; i++)
2115 {
2116 if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2117 c = *ptr++;
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07002118 while (c == '_' || c == '-' || isspace(c))
2119 {
2120 if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2121 c = *ptr++;
2122 }
Elliott Hughes5b808042021-10-01 10:56:10 -07002123 if (c == CHAR_NUL) goto ERROR_RETURN;
2124 if (c == CHAR_RIGHT_CURLY_BRACKET) break;
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07002125 name[i] = tolower(c);
2126 if ((c == ':' || c == '=') && vptr == NULL) vptr = name + i;
Elliott Hughes5b808042021-10-01 10:56:10 -07002127 }
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07002128
Elliott Hughes5b808042021-10-01 10:56:10 -07002129 if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
2130 name[i] = 0;
2131 }
2132
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07002133/* If { doesn't follow \p or \P there is just one following character, which
2134must be an ASCII letter. */
Elliott Hughes5b808042021-10-01 10:56:10 -07002135
2136else if (MAX_255(c) && (cb->ctypes[c] & ctype_letter) != 0)
2137 {
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07002138 name[0] = tolower(c);
Elliott Hughes5b808042021-10-01 10:56:10 -07002139 name[1] = 0;
2140 }
2141else goto ERROR_RETURN;
2142
2143*ptrptr = ptr;
2144
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07002145/* If the property contains ':' or '=' we have class name and value separately
2146specified. The following are supported:
2147
2148 . Bidi_Class (synonym bc), for which the property names are "bidi<name>".
2149 . Script (synonym sc) for which the property name is the script name
2150 . Script_Extensions (synonym scx), ditto
2151
2152As this is a small number, we currently just check the names directly. If this
2153grows, a sorted table and a switch will be neater.
2154
2155For both the script properties, set a PT_xxx value so that (1) they can be
2156distinguished and (2) invalid script names that happen to be the name of
2157another property can be diagnosed. */
2158
2159if (vptr != NULL)
2160 {
2161 int offset = 0;
2162 PCRE2_UCHAR sname[8];
2163
2164 *vptr = 0; /* Terminate property name */
2165 if (PRIV(strcmp_c8)(name, STRING_bidiclass) == 0 ||
2166 PRIV(strcmp_c8)(name, STRING_bc) == 0)
2167 {
2168 offset = 4;
2169 sname[0] = CHAR_b;
2170 sname[1] = CHAR_i; /* There is no strcpy_c8 function */
2171 sname[2] = CHAR_d;
2172 sname[3] = CHAR_i;
2173 }
2174
2175 else if (PRIV(strcmp_c8)(name, STRING_script) == 0 ||
2176 PRIV(strcmp_c8)(name, STRING_sc) == 0)
2177 ptscript = PT_SC;
2178
2179 else if (PRIV(strcmp_c8)(name, STRING_scriptextensions) == 0 ||
2180 PRIV(strcmp_c8)(name, STRING_scx) == 0)
2181 ptscript = PT_SCX;
2182
2183 else
2184 {
2185 *errorcodeptr = ERR47;
2186 return FALSE;
2187 }
2188
2189 /* Adjust the string in name[] as needed */
2190
2191 memmove(name + offset, vptr + 1, (name + i - vptr)*sizeof(PCRE2_UCHAR));
2192 if (offset != 0) memmove(name, sname, offset*sizeof(PCRE2_UCHAR));
2193 }
2194
2195/* Search for a recognized property using binary chop. */
Elliott Hughes5b808042021-10-01 10:56:10 -07002196
2197bot = 0;
2198top = PRIV(utt_size);
2199
2200while (bot < top)
2201 {
2202 int r;
2203 i = (bot + top) >> 1;
2204 r = PRIV(strcmp_c8)(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07002205
2206 /* When a matching property is found, some extra checking is needed when the
2207 \p{xx:yy} syntax is used and xx is either sc or scx. */
2208
Elliott Hughes5b808042021-10-01 10:56:10 -07002209 if (r == 0)
2210 {
Elliott Hughes5b808042021-10-01 10:56:10 -07002211 *pdataptr = PRIV(utt)[i].value;
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07002212 if (vptr == NULL || ptscript == PT_NOTSCRIPT)
2213 {
2214 *ptypeptr = PRIV(utt)[i].type;
2215 return TRUE;
2216 }
2217
2218 switch (PRIV(utt)[i].type)
2219 {
2220 case PT_SC:
2221 *ptypeptr = PT_SC;
2222 return TRUE;
2223
2224 case PT_SCX:
2225 *ptypeptr = ptscript;
2226 return TRUE;
2227 }
2228
2229 break; /* Non-script found */
Elliott Hughes5b808042021-10-01 10:56:10 -07002230 }
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07002231
Elliott Hughes5b808042021-10-01 10:56:10 -07002232 if (r > 0) bot = i + 1; else top = i;
2233 }
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07002234
2235*errorcodeptr = ERR47; /* Unrecognized property */
Elliott Hughes5b808042021-10-01 10:56:10 -07002236return FALSE;
2237
2238ERROR_RETURN: /* Malformed \P or \p */
2239*errorcodeptr = ERR46;
2240*ptrptr = ptr;
2241return FALSE;
2242}
2243#endif
2244
2245
2246
2247/*************************************************
2248* Check for POSIX class syntax *
2249*************************************************/
2250
2251/* This function is called when the sequence "[:" or "[." or "[=" is
2252encountered in a character class. It checks whether this is followed by a
2253sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2254reach an unescaped ']' without the special preceding character, return FALSE.
2255
2256Originally, this function only recognized a sequence of letters between the
2257terminators, but it seems that Perl recognizes any sequence of characters,
2258though of course unknown POSIX names are subsequently rejected. Perl gives an
2259"Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2260didn't consider this to be a POSIX class. Likewise for [:1234:].
2261
2262The problem in trying to be exactly like Perl is in the handling of escapes. We
2263have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2264class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2265below handles the special cases \\ and \], but does not try to do any other
2266escape processing. This makes it different from Perl for cases such as
2267[:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does
2268not recognize "l\ower". This is a lesser evil than not diagnosing bad classes
2269when Perl does, I think.
2270
2271A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
2272It seems that the appearance of a nested POSIX class supersedes an apparent
2273external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
2274a digit. This is handled by returning FALSE if the start of a new group with
2275the same terminator is encountered, since the next closing sequence must close
2276the nested group, not the outer one.
2277
2278In Perl, unescaped square brackets may also appear as part of class names. For
2279example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
2280[:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
2281seem right at all. PCRE does not allow closing square brackets in POSIX class
2282names.
2283
2284Arguments:
2285 ptr pointer to the character after the initial [ (colon, dot, equals)
2286 ptrend pointer to the end of the pattern
2287 endptr where to return a pointer to the terminating ':', '.', or '='
2288
2289Returns: TRUE or FALSE
2290*/
2291
2292static BOOL
2293check_posix_syntax(PCRE2_SPTR ptr, PCRE2_SPTR ptrend, PCRE2_SPTR *endptr)
2294{
2295PCRE2_UCHAR terminator; /* Don't combine these lines; the Solaris cc */
2296terminator = *ptr++; /* compiler warns about "non-constant" initializer. */
2297
2298for (; ptrend - ptr >= 2; ptr++)
2299 {
2300 if (*ptr == CHAR_BACKSLASH &&
2301 (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET || ptr[1] == CHAR_BACKSLASH))
2302 ptr++;
2303
2304 else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) ||
2305 *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2306
2307 else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2308 {
2309 *endptr = ptr;
2310 return TRUE;
2311 }
2312 }
2313
2314return FALSE;
2315}
2316
2317
2318
2319/*************************************************
2320* Check POSIX class name *
2321*************************************************/
2322
2323/* This function is called to check the name given in a POSIX-style class entry
2324such as [:alnum:].
2325
2326Arguments:
2327 ptr points to the first letter
2328 len the length of the name
2329
2330Returns: a value representing the name, or -1 if unknown
2331*/
2332
2333static int
2334check_posix_name(PCRE2_SPTR ptr, int len)
2335{
2336const char *pn = posix_names;
2337int yield = 0;
2338while (posix_name_lengths[yield] != 0)
2339 {
2340 if (len == posix_name_lengths[yield] &&
2341 PRIV(strncmp_c8)(ptr, pn, (unsigned int)len) == 0) return yield;
2342 pn += posix_name_lengths[yield] + 1;
2343 yield++;
2344 }
2345return -1;
2346}
2347
2348
2349
2350/*************************************************
2351* Read a subpattern or VERB name *
2352*************************************************/
2353
2354/* This function is called from parse_regex() below whenever it needs to read
2355the name of a subpattern or a (*VERB) or an (*alpha_assertion). The initial
2356pointer must be to the character before the name. If that character is '*' we
2357are reading a verb or alpha assertion name. The pointer is updated to point
after the name, for a VERB or alpha assertion name, or after the name's
2359terminator for a subpattern name. Returning both the offset and the name
2360pointer is redundant information, but some callers use one and some the other,
2361so it is simplest just to return both.
2362
2363Arguments:
2364 ptrptr points to the character pointer variable
2365 ptrend points to the end of the input string
2366 utf true if the input is UTF-encoded
2367 terminator the terminator of a subpattern name must be this
2368 offsetptr where to put the offset from the start of the pattern
2369 nameptr where to put a pointer to the name in the input
2370 namelenptr where to put the length of the name
  errorcodeptr  where to put an error code
2372 cb pointer to the compile data block
2373
2374Returns: TRUE if a name was read
2375 FALSE otherwise, with error code set
2376*/
2377
2378static BOOL
2379read_name(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, BOOL utf, uint32_t terminator,
2380 PCRE2_SIZE *offsetptr, PCRE2_SPTR *nameptr, uint32_t *namelenptr,
2381 int *errorcodeptr, compile_block *cb)
2382{
2383PCRE2_SPTR ptr = *ptrptr;
2384BOOL is_group = (*ptr != CHAR_ASTERISK);
2385
2386if (++ptr >= ptrend) /* No characters in name */
2387 {
2388 *errorcodeptr = is_group? ERR62: /* Subpattern name expected */
2389 ERR60; /* Verb not recognized or malformed */
2390 goto FAILED;
2391 }
2392
2393*nameptr = ptr;
2394*offsetptr = (PCRE2_SIZE)(ptr - cb->start_pattern);
2395
2396/* In UTF mode, a group name may contain letters and decimal digits as defined
2397by Unicode properties, and underscores, but must not start with a digit. */
2398
2399#ifdef SUPPORT_UNICODE
2400if (utf && is_group)
2401 {
2402 uint32_t c, type;
2403
2404 GETCHAR(c, ptr);
2405 type = UCD_CHARTYPE(c);
2406
2407 if (type == ucp_Nd)
2408 {
2409 *errorcodeptr = ERR44;
2410 goto FAILED;
2411 }
2412
2413 for(;;)
2414 {
2415 if (type != ucp_Nd && PRIV(ucp_gentype)[type] != ucp_L &&
2416 c != CHAR_UNDERSCORE) break;
2417 ptr++;
2418 FORWARDCHARTEST(ptr, ptrend);
2419 if (ptr >= ptrend) break;
2420 GETCHAR(c, ptr);
2421 type = UCD_CHARTYPE(c);
2422 }
2423 }
2424else
2425#else
2426(void)utf; /* Avoid compiler warning */
2427#endif /* SUPPORT_UNICODE */
2428
2429/* Handle non-group names and group names in non-UTF modes. A group name must
2430not start with a digit. If either of the others start with a digit it just
2431won't be recognized. */
2432
2433 {
2434 if (is_group && IS_DIGIT(*ptr))
2435 {
2436 *errorcodeptr = ERR44;
2437 goto FAILED;
2438 }
2439
2440 while (ptr < ptrend && MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype_word) != 0)
2441 {
2442 ptr++;
2443 }
2444 }
2445
2446/* Check name length */
2447
2448if (ptr > *nameptr + MAX_NAME_SIZE)
2449 {
2450 *errorcodeptr = ERR48;
2451 goto FAILED;
2452 }
2453*namelenptr = (uint32_t)(ptr - *nameptr);
2454
2455/* Subpattern names must not be empty, and their terminator is checked here.
2456(What follows a verb or alpha assertion name is checked separately.) */
2457
2458if (is_group)
2459 {
2460 if (ptr == *nameptr)
2461 {
2462 *errorcodeptr = ERR62; /* Subpattern name expected */
2463 goto FAILED;
2464 }
2465 if (ptr >= ptrend || *ptr != (PCRE2_UCHAR)terminator)
2466 {
2467 *errorcodeptr = ERR42;
2468 goto FAILED;
2469 }
2470 ptr++;
2471 }
2472
2473*ptrptr = ptr;
2474return TRUE;
2475
2476FAILED:
2477*ptrptr = ptr;
2478return FALSE;
2479}
2480
2481
2482
2483/*************************************************
2484* Manage callouts at start of cycle *
2485*************************************************/
2486
2487/* At the start of a new item in parse_regex() we are able to record the
2488details of the previous item in a prior callout, and also to set up an
2489automatic callout if enabled. Avoid having two adjacent automatic callouts,
2490which would otherwise happen for items such as \Q that contribute nothing to
2491the parsed pattern.
2492
2493Arguments:
2494 ptr current pattern pointer
2495 pcalloutptr points to a pointer to previous callout, or NULL
2496 auto_callout TRUE if auto_callouts are enabled
2497 parsed_pattern the parsed pattern pointer
2498 cb compile block
2499
2500Returns: possibly updated parsed_pattern pointer.
2501*/
2502
2503static uint32_t *
2504manage_callouts(PCRE2_SPTR ptr, uint32_t **pcalloutptr, BOOL auto_callout,
2505 uint32_t *parsed_pattern, compile_block *cb)
2506{
2507uint32_t *previous_callout = *pcalloutptr;
2508
2509if (previous_callout != NULL) previous_callout[2] = (uint32_t)(ptr -
2510 cb->start_pattern - (PCRE2_SIZE)previous_callout[1]);
2511
2512if (!auto_callout) previous_callout = NULL; else
2513 {
2514 if (previous_callout == NULL ||
2515 previous_callout != parsed_pattern - 4 ||
2516 previous_callout[3] != 255)
2517 {
2518 previous_callout = parsed_pattern; /* Set up new automatic callout */
2519 parsed_pattern += 4;
2520 previous_callout[0] = META_CALLOUT_NUMBER;
2521 previous_callout[2] = 0;
2522 previous_callout[3] = 255;
2523 }
2524 previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
2525 }
2526
2527*pcalloutptr = previous_callout;
2528return parsed_pattern;
2529}
2530
2531
2532
2533/*************************************************
2534* Parse regex and identify named groups *
2535*************************************************/
2536
2537/* This function is called first of all. It scans the pattern and does two
2538things: (1) It identifies capturing groups and makes a table of named capturing
2539groups so that information about them is fully available to both the compiling
2540scans. (2) It writes a parsed version of the pattern with comments omitted and
2541escapes processed into the parsed_pattern vector.
2542
2543Arguments:
2544 ptr points to the start of the pattern
2545 options compiling dynamic options (may change during the scan)
2546 has_lookbehind points to a boolean, set TRUE if a lookbehind is found
2547 cb pointer to the compile data block
2548
2549Returns: zero on success or a non-zero error code, with the
2550 error offset placed in the cb field
2551*/
2552
2553/* A structure and some flags for dealing with nested groups. */
2554
/* NOTE(review): the uses of these fields are in parse_regex(), outside the
part of the file reviewed here; the field notes are assumptions to confirm. */

typedef struct nest_save {
  uint16_t nest_depth;   /* presumably the nesting depth the entry applies to */
  uint16_t reset_group;  /* presumably a saved group number - TODO confirm */
  uint16_t max_group;    /* presumably a highest group number - TODO confirm */
  uint16_t flags;        /* presumably holds the NSF_xxx bits defined below */
  uint32_t options;      /* presumably a saved copy of the options word */
} nest_save;

/* Single-bit flag values, each a distinct bit. */

#define NSF_RESET 0x0001u
#define NSF_CONDASSERT 0x0002u
#define NSF_ATOMICSR 0x0004u

/* Options that are changeable within the pattern must be tracked during
parsing. Some (e.g. PCRE2_EXTENDED) are implemented entirely during parsing,
but all must be tracked so that META_OPTIONS items set the correct values for
the main compiling phase. */

#define PARSE_TRACKED_OPTIONS (PCRE2_CASELESS|PCRE2_DOTALL|PCRE2_DUPNAMES| \
  PCRE2_EXTENDED|PCRE2_EXTENDED_MORE|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE| \
  PCRE2_UNGREEDY)

/* States used for analyzing ranges in character classes. The two OK values
must be last. */

enum { RANGE_NO, RANGE_STARTED, RANGE_OK_ESCAPED, RANGE_OK_LITERAL };

/* Only in 32-bit mode can there be literals > META_END. A macro encapsulates
the storing of literal values in the main parsed pattern, where they can always
be quantified.

The macro assigns to a variable called okquantifier, which must therefore be in
scope wherever it is used. Its arguments are not parenthesized and c is
evaluated more than once in the 32-bit form, so pass only simple variables.
The non-32-bit form expands to two separate statements: it must not be used as
the unbraced body of an if, else, or loop. */

#if PCRE2_CODE_UNIT_WIDTH == 32
#define PARSED_LITERAL(c, p) \
  { \
  if (c >= META_END) *p++ = META_BIGVALUE; \
  *p++ = c; \
  okquantifier = TRUE; \
  }
#else
#define PARSED_LITERAL(c, p) *p++ = c; okquantifier = TRUE;
#endif
2595
2596/* Here's the actual function. */
2597
2598static int parse_regex(PCRE2_SPTR ptr, uint32_t options, BOOL *has_lookbehind,
2599 compile_block *cb)
2600{
2601uint32_t c;
2602uint32_t delimiter;
2603uint32_t namelen;
2604uint32_t class_range_state;
2605uint32_t *verblengthptr = NULL; /* Value avoids compiler warning */
2606uint32_t *verbstartptr = NULL;
2607uint32_t *previous_callout = NULL;
2608uint32_t *parsed_pattern = cb->parsed_pattern;
2609uint32_t *parsed_pattern_end = cb->parsed_pattern_end;
2610uint32_t meta_quantifier = 0;
2611uint32_t add_after_mark = 0;
2612uint32_t extra_options = cb->cx->extra_options;
2613uint16_t nest_depth = 0;
2614int after_manual_callout = 0;
2615int expect_cond_assert = 0;
2616int errorcode = 0;
2617int escape;
2618int i;
2619BOOL inescq = FALSE;
2620BOOL inverbname = FALSE;
2621BOOL utf = (options & PCRE2_UTF) != 0;
2622BOOL auto_callout = (options & PCRE2_AUTO_CALLOUT) != 0;
2623BOOL isdupname;
2624BOOL negate_class;
2625BOOL okquantifier = FALSE;
2626PCRE2_SPTR thisptr;
2627PCRE2_SPTR name;
2628PCRE2_SPTR ptrend = cb->end_pattern;
2629PCRE2_SPTR verbnamestart = NULL; /* Value avoids compiler warning */
2630named_group *ng;
2631nest_save *top_nest, *end_nests;
2632
2633/* Insert leading items for word and line matching (features provided for the
2634benefit of pcre2grep). */
2635
2636if ((extra_options & PCRE2_EXTRA_MATCH_LINE) != 0)
2637 {
2638 *parsed_pattern++ = META_CIRCUMFLEX;
2639 *parsed_pattern++ = META_NOCAPTURE;
2640 }
2641else if ((extra_options & PCRE2_EXTRA_MATCH_WORD) != 0)
2642 {
2643 *parsed_pattern++ = META_ESCAPE + ESC_b;
2644 *parsed_pattern++ = META_NOCAPTURE;
2645 }
2646
2647/* If the pattern is actually a literal string, process it separately to avoid
2648cluttering up the main loop. */
2649
2650if ((options & PCRE2_LITERAL) != 0)
2651 {
2652 while (ptr < ptrend)
2653 {
2654 if (parsed_pattern >= parsed_pattern_end)
2655 {
2656 errorcode = ERR63; /* Internal error (parsed pattern overflow) */
2657 goto FAILED;
2658 }
2659 thisptr = ptr;
2660 GETCHARINCTEST(c, ptr);
2661 if (auto_callout)
2662 parsed_pattern = manage_callouts(thisptr, &previous_callout,
2663 auto_callout, parsed_pattern, cb);
2664 PARSED_LITERAL(c, parsed_pattern);
2665 }
2666 goto PARSED_END;
2667 }
2668
2669/* Process a real regex which may contain meta-characters. */
2670
2671top_nest = NULL;
2672end_nests = (nest_save *)(cb->start_workspace + cb->workspace_size);
2673
2674/* The size of the nest_save structure might not be a factor of the size of the
2675workspace. Therefore we must round down end_nests so as to correctly avoid
2676creating a nest_save that spans the end of the workspace. */
2677
2678end_nests = (nest_save *)((char *)end_nests -
2679 ((cb->workspace_size * sizeof(PCRE2_UCHAR)) % sizeof(nest_save)));
2680
2681/* PCRE2_EXTENDED_MORE implies PCRE2_EXTENDED */
2682
2683if ((options & PCRE2_EXTENDED_MORE) != 0) options |= PCRE2_EXTENDED;
2684
2685/* Now scan the pattern */
2686
2687while (ptr < ptrend)
2688 {
2689 int prev_expect_cond_assert;
2690 uint32_t min_repeat, max_repeat;
2691 uint32_t set, unset, *optset;
2692 uint32_t terminator;
2693 uint32_t prev_meta_quantifier;
2694 BOOL prev_okquantifier;
2695 PCRE2_SPTR tempptr;
2696 PCRE2_SIZE offset;
2697
2698 if (parsed_pattern >= parsed_pattern_end)
2699 {
2700 errorcode = ERR63; /* Internal error (parsed pattern overflow) */
2701 goto FAILED;
2702 }
2703
2704 if (nest_depth > cb->cx->parens_nest_limit)
2705 {
2706 errorcode = ERR19;
2707 goto FAILED; /* Parentheses too deeply nested */
2708 }
2709
2710 /* Get next input character, save its position for callout handling. */
2711
2712 thisptr = ptr;
2713 GETCHARINCTEST(c, ptr);
2714
2715 /* Copy quoted literals until \E, allowing for the possibility of automatic
2716 callouts, except when processing a (*VERB) "name". */
2717
2718 if (inescq)
2719 {
2720 if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
2721 {
2722 inescq = FALSE;
2723 ptr++; /* Skip E */
2724 }
2725 else
2726 {
2727 if (expect_cond_assert > 0) /* A literal is not allowed if we are */
2728 { /* expecting a conditional assertion, */
2729 ptr--; /* but an empty \Q\E sequence is OK. */
2730 errorcode = ERR28;
2731 goto FAILED;
2732 }
2733 if (inverbname)
2734 { /* Don't use PARSED_LITERAL() because it */
2735#if PCRE2_CODE_UNIT_WIDTH == 32 /* sets okquantifier. */
2736 if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
2737#endif
2738 *parsed_pattern++ = c;
2739 }
2740 else
2741 {
2742 if (after_manual_callout-- <= 0)
2743 parsed_pattern = manage_callouts(thisptr, &previous_callout,
2744 auto_callout, parsed_pattern, cb);
2745 PARSED_LITERAL(c, parsed_pattern);
2746 }
2747 meta_quantifier = 0;
2748 }
2749 continue; /* Next character */
2750 }
2751
2752 /* If we are processing the "name" part of a (*VERB:NAME) item, all
2753 characters up to the closing parenthesis are literals except when
2754 PCRE2_ALT_VERBNAMES is set. That causes backslash interpretation, but only \Q
2755 and \E and escaped characters are allowed (no character types such as \d). If
2756 PCRE2_EXTENDED is also set, we must ignore white space and # comments. Do
2757 this by not entering the special (*VERB:NAME) processing - they are then
2758 picked up below. Note that c is a character, not a code unit, so we must not
2759 use MAX_255 to test its size because MAX_255 tests code units and is assumed
2760 TRUE in 8-bit mode. */
2761
2762 if (inverbname &&
2763 (
2764 /* EITHER: not both options set */
2765 ((options & (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) !=
2766 (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) ||
2767#ifdef SUPPORT_UNICODE
2768 /* OR: character > 255 AND not Unicode Pattern White Space */
2769 (c > 255 && (c|1) != 0x200f && (c|1) != 0x2029) ||
2770#endif
2771 /* OR: not a # comment or isspace() white space */
2772 (c < 256 && c != CHAR_NUMBER_SIGN && (cb->ctypes[c] & ctype_space) == 0
2773#ifdef SUPPORT_UNICODE
2774 /* and not CHAR_NEL when Unicode is supported */
2775 && c != CHAR_NEL
2776#endif
2777 )))
2778 {
2779 PCRE2_SIZE verbnamelength;
2780
2781 switch(c)
2782 {
2783 default: /* Don't use PARSED_LITERAL() because it */
2784#if PCRE2_CODE_UNIT_WIDTH == 32 /* sets okquantifier. */
2785 if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
2786#endif
2787 *parsed_pattern++ = c;
2788 break;
2789
2790 case CHAR_RIGHT_PARENTHESIS:
2791 inverbname = FALSE;
2792 /* This is the length in characters */
2793 verbnamelength = (PCRE2_SIZE)(parsed_pattern - verblengthptr - 1);
2794 /* But the limit on the length is in code units */
2795 if (ptr - verbnamestart - 1 > (int)MAX_MARK)
2796 {
2797 ptr--;
2798 errorcode = ERR76;
2799 goto FAILED;
2800 }
2801 *verblengthptr = (uint32_t)verbnamelength;
2802
2803 /* If this name was on a verb such as (*ACCEPT) which does not continue,
2804 a (*MARK) was generated for the name. We now add the original verb as the
2805 next item. */
2806
2807 if (add_after_mark != 0)
2808 {
2809 *parsed_pattern++ = add_after_mark;
2810 add_after_mark = 0;
2811 }
2812 break;
2813
2814 case CHAR_BACKSLASH:
2815 if ((options & PCRE2_ALT_VERBNAMES) != 0)
2816 {
2817 escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
2818 cb->cx->extra_options, FALSE, cb);
2819 if (errorcode != 0) goto FAILED;
2820 }
2821 else escape = 0; /* Treat all as literal */
2822
2823 switch(escape)
2824 {
2825 case 0: /* Don't use PARSED_LITERAL() because it */
2826#if PCRE2_CODE_UNIT_WIDTH == 32 /* sets okquantifier. */
2827 if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
2828#endif
2829 *parsed_pattern++ = c;
2830 break;
2831
2832 case ESC_Q:
2833 inescq = TRUE;
2834 break;
2835
2836 case ESC_E: /* Ignore */
2837 break;
2838
2839 default:
2840 errorcode = ERR40; /* Invalid in verb name */
2841 goto FAILED;
2842 }
2843 }
2844 continue; /* Next character in pattern */
2845 }
2846
2847 /* Not a verb name character. At this point we must process everything that
2848 must not change the quantification state. This is mainly comments, but we
2849 handle \Q and \E here as well, so that an item such as A\Q\E+ is treated as
2850 A+, as in Perl. An isolated \E is ignored. */
2851
2852 if (c == CHAR_BACKSLASH && ptr < ptrend)
2853 {
2854 if (*ptr == CHAR_Q || *ptr == CHAR_E)
2855 {
2856 inescq = *ptr == CHAR_Q;
2857 ptr++;
2858 continue;
2859 }
2860 }
2861
2862 /* Skip over whitespace and # comments in extended mode. Note that c is a
2863 character, not a code unit, so we must not use MAX_255 to test its size
2864 because MAX_255 tests code units and is assumed TRUE in 8-bit mode. The
2865 whitespace characters are those designated as "Pattern White Space" by
2866 Unicode, which are the isspace() characters plus CHAR_NEL (newline), which is
2867 U+0085 in Unicode, plus U+200E, U+200F, U+2028, and U+2029. These are a
2868 subset of space characters that match \h and \v. */
2869
2870 if ((options & PCRE2_EXTENDED) != 0)
2871 {
2872 if (c < 256 && (cb->ctypes[c] & ctype_space) != 0) continue;
2873#ifdef SUPPORT_UNICODE
2874 if (c == CHAR_NEL || (c|1) == 0x200f || (c|1) == 0x2029) continue;
2875#endif
2876 if (c == CHAR_NUMBER_SIGN)
2877 {
2878 while (ptr < ptrend)
2879 {
2880 if (IS_NEWLINE(ptr)) /* For non-fixed-length newline cases, */
2881 { /* IS_NEWLINE sets cb->nllen. */
2882 ptr += cb->nllen;
2883 break;
2884 }
2885 ptr++;
2886#ifdef SUPPORT_UNICODE
2887 if (utf) FORWARDCHARTEST(ptr, ptrend);
2888#endif
2889 }
2890 continue; /* Next character in pattern */
2891 }
2892 }
2893
2894 /* Skip over bracketed comments */
2895
2896 if (c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 2 &&
2897 ptr[0] == CHAR_QUESTION_MARK && ptr[1] == CHAR_NUMBER_SIGN)
2898 {
2899 while (++ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS);
2900 if (ptr >= ptrend)
2901 {
2902 errorcode = ERR18; /* A special error for missing ) in a comment */
2903 goto FAILED; /* to make it easier to debug. */
2904 }
2905 ptr++;
2906 continue; /* Next character in pattern */
2907 }
2908
2909 /* If the next item is not a quantifier, fill in length of any previous
2910 callout and create an auto callout if required. */
2911
2912 if (c != CHAR_ASTERISK && c != CHAR_PLUS && c != CHAR_QUESTION_MARK &&
2913 (c != CHAR_LEFT_CURLY_BRACKET ||
2914 (tempptr = ptr,
2915 !read_repeat_counts(&tempptr, ptrend, NULL, NULL, &errorcode))))
2916 {
2917 if (after_manual_callout-- <= 0)
2918 parsed_pattern = manage_callouts(thisptr, &previous_callout, auto_callout,
2919 parsed_pattern, cb);
2920 }
2921
2922 /* If expect_cond_assert is 2, we have just passed (?( and are expecting an
2923 assertion, possibly preceded by a callout. If the value is 1, we have just
2924 had the callout and expect an assertion. There must be at least 3 more
2925 characters in all cases. When expect_cond_assert is 2, we know that the
2926 current character is an opening parenthesis, as otherwise we wouldn't be
2927 here. However, when it is 1, we need to check, and it's easiest just to check
2928 always. Note that expect_cond_assert may be negative, since all callouts just
2929 decrement it. */
2930
2931 if (expect_cond_assert > 0)
2932 {
2933 BOOL ok = c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 3 &&
2934 (ptr[0] == CHAR_QUESTION_MARK || ptr[0] == CHAR_ASTERISK);
2935 if (ok)
2936 {
2937 if (ptr[0] == CHAR_ASTERISK) /* New alpha assertion format, possibly */
2938 {
2939 ok = MAX_255(ptr[1]) && (cb->ctypes[ptr[1]] & ctype_lcletter) != 0;
2940 }
2941 else switch(ptr[1]) /* Traditional symbolic format */
2942 {
2943 case CHAR_C:
2944 ok = expect_cond_assert == 2;
2945 break;
2946
2947 case CHAR_EQUALS_SIGN:
2948 case CHAR_EXCLAMATION_MARK:
2949 break;
2950
2951 case CHAR_LESS_THAN_SIGN:
2952 ok = ptr[2] == CHAR_EQUALS_SIGN || ptr[2] == CHAR_EXCLAMATION_MARK;
2953 break;
2954
2955 default:
2956 ok = FALSE;
2957 }
2958 }
2959
2960 if (!ok)
2961 {
2962 ptr--; /* Adjust error offset */
2963 errorcode = ERR28;
2964 goto FAILED;
2965 }
2966 }
2967
2968 /* Remember whether we are expecting a conditional assertion, and set the
2969 default for this item. */
2970
2971 prev_expect_cond_assert = expect_cond_assert;
2972 expect_cond_assert = 0;
2973
2974 /* Remember quantification status for the previous significant item, then set
2975 default for this item. */
2976
2977 prev_okquantifier = okquantifier;
2978 prev_meta_quantifier = meta_quantifier;
2979 okquantifier = FALSE;
2980 meta_quantifier = 0;
2981
2982 /* If the previous significant item was a quantifier, adjust the parsed code
2983 if there is a following modifier. The base meta value is always followed by
2984 the PLUS and QUERY values, in that order. We do this here rather than after
2985 reading a quantifier so that intervening comments and /x whitespace can be
2986 ignored without having to replicate code. */
2987
2988 if (prev_meta_quantifier != 0 && (c == CHAR_QUESTION_MARK || c == CHAR_PLUS))
2989 {
2990 parsed_pattern[(prev_meta_quantifier == META_MINMAX)? -3 : -1] =
2991 prev_meta_quantifier + ((c == CHAR_QUESTION_MARK)?
2992 0x00020000u : 0x00010000u);
2993 continue; /* Next character in pattern */
2994 }
2995
2996
2997 /* Process the next item in the main part of a pattern. */
2998
2999 switch(c)
3000 {
3001 default: /* Non-special character */
3002 PARSED_LITERAL(c, parsed_pattern);
3003 break;
3004
3005
3006 /* ---- Escape sequence ---- */
3007
3008 case CHAR_BACKSLASH:
3009 tempptr = ptr;
3010 escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
3011 cb->cx->extra_options, FALSE, cb);
3012 if (errorcode != 0)
3013 {
3014 ESCAPE_FAILED:
3015 if ((extra_options & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
3016 goto FAILED;
3017 ptr = tempptr;
3018 if (ptr >= ptrend) c = CHAR_BACKSLASH; else
3019 {
3020 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
3021 }
3022 escape = 0; /* Treat as literal character */
3023 }
3024
3025 /* The escape was a data escape or literal character. */
3026
3027 if (escape == 0)
3028 {
3029 PARSED_LITERAL(c, parsed_pattern);
3030 }
3031
3032 /* The escape was a back (or forward) reference. We keep the offset in
3033 order to give a more useful diagnostic for a bad forward reference. For
3034 references to groups numbered less than 10 we can't use more than two items
3035 in parsed_pattern because they may be just two characters in the input (and
3036 in a 64-bit world an offset may need two elements). So for them, the offset
3037 of the first occurrence is held in a special vector. */
3038
3039 else if (escape < 0)
3040 {
3041 offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 1);
3042 escape = -escape;
3043 *parsed_pattern++ = META_BACKREF | (uint32_t)escape;
3044 if (escape < 10)
3045 {
3046 if (cb->small_ref_offset[escape] == PCRE2_UNSET)
3047 cb->small_ref_offset[escape] = offset;
3048 }
3049 else
3050 {
3051 PUTOFFSET(offset, parsed_pattern);
3052 }
3053 okquantifier = TRUE;
3054 }
3055
3056 /* The escape was a character class such as \d etc. or other special
3057 escape indicator such as \A or \X. Most of them generate just a single
3058 parsed item, but \P and \p are followed by a 16-bit type and a 16-bit
3059 value. They are supported only when Unicode is available. The type and
3060 value are packed into a single 32-bit value so that the whole sequence
3061 uses only two elements in the parsed_vector. This is because the same
3062 coding is used if \d (for example) is turned into \p{Nd} when PCRE2_UCP is
3063 set.
3064
3065 There are also some cases where the escape sequence is followed by a name:
3066 \k{name}, \k<name>, and \k'name' are backreferences by name, and \g<name>
3067 and \g'name' are subroutine calls by name; \g{name} is a synonym for
3068 \k{name}. Note that \g<number> and \g'number' are handled by check_escape()
3069 and returned as a negative value (handled above). A name is coded as an
3070 offset into the pattern and a length. */
3071
3072 else switch (escape)
3073 {
3074 case ESC_C:
3075#ifdef NEVER_BACKSLASH_C
3076 errorcode = ERR85;
3077 goto ESCAPE_FAILED;
3078#else
3079 if ((options & PCRE2_NEVER_BACKSLASH_C) != 0)
3080 {
3081 errorcode = ERR83;
3082 goto ESCAPE_FAILED;
3083 }
3084#endif
3085 okquantifier = TRUE;
3086 *parsed_pattern++ = META_ESCAPE + escape;
3087 break;
3088
3089 case ESC_X:
3090#ifndef SUPPORT_UNICODE
3091 errorcode = ERR45; /* Supported only with Unicode support */
3092 goto ESCAPE_FAILED;
3093#endif
3094 case ESC_H:
3095 case ESC_h:
3096 case ESC_N:
3097 case ESC_R:
3098 case ESC_V:
3099 case ESC_v:
3100 okquantifier = TRUE;
3101 *parsed_pattern++ = META_ESCAPE + escape;
3102 break;
3103
3104 default: /* \A, \B, \b, \G, \K, \Z, \z cannot be quantified. */
3105 *parsed_pattern++ = META_ESCAPE + escape;
3106 break;
3107
3108 /* Escapes that change in UCP mode. Note that PCRE2_UCP will never be set
3109 without Unicode support because it is checked when pcre2_compile() is
3110 called. */
3111
3112 case ESC_d:
3113 case ESC_D:
3114 case ESC_s:
3115 case ESC_S:
3116 case ESC_w:
3117 case ESC_W:
3118 okquantifier = TRUE;
3119 if ((options & PCRE2_UCP) == 0)
3120 {
3121 *parsed_pattern++ = META_ESCAPE + escape;
3122 }
3123 else
3124 {
3125 *parsed_pattern++ = META_ESCAPE +
3126 ((escape == ESC_d || escape == ESC_s || escape == ESC_w)?
3127 ESC_p : ESC_P);
3128 switch(escape)
3129 {
3130 case ESC_d:
3131 case ESC_D:
3132 *parsed_pattern++ = (PT_PC << 16) | ucp_Nd;
3133 break;
3134
3135 case ESC_s:
3136 case ESC_S:
3137 *parsed_pattern++ = PT_SPACE << 16;
3138 break;
3139
3140 case ESC_w:
3141 case ESC_W:
3142 *parsed_pattern++ = PT_WORD << 16;
3143 break;
3144 }
3145 }
3146 break;
3147
3148 /* Unicode property matching */
3149
3150 case ESC_P:
3151 case ESC_p:
3152#ifdef SUPPORT_UNICODE
3153 {
3154 BOOL negated;
3155 uint16_t ptype = 0, pdata = 0;
3156 if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb))
3157 goto ESCAPE_FAILED;
3158 if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
3159 *parsed_pattern++ = META_ESCAPE + escape;
3160 *parsed_pattern++ = (ptype << 16) | pdata;
3161 okquantifier = TRUE;
3162 }
3163#else
3164 errorcode = ERR45;
3165 goto ESCAPE_FAILED;
3166#endif
3167 break; /* End \P and \p */
3168
3169 /* When \g is used with quotes or angle brackets as delimiters, it is a
3170 numerical or named subroutine call, and control comes here. When used
3171 with brace delimiters it is a numerical back reference and does not come
3172 here because check_escape() returns it directly as a reference. \k is
3173 always a named back reference. */
3174
3175 case ESC_g:
3176 case ESC_k:
3177 if (ptr >= ptrend || (*ptr != CHAR_LEFT_CURLY_BRACKET &&
3178 *ptr != CHAR_LESS_THAN_SIGN && *ptr != CHAR_APOSTROPHE))
3179 {
3180 errorcode = (escape == ESC_g)? ERR57 : ERR69;
3181 goto ESCAPE_FAILED;
3182 }
3183 terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
3184 CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
3185 CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
3186
3187 /* For a non-braced \g, check for a numerical recursion. */
3188
3189 if (escape == ESC_g && terminator != CHAR_RIGHT_CURLY_BRACKET)
3190 {
3191 PCRE2_SPTR p = ptr + 1;
3192
3193 if (read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
3194 &errorcode))
3195 {
3196 if (p >= ptrend || *p != terminator)
3197 {
3198 errorcode = ERR57;
3199 goto ESCAPE_FAILED;
3200 }
3201 ptr = p;
3202 goto SET_RECURSION;
3203 }
3204 if (errorcode != 0) goto ESCAPE_FAILED;
3205 }
3206
3207 /* Not a numerical recursion */
3208
3209 if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
3210 &errorcode, cb)) goto ESCAPE_FAILED;
3211
3212 /* \k and \g when used with braces are back references, whereas \g used
3213 with quotes or angle brackets is a recursion */
3214
3215 *parsed_pattern++ =
3216 (escape == ESC_k || terminator == CHAR_RIGHT_CURLY_BRACKET)?
3217 META_BACKREF_BYNAME : META_RECURSE_BYNAME;
3218 *parsed_pattern++ = namelen;
3219
3220 PUTOFFSET(offset, parsed_pattern);
3221 okquantifier = TRUE;
3222 break; /* End special escape processing */
3223 }
3224 break; /* End escape sequence processing */
3225
3226
3227 /* ---- Single-character special items ---- */
3228
3229 case CHAR_CIRCUMFLEX_ACCENT:
3230 *parsed_pattern++ = META_CIRCUMFLEX;
3231 break;
3232
3233 case CHAR_DOLLAR_SIGN:
3234 *parsed_pattern++ = META_DOLLAR;
3235 break;
3236
3237 case CHAR_DOT:
3238 *parsed_pattern++ = META_DOT;
3239 okquantifier = TRUE;
3240 break;
3241
3242
3243 /* ---- Single-character quantifiers ---- */
3244
3245 case CHAR_ASTERISK:
3246 meta_quantifier = META_ASTERISK;
3247 goto CHECK_QUANTIFIER;
3248
3249 case CHAR_PLUS:
3250 meta_quantifier = META_PLUS;
3251 goto CHECK_QUANTIFIER;
3252
3253 case CHAR_QUESTION_MARK:
3254 meta_quantifier = META_QUERY;
3255 goto CHECK_QUANTIFIER;
3256
3257
3258 /* ---- Potential {n,m} quantifier ---- */
3259
3260 case CHAR_LEFT_CURLY_BRACKET:
3261 if (!read_repeat_counts(&ptr, ptrend, &min_repeat, &max_repeat,
3262 &errorcode))
3263 {
3264 if (errorcode != 0) goto FAILED; /* Error in quantifier. */
3265 PARSED_LITERAL(c, parsed_pattern); /* Not a quantifier */
3266 break; /* No more quantifier processing */
3267 }
3268 meta_quantifier = META_MINMAX;
3269 /* Fall through */
3270
3271
3272 /* ---- Quantifier post-processing ---- */
3273
3274 /* Check that a quantifier is allowed after the previous item. */
3275
3276 CHECK_QUANTIFIER:
3277 if (!prev_okquantifier)
3278 {
3279 errorcode = ERR9;
3280 goto FAILED_BACK;
3281 }
3282
3283 /* Most (*VERB)s are not allowed to be quantified, but an ungreedy
3284 quantifier can be useful for (*ACCEPT) - meaning "succeed on backtrack", a
3285 sort of negated (*COMMIT). We therefore allow (*ACCEPT) to be quantified by
3286 wrapping it in non-capturing brackets, but we have to allow for a preceding
3287 (*MARK) for when (*ACCEPT) has an argument. */
3288
3289 if (parsed_pattern[-1] == META_ACCEPT)
3290 {
3291 uint32_t *p;
3292 for (p = parsed_pattern - 1; p >= verbstartptr; p--) p[1] = p[0];
3293 *verbstartptr = META_NOCAPTURE;
3294 parsed_pattern[1] = META_KET;
3295 parsed_pattern += 2;
3296 }
3297
3298 /* Now we can put the quantifier into the parsed pattern vector. At this
3299 stage, we have only the basic quantifier. The check for a following + or ?
3300 modifier happens at the top of the loop, after any intervening comments
3301 have been removed. */
3302
3303 *parsed_pattern++ = meta_quantifier;
3304 if (c == CHAR_LEFT_CURLY_BRACKET)
3305 {
3306 *parsed_pattern++ = min_repeat;
3307 *parsed_pattern++ = max_repeat;
3308 }
3309 break;
3310
3311
3312 /* ---- Character class ---- */
3313
3314 case CHAR_LEFT_SQUARE_BRACKET:
3315 okquantifier = TRUE;
3316
3317 /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
3318 used for "start of word" and "end of word". As these are otherwise illegal
3319 sequences, we don't break anything by recognizing them. They are replaced
3320 by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
3321 erroneous and are handled by the normal code below. */
3322
3323 if (ptrend - ptr >= 6 &&
3324 (PRIV(strncmp_c8)(ptr, STRING_WEIRD_STARTWORD, 6) == 0 ||
3325 PRIV(strncmp_c8)(ptr, STRING_WEIRD_ENDWORD, 6) == 0))
3326 {
3327 *parsed_pattern++ = META_ESCAPE + ESC_b;
3328
3329 if (ptr[2] == CHAR_LESS_THAN_SIGN)
3330 {
3331 *parsed_pattern++ = META_LOOKAHEAD;
3332 }
3333 else
3334 {
3335 *parsed_pattern++ = META_LOOKBEHIND;
3336 *has_lookbehind = TRUE;
3337
3338 /* The offset is used only for the "non-fixed length" error; this won't
3339 occur here, so just store zero. */
3340
3341 PUTOFFSET((PCRE2_SIZE)0, parsed_pattern);
3342 }
3343
3344 if ((options & PCRE2_UCP) == 0)
3345 *parsed_pattern++ = META_ESCAPE + ESC_w;
3346 else
3347 {
3348 *parsed_pattern++ = META_ESCAPE + ESC_p;
3349 *parsed_pattern++ = PT_WORD << 16;
3350 }
3351 *parsed_pattern++ = META_KET;
3352 ptr += 6;
3353 break;
3354 }
3355
3356 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
3357 they are encountered at the top level, so we'll do that too. */
3358
3359 if (ptr < ptrend && (*ptr == CHAR_COLON || *ptr == CHAR_DOT ||
3360 *ptr == CHAR_EQUALS_SIGN) &&
3361 check_posix_syntax(ptr, ptrend, &tempptr))
3362 {
3363 errorcode = (*ptr-- == CHAR_COLON)? ERR12 : ERR13;
3364 goto FAILED;
3365 }
3366
3367 /* Process a regular character class. If the first character is '^', set
3368 the negation flag. If the first few characters (either before or after ^)
3369 are \Q\E or \E or space or tab in extended-more mode, we skip them too.
3370 This makes for compatibility with Perl. */
3371
3372 negate_class = FALSE;
3373 while (ptr < ptrend)
3374 {
3375 GETCHARINCTEST(c, ptr);
3376 if (c == CHAR_BACKSLASH)
3377 {
3378 if (ptr < ptrend && *ptr == CHAR_E) ptr++;
3379 else if (ptrend - ptr >= 3 &&
3380 PRIV(strncmp_c8)(ptr, STR_Q STR_BACKSLASH STR_E, 3) == 0)
3381 ptr += 3;
3382 else
3383 break;
3384 }
3385 else if ((options & PCRE2_EXTENDED_MORE) != 0 &&
3386 (c == CHAR_SPACE || c == CHAR_HT)) /* Note: just these two */
3387 continue;
3388 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
3389 negate_class = TRUE;
3390 else break;
3391 }
3392
3393 /* Now the real contents of the class; c has the first "real" character.
3394 Empty classes are permitted only if the option is set. */
3395
3396 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
3397 (cb->external_options & PCRE2_ALLOW_EMPTY_CLASS) != 0)
3398 {
3399 *parsed_pattern++ = negate_class? META_CLASS_EMPTY_NOT : META_CLASS_EMPTY;
3400 break; /* End of class processing */
3401 }
3402
3403 /* Process a non-empty class. */
3404
3405 *parsed_pattern++ = negate_class? META_CLASS_NOT : META_CLASS;
3406 class_range_state = RANGE_NO;
3407
3408 /* In an EBCDIC environment, Perl treats alphabetic ranges specially
3409 because there are holes in the encoding, and simply using the range A-Z
3410 (for example) would include the characters in the holes. This applies only
3411 to ranges where both values are literal; [\xC1-\xE9] is different to [A-Z]
3412 in this respect. In order to accommodate this, we keep track of whether
3413 character values are literal or not, and a state variable for handling
3414 ranges. */
3415
3416 /* Loop for the contents of the class */
3417
3418 for (;;)
3419 {
3420 BOOL char_is_literal = TRUE;
3421
3422 /* Inside \Q...\E everything is literal except \E */
3423
3424 if (inescq)
3425 {
3426 if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
3427 {
3428 inescq = FALSE; /* Reset literal state */
3429 ptr++; /* Skip the 'E' */
3430 goto CLASS_CONTINUE;
3431 }
3432 goto CLASS_LITERAL;
3433 }
3434
3435 /* Skip over space and tab (only) in extended-more mode. */
3436
3437 if ((options & PCRE2_EXTENDED_MORE) != 0 &&
3438 (c == CHAR_SPACE || c == CHAR_HT))
3439 goto CLASS_CONTINUE;
3440
3441 /* Handle POSIX class names. Perl allows a negation extension of the
3442 form [:^name:]. A square bracket that doesn't match the syntax is
3443 treated as a literal. We also recognize the POSIX constructions
3444 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3445 5.6 and 5.8 do. */
3446
3447 if (c == CHAR_LEFT_SQUARE_BRACKET &&
3448 ptrend - ptr >= 3 &&
3449 (*ptr == CHAR_COLON || *ptr == CHAR_DOT ||
3450 *ptr == CHAR_EQUALS_SIGN) &&
3451 check_posix_syntax(ptr, ptrend, &tempptr))
3452 {
3453 BOOL posix_negate = FALSE;
3454 int posix_class;
3455
3456 /* Perl treats a hyphen before a POSIX class as a literal, not the
3457 start of a range. However, it gives a warning in its warning mode. PCRE
3458 does not have a warning mode, so we give an error, because this is
3459 likely an error on the user's part. */
3460
3461 if (class_range_state == RANGE_STARTED)
3462 {
3463 errorcode = ERR50;
3464 goto FAILED;
3465 }
3466
3467 if (*ptr != CHAR_COLON)
3468 {
3469 errorcode = ERR13;
3470 goto FAILED_BACK;
3471 }
3472
3473 if (*(++ptr) == CHAR_CIRCUMFLEX_ACCENT)
3474 {
3475 posix_negate = TRUE;
3476 ptr++;
3477 }
3478
3479 posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
3480 if (posix_class < 0)
3481 {
3482 errorcode = ERR30;
3483 goto FAILED;
3484 }
3485 ptr = tempptr + 2;
3486
3487 /* Perl treats a hyphen after a POSIX class as a literal, not the
3488 start of a range. However, it gives a warning in its warning mode
3489 unless the hyphen is the last character in the class. PCRE does not
3490 have a warning mode, so we give an error, because this is likely an
3491 error on the user's part. */
3492
3493 if (ptr < ptrend - 1 && *ptr == CHAR_MINUS &&
3494 ptr[1] != CHAR_RIGHT_SQUARE_BRACKET)
3495 {
3496 errorcode = ERR50;
3497 goto FAILED;
3498 }
3499
3500 /* Set "a hyphen is not the start of a range" for the -] case, and also
3501 in case the POSIX class is followed by \E or \Q\E (possibly repeated -
3502 fuzzers do that kind of thing) and *then* a hyphen. This causes that
3503 hyphen to be treated as a literal. I don't think it's worth setting up
3504 special apparatus to do otherwise. */
3505
3506 class_range_state = RANGE_NO;
3507
3508 /* When PCRE2_UCP is set, some of the POSIX classes are converted to
3509 use Unicode properties \p or \P or, in one case, \h or \H. The
3510 substitutes table has two values per class, containing the type and
3511 value of a \p or \P item. The special cases are specified with a
3512 negative type: a non-zero value causes \h or \H to be used, and a zero
3513 value falls through to behave like a non-UCP POSIX class. */
3514
3515#ifdef SUPPORT_UNICODE
3516 if ((options & PCRE2_UCP) != 0)
3517 {
3518 int ptype = posix_substitutes[2*posix_class];
3519 int pvalue = posix_substitutes[2*posix_class + 1];
3520 if (ptype >= 0)
3521 {
3522 *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_P : ESC_p);
3523 *parsed_pattern++ = (ptype << 16) | pvalue;
3524 goto CLASS_CONTINUE;
3525 }
3526
3527 if (pvalue != 0)
3528 {
3529 *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_H : ESC_h);
3530 goto CLASS_CONTINUE;
3531 }
3532
3533 /* Fall through */
3534 }
3535#endif /* SUPPORT_UNICODE */
3536
3537 /* Non-UCP POSIX class */
3538
3539 *parsed_pattern++ = posix_negate? META_POSIX_NEG : META_POSIX;
3540 *parsed_pattern++ = posix_class;
3541 }
3542
3543 /* Handle potential start of range */
3544
3545 else if (c == CHAR_MINUS && class_range_state >= RANGE_OK_ESCAPED)
3546 {
3547 *parsed_pattern++ = (class_range_state == RANGE_OK_LITERAL)?
3548 META_RANGE_LITERAL : META_RANGE_ESCAPED;
3549 class_range_state = RANGE_STARTED;
3550 }
3551
3552 /* Handle a literal character */
3553
3554 else if (c != CHAR_BACKSLASH)
3555 {
3556 CLASS_LITERAL:
3557 if (class_range_state == RANGE_STARTED)
3558 {
3559 if (c == parsed_pattern[-2]) /* Optimize one-char range */
3560 parsed_pattern--;
3561 else if (parsed_pattern[-2] > c) /* Check range is in order */
3562 {
3563 errorcode = ERR8;
3564 goto FAILED_BACK;
3565 }
3566 else
3567 {
3568 if (!char_is_literal && parsed_pattern[-1] == META_RANGE_LITERAL)
3569 parsed_pattern[-1] = META_RANGE_ESCAPED;
3570 PARSED_LITERAL(c, parsed_pattern);
3571 }
3572 class_range_state = RANGE_NO;
3573 }
3574 else /* Potential start of range */
3575 {
3576 class_range_state = char_is_literal?
3577 RANGE_OK_LITERAL : RANGE_OK_ESCAPED;
3578 PARSED_LITERAL(c, parsed_pattern);
3579 }
3580 }
3581
3582 /* Handle escapes in a class */
3583
3584 else
3585 {
3586 tempptr = ptr;
3587 escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
3588 cb->cx->extra_options, TRUE, cb);
3589
3590 if (errorcode != 0)
3591 {
3592 if ((extra_options & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
3593 goto FAILED;
3594 ptr = tempptr;
3595 if (ptr >= ptrend) c = CHAR_BACKSLASH; else
3596 {
3597 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
3598 }
3599 escape = 0; /* Treat as literal character */
3600 }
3601
3602 switch(escape)
3603 {
3604 case 0: /* Escaped character code point is in c */
3605 char_is_literal = FALSE;
3606 goto CLASS_LITERAL;
3607
3608 case ESC_b:
3609 c = CHAR_BS; /* \b is backspace in a class */
3610 char_is_literal = FALSE;
3611 goto CLASS_LITERAL;
3612
3613 case ESC_Q:
3614 inescq = TRUE; /* Enter literal mode */
3615 goto CLASS_CONTINUE;
3616
3617 case ESC_E: /* Ignore orphan \E */
3618 goto CLASS_CONTINUE;
3619
3620 case ESC_B: /* Always an error in a class */
3621 case ESC_R:
3622 case ESC_X:
3623 errorcode = ERR7;
3624 ptr--;
3625 goto FAILED;
3626 }
3627
3628 /* The second part of a range can be a single-character escape
3629 sequence (detected above), but not any of the other escapes. Perl
3630 treats a hyphen as a literal in such circumstances. However, in Perl's
3631 warning mode, a warning is given, so PCRE now faults it, as it is
3632 almost certainly a mistake on the user's part. */
3633
3634 if (class_range_state == RANGE_STARTED)
3635 {
3636 errorcode = ERR50;
3637 goto FAILED; /* Not CLASS_ESCAPE_FAILED; always an error */
3638 }
3639
3640 /* Of the remaining escapes, only those that define characters are
3641 allowed in a class. None may start a range. */
3642
3643 class_range_state = RANGE_NO;
3644 switch(escape)
3645 {
3646 case ESC_N:
3647 errorcode = ERR71;
3648 goto FAILED;
3649
3650 case ESC_H:
3651 case ESC_h:
3652 case ESC_V:
3653 case ESC_v:
3654 *parsed_pattern++ = META_ESCAPE + escape;
3655 break;
3656
3657 /* These escapes are converted to Unicode property tests when
3658 PCRE2_UCP is set. */
3659
3660 case ESC_d:
3661 case ESC_D:
3662 case ESC_s:
3663 case ESC_S:
3664 case ESC_w:
3665 case ESC_W:
3666 if ((options & PCRE2_UCP) == 0)
3667 {
3668 *parsed_pattern++ = META_ESCAPE + escape;
3669 }
3670 else
3671 {
3672 *parsed_pattern++ = META_ESCAPE +
3673 ((escape == ESC_d || escape == ESC_s || escape == ESC_w)?
3674 ESC_p : ESC_P);
3675 switch(escape)
3676 {
3677 case ESC_d:
3678 case ESC_D:
3679 *parsed_pattern++ = (PT_PC << 16) | ucp_Nd;
3680 break;
3681
3682 case ESC_s:
3683 case ESC_S:
3684 *parsed_pattern++ = PT_SPACE << 16;
3685 break;
3686
3687 case ESC_w:
3688 case ESC_W:
3689 *parsed_pattern++ = PT_WORD << 16;
3690 break;
3691 }
3692 }
3693 break;
3694
3695 /* Explicit Unicode property matching */
3696
3697 case ESC_P:
3698 case ESC_p:
3699#ifdef SUPPORT_UNICODE
3700 {
3701 BOOL negated;
3702 uint16_t ptype = 0, pdata = 0;
3703 if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb))
3704 goto FAILED;
3705 if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
3706 *parsed_pattern++ = META_ESCAPE + escape;
3707 *parsed_pattern++ = (ptype << 16) | pdata;
3708 }
3709#else
3710 errorcode = ERR45;
3711 goto FAILED;
3712#endif
3713 break; /* End \P and \p */
3714
3715 default: /* All others are not allowed in a class */
3716 errorcode = ERR7;
3717 ptr--;
3718 goto FAILED;
3719 }
3720
3721 /* Perl gives a warning unless a following hyphen is the last character
3722 in the class. PCRE throws an error. */
3723
3724 if (ptr < ptrend - 1 && *ptr == CHAR_MINUS &&
3725 ptr[1] != CHAR_RIGHT_SQUARE_BRACKET)
3726 {
3727 errorcode = ERR50;
3728 goto FAILED;
3729 }
3730 }
3731
3732 /* Proceed to next thing in the class. */
3733
3734 CLASS_CONTINUE:
3735 if (ptr >= ptrend)
3736 {
3737 errorcode = ERR6; /* Missing terminating ']' */
3738 goto FAILED;
3739 }
3740 GETCHARINCTEST(c, ptr);
3741 if (c == CHAR_RIGHT_SQUARE_BRACKET && !inescq) break;
3742 } /* End of class-processing loop */
3743
3744 /* -] at the end of a class is a literal '-' */
3745
3746 if (class_range_state == RANGE_STARTED)
3747 {
3748 parsed_pattern[-1] = CHAR_MINUS;
3749 class_range_state = RANGE_NO;
3750 }
3751
3752 *parsed_pattern++ = META_CLASS_END;
3753 break; /* End of character class */
3754
3755
3756 /* ---- Opening parenthesis ---- */
3757
3758 case CHAR_LEFT_PARENTHESIS:
3759 if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
3760
3761 /* If ( is not followed by ? it is either a capture or a special verb or an
3762 alpha assertion or a positive non-atomic lookahead. */
3763
3764 if (*ptr != CHAR_QUESTION_MARK)
3765 {
3766 const char *vn;
3767
3768 /* Handle capturing brackets (or non-capturing if auto-capture is turned
3769 off). */
3770
3771 if (*ptr != CHAR_ASTERISK)
3772 {
3773 nest_depth++;
3774 if ((options & PCRE2_NO_AUTO_CAPTURE) == 0)
3775 {
3776 if (cb->bracount >= MAX_GROUP_NUMBER)
3777 {
3778 errorcode = ERR97;
3779 goto FAILED;
3780 }
3781 cb->bracount++;
3782 *parsed_pattern++ = META_CAPTURE | cb->bracount;
3783 }
3784 else *parsed_pattern++ = META_NOCAPTURE;
3785 }
3786
3787 /* Do nothing for (* followed by end of pattern or ) so it gives a "bad
3788 quantifier" error rather than "(*MARK) must have an argument". */
3789
3790 else if (ptrend - ptr <= 1 || (c = ptr[1]) == CHAR_RIGHT_PARENTHESIS)
3791 break;
3792
3793 /* Handle "alpha assertions" such as (*pla:...). Most of these are
3794 synonyms for the historical symbolic assertions, but the script run and
3795 non-atomic lookaround ones are new. They are distinguished by starting
3796 with a lower case letter, which is tested with the ctype_lcletter bit in
3797 the ctypes table (after the CHMAX_255 check on the character). */
3798
3799 else if (CHMAX_255(c) && (cb->ctypes[c] & ctype_lcletter) != 0)
3800 {
3801 uint32_t meta;
3802
3803 vn = alasnames;
3804 if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,
3805 &errorcode, cb)) goto FAILED;
3806 if (ptr >= ptrend || *ptr != CHAR_COLON)
3807 {
3808 errorcode = ERR95; /* Malformed */
3809 goto FAILED;
3810 }
3811
3812 /* Scan the table of alpha assertion names */
3813
3814 for (i = 0; i < alascount; i++)
3815 {
3816 if (namelen == alasmeta[i].len &&
3817 PRIV(strncmp_c8)(name, vn, namelen) == 0)
3818 break;
3819 vn += alasmeta[i].len + 1;
3820 }
3821
3822 if (i >= alascount)
3823 {
3824 errorcode = ERR95; /* Alpha assertion not recognized */
3825 goto FAILED;
3826 }
3827
3828 /* Check for expecting an assertion condition. If so, only atomic
3829 lookaround assertions are valid. */
3830
3831 meta = alasmeta[i].meta;
3832 if (prev_expect_cond_assert > 0 &&
3833 (meta < META_LOOKAHEAD || meta > META_LOOKBEHINDNOT))
3834 {
3835 errorcode = (meta == META_LOOKAHEAD_NA || meta == META_LOOKBEHIND_NA)?
3836 ERR98 : ERR28; /* (Atomic) assertion expected */
3837 goto FAILED;
3838 }
3839
3840 /* The lookaround alphabetic synonyms can mostly be handled by jumping
3841 to the code that handles the traditional symbolic forms. */
3842
3843 switch(meta)
3844 {
3845 default:
3846 errorcode = ERR89; /* Unknown code; should never occur because */
3847 goto FAILED; /* the meta values come from a table above. */
3848
3849 case META_ATOMIC:
3850 goto ATOMIC_GROUP;
3851
3852 case META_LOOKAHEAD:
3853 goto POSITIVE_LOOK_AHEAD;
3854
3855 case META_LOOKAHEAD_NA:
3856 goto POSITIVE_NONATOMIC_LOOK_AHEAD;
3857
3858 case META_LOOKAHEADNOT:
3859 goto NEGATIVE_LOOK_AHEAD;
3860
3861 case META_LOOKBEHIND:
3862 case META_LOOKBEHINDNOT:
3863 case META_LOOKBEHIND_NA:
3864 *parsed_pattern++ = meta;
3865 ptr--;
3866 goto POST_LOOKBEHIND;
3867
3868 /* The script run facilities are handled here. Unicode support is
3869 required (give an error if not, as this is a security issue). Always
3870 record a META_SCRIPT_RUN item. Then, for the atomic version, insert
3871 META_ATOMIC and remember that we need two META_KETs at the end. */
3872
3873 case META_SCRIPT_RUN:
3874 case META_ATOMIC_SCRIPT_RUN:
3875#ifdef SUPPORT_UNICODE
3876 *parsed_pattern++ = META_SCRIPT_RUN;
3877 nest_depth++;
3878 ptr++;
3879 if (meta == META_ATOMIC_SCRIPT_RUN)
3880 {
3881 *parsed_pattern++ = META_ATOMIC;
3882 if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
3883 else if (++top_nest >= end_nests)
3884 {
3885 errorcode = ERR84;
3886 goto FAILED;
3887 }
3888 top_nest->nest_depth = nest_depth;
3889 top_nest->flags = NSF_ATOMICSR;
3890 top_nest->options = options & PARSE_TRACKED_OPTIONS;
3891 }
3892 break;
3893#else /* SUPPORT_UNICODE */
3894 errorcode = ERR96;
3895 goto FAILED;
3896#endif
3897 }
3898 }
3899
3900
3901 /* ---- Handle (*VERB) and (*VERB:NAME) ---- */
3902
3903 else
3904 {
3905 vn = verbnames;
3906 if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,
3907 &errorcode, cb)) goto FAILED;
3908 if (ptr >= ptrend || (*ptr != CHAR_COLON &&
3909 *ptr != CHAR_RIGHT_PARENTHESIS))
3910 {
3911 errorcode = ERR60; /* Malformed */
3912 goto FAILED;
3913 }
3914
3915 /* Scan the table of verb names */
3916
3917 for (i = 0; i < verbcount; i++)
3918 {
3919 if (namelen == verbs[i].len &&
3920 PRIV(strncmp_c8)(name, vn, namelen) == 0)
3921 break;
3922 vn += verbs[i].len + 1;
3923 }
3924
3925 if (i >= verbcount)
3926 {
3927 errorcode = ERR60; /* Verb not recognized */
3928 goto FAILED;
3929 }
3930
3931 /* An empty argument is treated as no argument. */
3932
3933 if (*ptr == CHAR_COLON && ptr + 1 < ptrend &&
3934 ptr[1] == CHAR_RIGHT_PARENTHESIS)
3935 ptr++; /* Advance to the closing parens */
3936
3937 /* Check for mandatory non-empty argument; this is (*MARK) */
3938
3939 if (verbs[i].has_arg > 0 && *ptr != CHAR_COLON)
3940 {
3941 errorcode = ERR66;
3942 goto FAILED;
3943 }
3944
3945 /* Remember where this verb, possibly with a preceding (*MARK), starts,
3946 for handling quantified (*ACCEPT). */
3947
3948 verbstartptr = parsed_pattern;
3949 okquantifier = (verbs[i].meta == META_ACCEPT);
3950
3951 /* It appears that Perl allows any characters whatsoever, other than a
3952 closing parenthesis, to appear in arguments ("names"), so we no longer
3953 insist on letters, digits, and underscores. Perl does not, however, do
3954 any interpretation within arguments, and has no means of including a
3955 closing parenthesis. PCRE supports escape processing but only when it
3956 is requested by an option. We set inverbname TRUE here, and let the
3957 main loop take care of this so that escape and \x processing is done by
3958 the main code above. */
3959
3960 if (*ptr++ == CHAR_COLON) /* Skip past : or ) */
3961 {
3962 /* Some optional arguments can be treated as a preceding (*MARK) */
3963
3964 if (verbs[i].has_arg < 0)
3965 {
3966 add_after_mark = verbs[i].meta;
3967 *parsed_pattern++ = META_MARK;
3968 }
3969
3970 /* The remaining verbs with arguments (except *MARK) need a different
3971 opcode. */
3972
3973 else
3974 {
3975 *parsed_pattern++ = verbs[i].meta +
3976 ((verbs[i].meta != META_MARK)? 0x00010000u:0);
3977 }
3978
3979 /* Set up for reading the name in the main loop. */
3980
3981 verblengthptr = parsed_pattern++;
3982 verbnamestart = ptr;
3983 inverbname = TRUE;
3984 }
3985 else /* No verb "name" argument */
3986 {
3987 *parsed_pattern++ = verbs[i].meta;
3988 }
3989 } /* End of (*VERB) handling */
3990 break; /* Done with this parenthesis */
3991 } /* End of groups that don't start with (? */
3992
3993
3994 /* ---- Items starting (? ---- */
3995
3996 /* The type of item is determined by what follows (?. Handle (?| and option
3997 changes under "default" because both need a new block on the nest stack.
3998 Comments starting with (?# are handled above. Note that there is some
3999 ambiguity about the sequence (?- because if a digit follows it's a relative
4000 recursion or subroutine call whereas otherwise it's an option unsetting. */
4001
4002 if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4003
4004 switch(*ptr)
4005 {
4006 default:
4007 if (*ptr == CHAR_MINUS && ptrend - ptr > 1 && IS_DIGIT(ptr[1]))
4008 goto RECURSION_BYNUMBER; /* The + case is handled by CHAR_PLUS */
4009
4010 /* We now have either (?| or a (possibly empty) option setting,
4011 optionally followed by a non-capturing group. */
4012
4013 nest_depth++;
4014 if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
4015 else if (++top_nest >= end_nests)
4016 {
4017 errorcode = ERR84;
4018 goto FAILED;
4019 }
4020 top_nest->nest_depth = nest_depth;
4021 top_nest->flags = 0;
4022 top_nest->options = options & PARSE_TRACKED_OPTIONS;
4023
4024 /* Start of non-capturing group that resets the capture count for each
4025 branch. */
4026
4027 if (*ptr == CHAR_VERTICAL_LINE)
4028 {
4029 top_nest->reset_group = (uint16_t)cb->bracount;
4030 top_nest->max_group = (uint16_t)cb->bracount;
4031 top_nest->flags |= NSF_RESET;
4032 cb->external_flags |= PCRE2_DUPCAPUSED;
4033 *parsed_pattern++ = META_NOCAPTURE;
4034 ptr++;
4035 }
4036
4037 /* Scan for options imnsxJU to be set or unset. */
4038
4039 else
4040 {
4041 BOOL hyphenok = TRUE;
4042 uint32_t oldoptions = options;
4043
4044 top_nest->reset_group = 0;
4045 top_nest->max_group = 0;
4046 set = unset = 0;
4047 optset = &set;
4048
4049 /* ^ at the start unsets imnsx and disables the subsequent use of - */
4050
4051 if (ptr < ptrend && *ptr == CHAR_CIRCUMFLEX_ACCENT)
4052 {
4053 options &= ~(PCRE2_CASELESS|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE|
4054 PCRE2_DOTALL|PCRE2_EXTENDED|PCRE2_EXTENDED_MORE);
4055 hyphenok = FALSE;
4056 ptr++;
4057 }
4058
4059 while (ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS &&
4060 *ptr != CHAR_COLON)
4061 {
4062 switch (*ptr++)
4063 {
4064 case CHAR_MINUS:
4065 if (!hyphenok)
4066 {
4067 errorcode = ERR94;
4068 ptr--; /* Correct the offset */
4069 goto FAILED;
4070 }
4071 optset = &unset;
4072 hyphenok = FALSE;
4073 break;
4074
4075 case CHAR_J: /* Record that it changed in the external options */
4076 *optset |= PCRE2_DUPNAMES;
4077 cb->external_flags |= PCRE2_JCHANGED;
4078 break;
4079
4080 case CHAR_i: *optset |= PCRE2_CASELESS; break;
4081 case CHAR_m: *optset |= PCRE2_MULTILINE; break;
4082 case CHAR_n: *optset |= PCRE2_NO_AUTO_CAPTURE; break;
4083 case CHAR_s: *optset |= PCRE2_DOTALL; break;
4084 case CHAR_U: *optset |= PCRE2_UNGREEDY; break;
4085
4086 /* If x appears twice it sets the extended extended option. */
4087
4088 case CHAR_x:
4089 *optset |= PCRE2_EXTENDED;
4090 if (ptr < ptrend && *ptr == CHAR_x)
4091 {
4092 *optset |= PCRE2_EXTENDED_MORE;
4093 ptr++;
4094 }
4095 break;
4096
4097 default:
4098 errorcode = ERR11;
4099 ptr--; /* Correct the offset */
4100 goto FAILED;
4101 }
4102 }
4103
4104 /* If we are setting extended without extended-more, ensure that any
4105 existing extended-more gets unset. Also, unsetting extended must also
4106 unset extended-more. */
4107
4108 if ((set & (PCRE2_EXTENDED|PCRE2_EXTENDED_MORE)) == PCRE2_EXTENDED ||
4109 (unset & PCRE2_EXTENDED) != 0)
4110 unset |= PCRE2_EXTENDED_MORE;
4111
4112 options = (options | set) & (~unset);
4113
4114 /* If the options ended with ')' this is not the start of a nested
4115 group with option changes, so the options change at this level.
4116 In this case, if the previous level set up a nest block, discard the
4117 one we have just created. Otherwise adjust it for the previous level.
4118 If the options ended with ':' we are starting a non-capturing group,
4119 possibly with an options setting. */
4120
4121 if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4122 if (*ptr++ == CHAR_RIGHT_PARENTHESIS)
4123 {
4124 nest_depth--; /* This is not a nested group after all. */
4125 if (top_nest > (nest_save *)(cb->start_workspace) &&
4126 (top_nest-1)->nest_depth == nest_depth) top_nest--;
4127 else top_nest->nest_depth = nest_depth;
4128 }
4129 else *parsed_pattern++ = META_NOCAPTURE;
4130
4131 /* If nothing changed, no need to record. */
4132
4133 if (options != oldoptions)
4134 {
4135 *parsed_pattern++ = META_OPTIONS;
4136 *parsed_pattern++ = options;
4137 }
4138 } /* End options processing */
4139 break; /* End default case after (? */
4140
4141
4142 /* ---- Python syntax support ---- */
4143
4144 case CHAR_P:
4145 if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4146
4147 /* (?P<name> is the same as (?<name>, which defines a named group. */
4148
4149 if (*ptr == CHAR_LESS_THAN_SIGN)
4150 {
4151 terminator = CHAR_GREATER_THAN_SIGN;
4152 goto DEFINE_NAME;
4153 }
4154
4155 /* (?P>name) is the same as (?&name), which is a recursion or subroutine
4156 call. */
4157
4158 if (*ptr == CHAR_GREATER_THAN_SIGN) goto RECURSE_BY_NAME;
4159
4160 /* (?P=name) is the same as \k<name>, a back reference by name. Anything
4161 else after (?P is an error. */
4162
4163 if (*ptr != CHAR_EQUALS_SIGN)
4164 {
4165 errorcode = ERR41;
4166 goto FAILED;
4167 }
4168 if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name,
4169 &namelen, &errorcode, cb)) goto FAILED;
4170 *parsed_pattern++ = META_BACKREF_BYNAME;
4171 *parsed_pattern++ = namelen;
4172 PUTOFFSET(offset, parsed_pattern);
4173 okquantifier = TRUE;
4174 break; /* End of (?P processing */
4175
4176
4177 /* ---- Recursion/subroutine calls by number ---- */
4178
4179 case CHAR_R:
4180 i = 0; /* (?R) == (?R0) */
4181 ptr++;
4182 if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4183 {
4184 errorcode = ERR58;
4185 goto FAILED;
4186 }
4187 goto SET_RECURSION;
4188
4189 /* An item starting (?- followed by a digit comes here via the "default"
4190 case because (?- followed by a non-digit is an options setting. */
4191
4192 case CHAR_PLUS:
4193 if (ptrend - ptr < 2 || !IS_DIGIT(ptr[1]))
4194 {
4195 errorcode = ERR29; /* Missing number */
4196 goto FAILED;
4197 }
4198 /* Fall through */
4199
4200 case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
4201 case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
4202 RECURSION_BYNUMBER:
4203 if (!read_number(&ptr, ptrend,
4204 (IS_DIGIT(*ptr))? -1:(int)(cb->bracount), /* + and - are relative */
4205 MAX_GROUP_NUMBER, ERR61,
4206 &i, &errorcode)) goto FAILED;
4207 if (i < 0) /* NB (?0) is permitted */
4208 {
4209 errorcode = ERR15; /* Unknown group */
4210 goto FAILED_BACK;
4211 }
4212 if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4213 goto UNCLOSED_PARENTHESIS;
4214
4215 SET_RECURSION:
4216 *parsed_pattern++ = META_RECURSE | (uint32_t)i;
4217 offset = (PCRE2_SIZE)(ptr - cb->start_pattern);
4218 ptr++;
4219 PUTOFFSET(offset, parsed_pattern);
4220 okquantifier = TRUE;
4221 break; /* End of recursive call by number handling */
4222
4223
4224 /* ---- Recursion/subroutine calls by name ---- */
4225
4226 case CHAR_AMPERSAND:
4227 RECURSE_BY_NAME:
4228 if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name,
4229 &namelen, &errorcode, cb)) goto FAILED;
4230 *parsed_pattern++ = META_RECURSE_BYNAME;
4231 *parsed_pattern++ = namelen;
4232 PUTOFFSET(offset, parsed_pattern);
4233 okquantifier = TRUE;
4234 break;
4235
4236 /* ---- Callout with numerical or string argument ---- */
4237
4238 case CHAR_C:
4239 if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4240
4241 /* If the previous item was a condition starting (?(? an assertion,
4242 optionally preceded by a callout, is expected. This is checked later on,
4243 during actual compilation. However we need to identify this kind of
4244 assertion in this pass because it must not be quantified. The value of
4245 expect_cond_assert is set to 2 after (?(? is processed. We decrement it
4246 for a callout - still leaving a positive value that identifies the
4247 assertion. Multiple callouts or any other items will make it zero or
4248 less, which doesn't matter because they will cause an error later. */
4249
4250 expect_cond_assert = prev_expect_cond_assert - 1;
4251
4252 /* If previous_callout is not NULL, it means this follows a previous
4253 callout. If it was a manual callout, do nothing; this means its "length
4254 of next pattern item" field will remain zero. If it was an automatic
4255 callout, abolish it. */
4256
4257 if (previous_callout != NULL && (options & PCRE2_AUTO_CALLOUT) != 0 &&
4258 previous_callout == parsed_pattern - 4 &&
4259 parsed_pattern[-1] == 255)
4260 parsed_pattern = previous_callout;
4261
4262 /* Save for updating next pattern item length, and skip one item before
4263 completing. */
4264
4265 previous_callout = parsed_pattern;
4266 after_manual_callout = 1;
4267
4268 /* Handle a string argument; specific delimiter is required. */
4269
4270 if (*ptr != CHAR_RIGHT_PARENTHESIS && !IS_DIGIT(*ptr))
4271 {
4272 PCRE2_SIZE calloutlength;
4273 PCRE2_SPTR startptr = ptr;
4274
4275 delimiter = 0;
4276 for (i = 0; PRIV(callout_start_delims)[i] != 0; i++)
4277 {
4278 if (*ptr == PRIV(callout_start_delims)[i])
4279 {
4280 delimiter = PRIV(callout_end_delims)[i];
4281 break;
4282 }
4283 }
4284 if (delimiter == 0)
4285 {
4286 errorcode = ERR82;
4287 goto FAILED;
4288 }
4289
4290 *parsed_pattern = META_CALLOUT_STRING;
4291 parsed_pattern += 3; /* Skip pattern info */
4292
4293 for (;;)
4294 {
4295 if (++ptr >= ptrend)
4296 {
4297 errorcode = ERR81;
4298 ptr = startptr; /* To give a more useful message */
4299 goto FAILED;
4300 }
4301 if (*ptr == delimiter && (++ptr >= ptrend || *ptr != delimiter))
4302 break;
4303 }
4304
4305 calloutlength = (PCRE2_SIZE)(ptr - startptr);
4306 if (calloutlength > UINT32_MAX)
4307 {
4308 errorcode = ERR72;
4309 goto FAILED;
4310 }
4311 *parsed_pattern++ = (uint32_t)calloutlength;
4312 offset = (PCRE2_SIZE)(startptr - cb->start_pattern);
4313 PUTOFFSET(offset, parsed_pattern);
4314 }
4315
4316 /* Handle a callout with an optional numerical argument, which must be
4317 less than or equal to 255. A missing argument gives 0. */
4318
4319 else
4320 {
4321 int n = 0;
4322 *parsed_pattern = META_CALLOUT_NUMBER; /* Numerical callout */
4323 parsed_pattern += 3; /* Skip pattern info */
4324 while (ptr < ptrend && IS_DIGIT(*ptr))
4325 {
4326 n = n * 10 + *ptr++ - CHAR_0;
4327 if (n > 255)
4328 {
4329 errorcode = ERR38;
4330 goto FAILED;
4331 }
4332 }
4333 *parsed_pattern++ = n;
4334 }
4335
4336 /* Both formats must have a closing parenthesis */
4337
4338 if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4339 {
4340 errorcode = ERR39;
4341 goto FAILED;
4342 }
4343 ptr++;
4344
4345 /* Remember the offset to the next item in the pattern, and set a default
4346 length. This should get updated after the next item is read. */
4347
4348 previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
4349 previous_callout[2] = 0;
4350 break; /* End callout */
4351
4352
4353 /* ---- Conditional group ---- */
4354
4355 /* A condition can be an assertion, a number (referring to a numbered
4356 group's having been set), a name (referring to a named group), or 'R',
4357 referring to overall recursion. R<digits> and R&name are also permitted
4358 for recursion state tests. Numbers may be preceded by + or - to specify a
4359 relative group number.
4360
4361 There are several syntaxes for testing a named group: (?(name)) is used
4362 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4363
4364 There are two unfortunate ambiguities. 'R' can be the recursive thing or
4365 the name 'R' (and similarly for 'R' followed by digits). 'DEFINE' can be
4366 the Perl DEFINE feature or the Python named test. We look for a name
4367 first; if not found, we try the other case.
4368
4369 For compatibility with auto-callouts, we allow a callout to be specified
4370 before a condition that is an assertion. */
4371
4372 case CHAR_LEFT_PARENTHESIS:
4373 if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4374 nest_depth++;
4375
4376 /* If the next character is ? or * there must be an assertion next
4377 (optionally preceded by a callout). We do not check this here, but
4378 instead we set expect_cond_assert to 2. If this is still greater than
4379 zero (callouts decrement it) when the next assertion is read, it will be
4380 marked as a condition that must not be repeated. A value greater than
4381 zero also causes checking that an assertion (possibly with callout)
4382 follows. */
4383
4384 if (*ptr == CHAR_QUESTION_MARK || *ptr == CHAR_ASTERISK)
4385 {
4386 *parsed_pattern++ = META_COND_ASSERT;
4387 ptr--; /* Pull pointer back to the opening parenthesis. */
4388 expect_cond_assert = 2;
4389 break; /* End of conditional */
4390 }
4391
4392 /* Handle (?([+-]number)... */
4393
4394 if (read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
4395 &errorcode))
4396 {
4397 if (i <= 0)
4398 {
4399 errorcode = ERR15;
4400 goto FAILED;
4401 }
4402 *parsed_pattern++ = META_COND_NUMBER;
4403 offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
4404 PUTOFFSET(offset, parsed_pattern);
4405 *parsed_pattern++ = i;
4406 }
4407 else if (errorcode != 0) goto FAILED; /* Number too big */
4408
4409 /* No number found. Handle the special case (?(VERSION[>]=n.m)... */
4410
4411 else if (ptrend - ptr >= 10 &&
4412 PRIV(strncmp_c8)(ptr, STRING_VERSION, 7) == 0 &&
4413 ptr[7] != CHAR_RIGHT_PARENTHESIS)
4414 {
4415 uint32_t ge = 0;
4416 int major = 0;
4417 int minor = 0;
4418
4419 ptr += 7;
4420 if (*ptr == CHAR_GREATER_THAN_SIGN)
4421 {
4422 ge = 1;
4423 ptr++;
4424 }
4425
4426 /* NOTE: cannot write IS_DIGIT(*(++ptr)) here because IS_DIGIT
4427 references its argument twice. */
4428
4429 if (*ptr != CHAR_EQUALS_SIGN || (ptr++, !IS_DIGIT(*ptr)))
4430 goto BAD_VERSION_CONDITION;
4431
4432 if (!read_number(&ptr, ptrend, -1, 1000, ERR79, &major, &errorcode))
4433 goto FAILED;
4434
4435 if (ptr >= ptrend) goto BAD_VERSION_CONDITION;
4436 if (*ptr == CHAR_DOT)
4437 {
4438 if (++ptr >= ptrend || !IS_DIGIT(*ptr)) goto BAD_VERSION_CONDITION;
4439 minor = (*ptr++ - CHAR_0) * 10;
4440 if (ptr >= ptrend) goto BAD_VERSION_CONDITION;
4441 if (IS_DIGIT(*ptr)) minor += *ptr++ - CHAR_0;
4442 if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4443 goto BAD_VERSION_CONDITION;
4444 }
4445
4446 *parsed_pattern++ = META_COND_VERSION;
4447 *parsed_pattern++ = ge;
4448 *parsed_pattern++ = major;
4449 *parsed_pattern++ = minor;
4450 }
4451
4452 /* All the remaining cases now require us to read a name. We cannot at
4453 this stage distinguish ambiguous cases such as (?(R12) which might be a
4454 recursion test by number or a name, because the named groups have not yet
4455 all been identified. Those cases are treated as names, but given a
4456 different META code. */
4457
4458 else
4459 {
4460 BOOL was_r_ampersand = FALSE;
4461
4462 if (*ptr == CHAR_R && ptrend - ptr > 1 && ptr[1] == CHAR_AMPERSAND)
4463 {
4464 terminator = CHAR_RIGHT_PARENTHESIS;
4465 was_r_ampersand = TRUE;
4466 ptr++;
4467 }
4468 else if (*ptr == CHAR_LESS_THAN_SIGN)
4469 terminator = CHAR_GREATER_THAN_SIGN;
4470 else if (*ptr == CHAR_APOSTROPHE)
4471 terminator = CHAR_APOSTROPHE;
4472 else
4473 {
4474 terminator = CHAR_RIGHT_PARENTHESIS;
4475 ptr--; /* Point to char before name */
4476 }
4477 if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
4478 &errorcode, cb)) goto FAILED;
4479
4480 /* Handle (?(R&name) */
4481
4482 if (was_r_ampersand)
4483 {
4484 *parsed_pattern = META_COND_RNAME;
4485 ptr--; /* Back to closing parens */
4486 }
4487
4488 /* Handle (?(name). If the name is "DEFINE" we identify it with a
4489 special code. Likewise if the name consists of R followed only by
4490 digits. Otherwise, handle it like a quoted name. */
4491
4492 else if (terminator == CHAR_RIGHT_PARENTHESIS)
4493 {
4494 if (namelen == 6 && PRIV(strncmp_c8)(name, STRING_DEFINE, 6) == 0)
4495 *parsed_pattern = META_COND_DEFINE;
4496 else
4497 {
4498 for (i = 1; i < (int)namelen; i++)
4499 if (!IS_DIGIT(name[i])) break;
4500 *parsed_pattern = (*name == CHAR_R && i >= (int)namelen)?
4501 META_COND_RNUMBER : META_COND_NAME;
4502 }
4503 ptr--; /* Back to closing parens */
4504 }
4505
4506 /* Handle (?('name') or (?(<name>) */
4507
4508 else *parsed_pattern = META_COND_NAME;
4509
4510 /* All these cases except DEFINE end with the name length and offset;
4511 DEFINE just has an offset (for the "too many branches" error). */
4512
4513 if (*parsed_pattern++ != META_COND_DEFINE) *parsed_pattern++ = namelen;
4514 PUTOFFSET(offset, parsed_pattern);
4515 } /* End cases that read a name */
4516
4517 /* Check the closing parenthesis of the condition */
4518
4519 if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4520 {
4521 errorcode = ERR24;
4522 goto FAILED;
4523 }
4524 ptr++;
4525 break; /* End of condition processing */
4526
4527
4528 /* ---- Atomic group ---- */
4529
4530 case CHAR_GREATER_THAN_SIGN:
4531 ATOMIC_GROUP: /* Come from (*atomic: */
4532 *parsed_pattern++ = META_ATOMIC;
4533 nest_depth++;
4534 ptr++;
4535 break;
4536
4537
4538 /* ---- Lookahead assertions ---- */
4539
4540 case CHAR_EQUALS_SIGN:
4541 POSITIVE_LOOK_AHEAD: /* Come from (*pla: */
4542 *parsed_pattern++ = META_LOOKAHEAD;
4543 ptr++;
4544 goto POST_ASSERTION;
4545
4546 case CHAR_ASTERISK:
4547 POSITIVE_NONATOMIC_LOOK_AHEAD: /* Come from (*napla: */
4548 *parsed_pattern++ = META_LOOKAHEAD_NA;
4549 ptr++;
4550 goto POST_ASSERTION;
4551
4552 case CHAR_EXCLAMATION_MARK:
4553 NEGATIVE_LOOK_AHEAD: /* Come from (*nla: */
4554 *parsed_pattern++ = META_LOOKAHEADNOT;
4555 ptr++;
4556 goto POST_ASSERTION;
4557
4558
4559 /* ---- Lookbehind assertions ---- */
4560
4561 /* (?< followed by = or ! or * is a lookbehind assertion. Otherwise (?<
4562 is the start of the name of a capturing group. */
4563
4564 case CHAR_LESS_THAN_SIGN:
4565 if (ptrend - ptr <= 1 ||
4566 (ptr[1] != CHAR_EQUALS_SIGN &&
4567 ptr[1] != CHAR_EXCLAMATION_MARK &&
4568 ptr[1] != CHAR_ASTERISK))
4569 {
4570 terminator = CHAR_GREATER_THAN_SIGN;
4571 goto DEFINE_NAME;
4572 }
4573 *parsed_pattern++ = (ptr[1] == CHAR_EQUALS_SIGN)?
4574 META_LOOKBEHIND : (ptr[1] == CHAR_EXCLAMATION_MARK)?
4575 META_LOOKBEHINDNOT : META_LOOKBEHIND_NA;
4576
4577 POST_LOOKBEHIND: /* Come from (*plb: (*naplb: and (*nlb: */
4578 *has_lookbehind = TRUE;
4579 offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
4580 PUTOFFSET(offset, parsed_pattern);
4581 ptr += 2;
4582 /* Fall through */
4583
4584 /* If the previous item was a condition starting (?(? an assertion,
4585 optionally preceded by a callout, is expected. This is checked later on,
4586 during actual compilation. However we need to identify this kind of
4587 assertion in this pass because it must not be quantified. The value of
4588 expect_cond_assert is set to 2 after (?(? is processed. We decrement it
4589 for a callout - still leaving a positive value that identifies the
4590 assertion. Multiple callouts or any other items will make it zero or
4591 less, which doesn't matter because they will cause an error later. */
4592
4593 POST_ASSERTION:
4594 nest_depth++;
4595 if (prev_expect_cond_assert > 0)
4596 {
4597 if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
4598 else if (++top_nest >= end_nests)
4599 {
4600 errorcode = ERR84;
4601 goto FAILED;
4602 }
4603 top_nest->nest_depth = nest_depth;
4604 top_nest->flags = NSF_CONDASSERT;
4605 top_nest->options = options & PARSE_TRACKED_OPTIONS;
4606 }
4607 break;
4608
4609
4610 /* ---- Define a named group ---- */
4611
4612 /* A named group may be defined as (?'name') or (?<name>). In the latter
4613 case we jump to DEFINE_NAME from the disambiguation of (?< above with the
4614 terminator set to '>'. */
4615
4616 case CHAR_APOSTROPHE:
4617 terminator = CHAR_APOSTROPHE; /* Terminator */
4618
4619 DEFINE_NAME:
4620 if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
4621 &errorcode, cb)) goto FAILED;
4622
4623 /* We have a name for this capturing group. It is also assigned a number,
4624 which is its primary means of identification. */
4625
4626 if (cb->bracount >= MAX_GROUP_NUMBER)
4627 {
4628 errorcode = ERR97;
4629 goto FAILED;
4630 }
4631 cb->bracount++;
4632 *parsed_pattern++ = META_CAPTURE | cb->bracount;
4633 nest_depth++;
4634
4635 /* Check not too many names */
4636
4637 if (cb->names_found >= MAX_NAME_COUNT)
4638 {
4639 errorcode = ERR49;
4640 goto FAILED;
4641 }
4642
4643 /* Adjust the entry size to accommodate the longest name found. */
4644
4645 if (namelen + IMM2_SIZE + 1 > cb->name_entry_size)
4646 cb->name_entry_size = (uint16_t)(namelen + IMM2_SIZE + 1);
4647
4648 /* Scan the list to check for duplicates. For duplicate names, if the
4649 number is the same, break the loop, which causes the name to be
4650 discarded; otherwise, if DUPNAMES is not set, give an error.
4651 If it is set, allow the name with a different number, but continue
4652 scanning in case this is a duplicate with the same number. For
4653 non-duplicate names, give an error if the number is duplicated. */
4654
4655 isdupname = FALSE;
4656 ng = cb->named_groups;
4657 for (i = 0; i < cb->names_found; i++, ng++)
4658 {
4659 if (namelen == ng->length &&
4660 PRIV(strncmp)(name, ng->name, (PCRE2_SIZE)namelen) == 0)
4661 {
4662 if (ng->number == cb->bracount) break;
4663 if ((options & PCRE2_DUPNAMES) == 0)
4664 {
4665 errorcode = ERR43;
4666 goto FAILED;
4667 }
4668 isdupname = ng->isdup = TRUE; /* Mark as a duplicate */
4669 cb->dupnames = TRUE; /* Duplicate names exist */
4670 }
4671 else if (ng->number == cb->bracount)
4672 {
4673 errorcode = ERR65;
4674 goto FAILED;
4675 }
4676 }
4677
4678 if (i < cb->names_found) break; /* Ignore duplicate with same number */
4679
4680 /* Increase the list size if necessary */
4681
4682 if (cb->names_found >= cb->named_group_list_size)
4683 {
4684 uint32_t newsize = cb->named_group_list_size * 2;
4685 named_group *newspace =
4686 cb->cx->memctl.malloc(newsize * sizeof(named_group),
4687 cb->cx->memctl.memory_data);
4688 if (newspace == NULL)
4689 {
4690 errorcode = ERR21;
4691 goto FAILED;
4692 }
4693
4694 memcpy(newspace, cb->named_groups,
4695 cb->named_group_list_size * sizeof(named_group));
4696 if (cb->named_group_list_size > NAMED_GROUP_LIST_SIZE)
4697 cb->cx->memctl.free((void *)cb->named_groups,
4698 cb->cx->memctl.memory_data);
4699 cb->named_groups = newspace;
4700 cb->named_group_list_size = newsize;
4701 }
4702
4703 /* Add this name to the list */
4704
4705 cb->named_groups[cb->names_found].name = name;
4706 cb->named_groups[cb->names_found].length = (uint16_t)namelen;
4707 cb->named_groups[cb->names_found].number = cb->bracount;
4708 cb->named_groups[cb->names_found].isdup = (uint16_t)isdupname;
4709 cb->names_found++;
4710 break;
4711 } /* End of (? switch */
4712 break; /* End of ( handling */
4713
4714
4715 /* ---- Branch terminators ---- */
4716
4717 /* Alternation: reset the capture count if we are in a (?| group. */
4718
4719 case CHAR_VERTICAL_LINE:
4720 if (top_nest != NULL && top_nest->nest_depth == nest_depth &&
4721 (top_nest->flags & NSF_RESET) != 0)
4722 {
4723 if (cb->bracount > top_nest->max_group)
4724 top_nest->max_group = (uint16_t)cb->bracount;
4725 cb->bracount = top_nest->reset_group;
4726 }
4727 *parsed_pattern++ = META_ALT;
4728 break;
4729
4730 /* End of group; reset the capture count to the maximum if we are in a (?|
4731 group and/or reset the options that are tracked during parsing. Disallow
4732 quantifier for a condition that is an assertion. */
4733
4734 case CHAR_RIGHT_PARENTHESIS:
4735 okquantifier = TRUE;
4736 if (top_nest != NULL && top_nest->nest_depth == nest_depth)
4737 {
4738 options = (options & ~PARSE_TRACKED_OPTIONS) | top_nest->options;
4739 if ((top_nest->flags & NSF_RESET) != 0 &&
4740 top_nest->max_group > cb->bracount)
4741 cb->bracount = top_nest->max_group;
4742 if ((top_nest->flags & NSF_CONDASSERT) != 0)
4743 okquantifier = FALSE;
4744
4745 if ((top_nest->flags & NSF_ATOMICSR) != 0)
4746 {
4747 *parsed_pattern++ = META_KET;
4748 }
4749
4750 if (top_nest == (nest_save *)(cb->start_workspace)) top_nest = NULL;
4751 else top_nest--;
4752 }
4753 if (nest_depth == 0) /* Unmatched closing parenthesis */
4754 {
4755 errorcode = ERR22;
4756 goto FAILED_BACK;
4757 }
4758 nest_depth--;
4759 *parsed_pattern++ = META_KET;
4760 break;
4761 } /* End of switch on pattern character */
4762 } /* End of main character scan loop */
4763
4764/* End of pattern reached. Check for missing ) at the end of a verb name. */
4765
4766if (inverbname && ptr >= ptrend)
4767 {
4768 errorcode = ERR60;
4769 goto FAILED;
4770 }
4771
4772/* Manage callout for the final item */
4773
4774PARSED_END:
4775parsed_pattern = manage_callouts(ptr, &previous_callout, auto_callout,
4776 parsed_pattern, cb);
4777
4778/* Insert trailing items for word and line matching (features provided for the
4779benefit of pcre2grep). */
4780
4781if ((extra_options & PCRE2_EXTRA_MATCH_LINE) != 0)
4782 {
4783 *parsed_pattern++ = META_KET;
4784 *parsed_pattern++ = META_DOLLAR;
4785 }
4786else if ((extra_options & PCRE2_EXTRA_MATCH_WORD) != 0)
4787 {
4788 *parsed_pattern++ = META_KET;
4789 *parsed_pattern++ = META_ESCAPE + ESC_b;
4790 }
4791
4792/* Terminate the parsed pattern, then return success if all groups are closed.
4793Otherwise we have unclosed parentheses. */
4794
4795if (parsed_pattern >= parsed_pattern_end)
4796 {
4797 errorcode = ERR63; /* Internal error (parsed pattern overflow) */
4798 goto FAILED;
4799 }
4800
4801*parsed_pattern = META_END;
4802if (nest_depth == 0) return 0;
4803
4804UNCLOSED_PARENTHESIS:
4805errorcode = ERR14;
4806
4807/* Come here for all failures. */
4808
4809FAILED:
4810cb->erroroffset = (PCRE2_SIZE)(ptr - cb->start_pattern);
4811return errorcode;
4812
4813/* Some errors need to indicate the previous character. */
4814
4815FAILED_BACK:
4816ptr--;
4817goto FAILED;
4818
4819/* This failure happens several times. */
4820
4821BAD_VERSION_CONDITION:
4822errorcode = ERR79;
4823goto FAILED;
4824}
4825
4826
4827
4828/*************************************************
4829* Find first significant opcode *
4830*************************************************/
4831
4832/* This is called by several functions that scan a compiled expression looking
4833for a fixed first character, or an anchoring opcode etc. It skips over things
4834that do not influence this. For some calls, it makes sense to skip negative
4835forward and all backward assertions, and also the \b assertion; for others it
4836does not.
4837
4838Arguments:
4839 code pointer to the start of the group
4840 skipassert TRUE if certain assertions are to be skipped
4841
4842Returns: pointer to the first significant opcode
4843*/
4844
4845static const PCRE2_UCHAR*
4846first_significant_code(PCRE2_SPTR code, BOOL skipassert)
4847{
4848for (;;)
4849 {
4850 switch ((int)*code)
4851 {
4852 case OP_ASSERT_NOT:
4853 case OP_ASSERTBACK:
4854 case OP_ASSERTBACK_NOT:
4855 case OP_ASSERTBACK_NA:
4856 if (!skipassert) return code;
4857 do code += GET(code, 1); while (*code == OP_ALT);
4858 code += PRIV(OP_lengths)[*code];
4859 break;
4860
4861 case OP_WORD_BOUNDARY:
4862 case OP_NOT_WORD_BOUNDARY:
4863 if (!skipassert) return code;
4864 /* Fall through */
4865
4866 case OP_CALLOUT:
4867 case OP_CREF:
4868 case OP_DNCREF:
4869 case OP_RREF:
4870 case OP_DNRREF:
4871 case OP_FALSE:
4872 case OP_TRUE:
4873 code += PRIV(OP_lengths)[*code];
4874 break;
4875
4876 case OP_CALLOUT_STR:
4877 code += GET(code, 1 + 2*LINK_SIZE);
4878 break;
4879
4880 case OP_SKIPZERO:
4881 code += 2 + GET(code, 2) + LINK_SIZE;
4882 break;
4883
4884 case OP_COND:
4885 case OP_SCOND:
4886 if (code[1+LINK_SIZE] != OP_FALSE || /* Not DEFINE */
4887 code[GET(code, 1)] != OP_KET) /* More than one branch */
4888 return code;
4889 code += GET(code, 1) + 1 + LINK_SIZE;
4890 break;
4891
4892 case OP_MARK:
4893 case OP_COMMIT_ARG:
4894 case OP_PRUNE_ARG:
4895 case OP_SKIP_ARG:
4896 case OP_THEN_ARG:
4897 code += code[1] + PRIV(OP_lengths)[*code];
4898 break;
4899
4900 default:
4901 return code;
4902 }
4903 }
4904/* Control never reaches here */
4905}
4906
4907
4908
#ifdef SUPPORT_UNICODE
/*************************************************
*           Get othercase range                  *
*************************************************/

/* This function is passed the start and end of a class range in UCP mode. It
searches up the characters, looking for ranges of characters in the "other"
case. Each call returns the next one, updating the start address. A character
with multiple other cases is returned on its own with a special return value.

Unicode defines no code points above 0x10ffff, so nothing beyond that value
can have another case. The end of the range is therefore clamped to 0x10ffff
before scanning. Without the clamp, a range ending at 0xffffffff (possible in
the 32-bit library when UCP is set without UTF) makes the test "c <= d" always
true for a uint32_t counter, and the scan never terminates.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        -1 when no more
               0 when a range is returned
              >0 the CASESET offset for char with multiple other cases
                in this case, ocptr contains the original
*/

static int
get_othercase_range(uint32_t *cptr, uint32_t d, uint32_t *ocptr,
  uint32_t *odptr)
{
uint32_t c, othercase, next;
unsigned int co;

/* Values above the Unicode maximum end the range; see the header comment. */

if (d > 0x10ffffu) d = 0x10ffffu;

/* Find the first character that has an other case. If it has multiple other
cases, return its case offset value. */

for (c = *cptr; c <= d; c++)
  {
  if ((co = UCD_CASESET(c)) != 0)
    {
    *ocptr = c++;   /* Character that has the set */
    *cptr = c;      /* Rest of input range */
    return (int)co;
    }
  if ((othercase = UCD_OTHERCASE(c)) != c) break;
  }

if (c > d) return -1;  /* Reached end of range */

/* Found a character that has a single other case. Search for the end of the
range, which is either the end of the input range, or a character that has zero
or more than one other cases. */

*ocptr = othercase;
next = othercase + 1;

for (++c; c <= d; c++)
  {
  if ((co = UCD_CASESET(c)) != 0 || UCD_OTHERCASE(c) != next) break;
  next++;
  }

*odptr = next - 1;     /* End of othercase range */
*cptr = c;             /* Rest of input range */
return 0;
}
#endif  /* SUPPORT_UNICODE */
4972
4973
4974
4975/*************************************************
4976* Add a character or range to a class (internal) *
4977*************************************************/
4978
4979/* This function packages up the logic of adding a character or range of
4980characters to a class. The character values in the arguments will be within the
4981valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
4982called only from within the "add to class" group of functions, some of which
4983are recursive and mutually recursive. The external entry point is
4984add_to_class().
4985
4986Arguments:
4987 classbits the bit map for characters < 256
4988 uchardptr points to the pointer for extra data
4989 options the options word
4990 cb compile data
4991 start start of range character
4992 end end of range character
4993
4994Returns: the number of < 256 characters added
4995 the pointer to extra data is updated
4996*/
4997
4998static unsigned int
4999add_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
5000 uint32_t options, compile_block *cb, uint32_t start, uint32_t end)
5001{
5002uint32_t c;
5003uint32_t classbits_end = (end <= 0xff ? end : 0xff);
5004unsigned int n8 = 0;
5005
5006/* If caseless matching is required, scan the range and process alternate
5007cases. In Unicode, there are 8-bit characters that have alternate cases that
5008are greater than 255 and vice-versa. Sometimes we can just extend the original
5009range. */
5010
5011if ((options & PCRE2_CASELESS) != 0)
5012 {
5013#ifdef SUPPORT_UNICODE
5014 if ((options & (PCRE2_UTF|PCRE2_UCP)) != 0)
5015 {
5016 int rc;
5017 uint32_t oc, od;
5018
5019 options &= ~PCRE2_CASELESS; /* Remove for recursive calls */
5020 c = start;
5021
5022 while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0)
5023 {
5024 /* Handle a single character that has more than one other case. */
5025
5026 if (rc > 0) n8 += add_list_to_class_internal(classbits, uchardptr, options, cb,
5027 PRIV(ucd_caseless_sets) + rc, oc);
5028
5029 /* Do nothing if the other case range is within the original range. */
5030
5031 else if (oc >= cb->class_range_start && od <= cb->class_range_end) continue;
5032
5033 /* Extend the original range if there is overlap, noting that if oc < c, we
5034 can't have od > end because a subrange is always shorter than the basic
5035 range. Otherwise, use a recursive call to add the additional range. */
5036
5037 else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
5038 else if (od > end && oc <= end + 1)
5039 {
5040 end = od; /* Extend upwards */
5041 if (end > classbits_end) classbits_end = (end <= 0xff ? end : 0xff);
5042 }
5043 else n8 += add_to_class_internal(classbits, uchardptr, options, cb, oc, od);
5044 }
5045 }
5046 else
5047#endif /* SUPPORT_UNICODE */
5048
5049 /* Not UTF mode */
5050
5051 for (c = start; c <= classbits_end; c++)
5052 {
5053 SETBIT(classbits, cb->fcc[c]);
5054 n8++;
5055 }
5056 }
5057
5058/* Now handle the originally supplied range. Adjust the final value according
5059to the bit length - this means that the same lists of (e.g.) horizontal spaces
5060can be used in all cases. */
5061
5062if ((options & PCRE2_UTF) == 0 && end > MAX_NON_UTF_CHAR)
5063 end = MAX_NON_UTF_CHAR;
5064
5065if (start > cb->class_range_start && end < cb->class_range_end) return n8;
5066
5067/* Use the bitmap for characters < 256. Otherwise use extra data.*/
5068
5069for (c = start; c <= classbits_end; c++)
5070 {
5071 /* Regardless of start, c will always be <= 255. */
5072 SETBIT(classbits, c);
5073 n8++;
5074 }
5075
5076#ifdef SUPPORT_WIDE_CHARS
5077if (start <= 0xff) start = 0xff + 1;
5078
5079if (end >= start)
5080 {
5081 PCRE2_UCHAR *uchardata = *uchardptr;
5082
5083#ifdef SUPPORT_UNICODE
5084 if ((options & PCRE2_UTF) != 0)
5085 {
5086 if (start < end)
5087 {
5088 *uchardata++ = XCL_RANGE;
5089 uchardata += PRIV(ord2utf)(start, uchardata);
5090 uchardata += PRIV(ord2utf)(end, uchardata);
5091 }
5092 else if (start == end)
5093 {
5094 *uchardata++ = XCL_SINGLE;
5095 uchardata += PRIV(ord2utf)(start, uchardata);
5096 }
5097 }
5098 else
5099#endif /* SUPPORT_UNICODE */
5100
5101 /* Without UTF support, character values are constrained by the bit length,
5102 and can only be > 256 for 16-bit and 32-bit libraries. */
5103
5104#if PCRE2_CODE_UNIT_WIDTH == 8
5105 {}
5106#else
5107 if (start < end)
5108 {
5109 *uchardata++ = XCL_RANGE;
5110 *uchardata++ = start;
5111 *uchardata++ = end;
5112 }
5113 else if (start == end)
5114 {
5115 *uchardata++ = XCL_SINGLE;
5116 *uchardata++ = start;
5117 }
5118#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
5119 *uchardptr = uchardata; /* Updata extra data pointer */
5120 }
5121#else /* SUPPORT_WIDE_CHARS */
5122 (void)uchardptr; /* Avoid compiler warning */
5123#endif /* SUPPORT_WIDE_CHARS */
5124
5125return n8; /* Number of 8-bit characters */
5126}
5127
5128
5129
#ifdef SUPPORT_UNICODE
/*************************************************
* Add a list of characters to a class (internal) *
*************************************************/

/* Add a list of case-equivalent characters to a class when in UTF mode. Runs
of consecutive values in the list are added as a single range. This function
is called only from within add_to_class_internal(), with which it is mutually
recursive.

Arguments:
  classbits     the bit map for characters < 256
  uchardptr     points to the pointer for extra data
  options       the options word
  cb            contains pointers to tables etc.
  p             points to row of 32-bit values, terminated by NOTACHAR
  except        character to omit; this is used when adding lists of
                  case-equivalent characters to avoid including the one we
                  already know about

Returns:        the number of < 256 characters added
                the pointer to extra data is updated
*/

static unsigned int
add_list_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
  uint32_t options, compile_block *cb, const uint32_t *p, unsigned int except)
{
unsigned int added = 0;
unsigned int run;

/* Each pass handles one list entry; "run" counts how many further entries
directly follow it in value, so the step at the end moves past the whole run.
An entry equal to "except" is stepped over on its own. */

for (; p[0] < NOTACHAR; p += run + 1)
  {
  run = 0;
  if (p[0] == except) continue;
  while (p[run + 1] == p[0] + run + 1) run++;
  added += add_to_class_internal(classbits, uchardptr, options, cb, p[0],
    p[run]);
  }

return added;
}
#endif
5171
5172
5173
5174/*************************************************
5175* External entry point for add range to class *
5176*************************************************/
5177
5178/* This function sets the overall range so that the internal functions can try
5179to avoid duplication when handling case-independence.
5180
5181Arguments:
5182 classbits the bit map for characters < 256
5183 uchardptr points to the pointer for extra data
5184 options the options word
5185 cb compile data
5186 start start of range character
5187 end end of range character
5188
5189Returns: the number of < 256 characters added
5190 the pointer to extra data is updated
5191*/
5192
5193static unsigned int
5194add_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
5195 compile_block *cb, uint32_t start, uint32_t end)
5196{
5197cb->class_range_start = start;
5198cb->class_range_end = end;
5199return add_to_class_internal(classbits, uchardptr, options, cb, start, end);
5200}
5201
5202
5203/*************************************************
5204* External entry point for add list to class *
5205*************************************************/
5206
5207/* This function is used for adding a list of horizontal or vertical whitespace
5208characters to a class. The list must be in order so that ranges of characters
5209can be detected and handled appropriately. This function sets the overall range
5210so that the internal functions can try to avoid duplication when handling
5211case-independence.
5212
5213Arguments:
5214 classbits the bit map for characters < 256
5215 uchardptr points to the pointer for extra data
5216 options the options word
5217 cb contains pointers to tables etc.
5218 p points to row of 32-bit values, terminated by NOTACHAR
5219 except character to omit; this is used when adding lists of
5220 case-equivalent characters to avoid including the one we
5221 already know about
5222
5223Returns: the number of < 256 characters added
5224 the pointer to extra data is updated
5225*/
5226
5227static unsigned int
5228add_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
5229 compile_block *cb, const uint32_t *p, unsigned int except)
5230{
5231unsigned int n8 = 0;
5232while (p[0] < NOTACHAR)
5233 {
5234 unsigned int n = 0;
5235 if (p[0] != except)
5236 {
5237 while(p[n+1] == p[0] + n + 1) n++;
5238 cb->class_range_start = p[0];
5239 cb->class_range_end = p[n];
5240 n8 += add_to_class_internal(classbits, uchardptr, options, cb, p[0], p[n]);
5241 }
5242 p += n + 1;
5243 }
5244return n8;
5245}
5246
5247
5248
5249/*************************************************
5250* Add characters not in a list to a class *
5251*************************************************/
5252
5253/* This function is used for adding the complement of a list of horizontal or
5254vertical whitespace to a class. The list must be in order.
5255
5256Arguments:
5257 classbits the bit map for characters < 256
5258 uchardptr points to the pointer for extra data
5259 options the options word
5260 cb contains pointers to tables etc.
5261 p points to row of 32-bit values, terminated by NOTACHAR
5262
5263Returns: the number of < 256 characters added
5264 the pointer to extra data is updated
5265*/
5266
5267static unsigned int
5268add_not_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
5269 uint32_t options, compile_block *cb, const uint32_t *p)
5270{
5271BOOL utf = (options & PCRE2_UTF) != 0;
5272unsigned int n8 = 0;
5273if (p[0] > 0)
5274 n8 += add_to_class(classbits, uchardptr, options, cb, 0, p[0] - 1);
5275while (p[0] < NOTACHAR)
5276 {
5277 while (p[1] == p[0] + 1) p++;
5278 n8 += add_to_class(classbits, uchardptr, options, cb, p[0] + 1,
5279 (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1);
5280 p++;
5281 }
5282return n8;
5283}
5284
5285
5286
5287/*************************************************
5288* Find details of duplicate group names *
5289*************************************************/
5290
5291/* This is called from compile_branch() when it needs to know the index and
5292count of duplicates in the names table when processing named backreferences,
5293either directly, or as conditions.
5294
5295Arguments:
5296 name points to the name
5297 length the length of the name
5298 indexptr where to put the index
5299 countptr where to put the count of duplicates
5300 errorcodeptr where to put an error code
5301 cb the compile block
5302
5303Returns: TRUE if OK, FALSE if not, error code set
5304*/
5305
5306static BOOL
5307find_dupname_details(PCRE2_SPTR name, uint32_t length, int *indexptr,
5308 int *countptr, int *errorcodeptr, compile_block *cb)
5309{
5310uint32_t i, groupnumber;
5311int count;
5312PCRE2_UCHAR *slot = cb->name_table;
5313
5314/* Find the first entry in the table */
5315
5316for (i = 0; i < cb->names_found; i++)
5317 {
5318 if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) == 0 &&
5319 slot[IMM2_SIZE+length] == 0) break;
5320 slot += cb->name_entry_size;
5321 }
5322
5323/* This should not occur, because this function is called only when we know we
5324have duplicate names. Give an internal error. */
5325
5326if (i >= cb->names_found)
5327 {
5328 *errorcodeptr = ERR53;
5329 cb->erroroffset = name - cb->start_pattern;
5330 return FALSE;
5331 }
5332
5333/* Record the index and then see how many duplicates there are, updating the
5334backref map and maximum back reference as we do. */
5335
5336*indexptr = i;
5337count = 0;
5338
5339for (;;)
5340 {
5341 count++;
5342 groupnumber = GET2(slot,0);
5343 cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;
5344 if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
5345 if (++i >= cb->names_found) break;
5346 slot += cb->name_entry_size;
5347 if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) != 0 ||
5348 (slot+IMM2_SIZE)[length] != 0) break;
5349 }
5350
5351*countptr = count;
5352return TRUE;
5353}
5354
5355
5356
5357/*************************************************
5358* Compile one branch *
5359*************************************************/
5360
5361/* Scan the parsed pattern, compiling it into the a vector of PCRE2_UCHAR. If
5362the options are changed during the branch, the pointer is used to change the
5363external options bits. This function is used during the pre-compile phase when
5364we are trying to find out the amount of memory needed, as well as during the
5365real compile phase. The value of lengthptr distinguishes the two phases.
5366
5367Arguments:
5368 optionsptr pointer to the option bits
5369 codeptr points to the pointer to the current code point
5370 pptrptr points to the current parsed pattern pointer
5371 errorcodeptr points to error code variable
5372 firstcuptr place to put the first required code unit
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07005373 firstcuflagsptr place to put the first code unit flags
Elliott Hughes5b808042021-10-01 10:56:10 -07005374 reqcuptr place to put the last required code unit
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07005375 reqcuflagsptr place to put the last required code unit flags
Elliott Hughes5b808042021-10-01 10:56:10 -07005376 bcptr points to current branch chain
5377 cb contains pointers to tables etc.
5378 lengthptr NULL during the real compile phase
5379 points to length accumulator during pre-compile phase
5380
5381Returns: 0 There's been an error, *errorcodeptr is non-zero
5382 +1 Success, this branch must match at least one character
5383 -1 Success, this branch may match an empty string
5384*/
5385
5386static int
5387compile_branch(uint32_t *optionsptr, PCRE2_UCHAR **codeptr, uint32_t **pptrptr,
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07005388 int *errorcodeptr, uint32_t *firstcuptr, uint32_t *firstcuflagsptr,
5389 uint32_t *reqcuptr, uint32_t *reqcuflagsptr, branch_chain *bcptr,
Elliott Hughes5b808042021-10-01 10:56:10 -07005390 compile_block *cb, PCRE2_SIZE *lengthptr)
5391{
5392int bravalue = 0;
5393int okreturn = -1;
5394int group_return = 0;
5395uint32_t repeat_min = 0, repeat_max = 0; /* To please picky compilers */
5396uint32_t greedy_default, greedy_non_default;
5397uint32_t repeat_type, op_type;
5398uint32_t options = *optionsptr; /* May change dynamically */
5399uint32_t firstcu, reqcu;
5400uint32_t zeroreqcu, zerofirstcu;
5401uint32_t escape;
5402uint32_t *pptr = *pptrptr;
5403uint32_t meta, meta_arg;
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07005404uint32_t firstcuflags, reqcuflags;
5405uint32_t zeroreqcuflags, zerofirstcuflags;
5406uint32_t req_caseopt, reqvary, tempreqvary;
Elliott Hughes5b808042021-10-01 10:56:10 -07005407PCRE2_SIZE offset = 0;
5408PCRE2_SIZE length_prevgroup = 0;
5409PCRE2_UCHAR *code = *codeptr;
5410PCRE2_UCHAR *last_code = code;
5411PCRE2_UCHAR *orig_code = code;
5412PCRE2_UCHAR *tempcode;
5413PCRE2_UCHAR *previous = NULL;
5414PCRE2_UCHAR op_previous;
5415BOOL groupsetfirstcu = FALSE;
5416BOOL had_accept = FALSE;
5417BOOL matched_char = FALSE;
5418BOOL previous_matched_char = FALSE;
5419BOOL reset_caseful = FALSE;
5420const uint8_t *cbits = cb->cbits;
5421uint8_t classbits[32];
5422
5423/* We can fish out the UTF setting once and for all into a BOOL, but we must
5424not do this for other options (e.g. PCRE2_EXTENDED) because they may change
5425dynamically as we process the pattern. */
5426
5427#ifdef SUPPORT_UNICODE
5428BOOL utf = (options & PCRE2_UTF) != 0;
5429BOOL ucp = (options & PCRE2_UCP) != 0;
5430#else /* No Unicode support */
5431BOOL utf = FALSE;
5432#endif
5433
5434/* Helper variables for OP_XCLASS opcode (for characters > 255). We define
5435class_uchardata always so that it can be passed to add_to_class() always,
5436though it will not be used in non-UTF 8-bit cases. This avoids having to supply
5437alternative calls for the different cases. */
5438
5439PCRE2_UCHAR *class_uchardata;
5440#ifdef SUPPORT_WIDE_CHARS
5441BOOL xclass;
5442PCRE2_UCHAR *class_uchardata_base;
5443#endif
5444
5445/* Set up the default and non-default settings for greediness */
5446
5447greedy_default = ((options & PCRE2_UNGREEDY) != 0);
5448greedy_non_default = greedy_default ^ 1;
5449
5450/* Initialize no first unit, no required unit. REQ_UNSET means "no char
5451matching encountered yet". It gets changed to REQ_NONE if we hit something that
5452matches a non-fixed first unit; reqcu just remains unset if we never find one.
5453
5454When we hit a repeat whose minimum is zero, we may have to adjust these values
5455to take the zero repeat into account. This is implemented by setting them to
5456zerofirstcu and zeroreqcu when such a repeat is encountered. The individual
5457item types that can be repeated set these backoff variables appropriately. */
5458
5459firstcu = reqcu = zerofirstcu = zeroreqcu = 0;
5460firstcuflags = reqcuflags = zerofirstcuflags = zeroreqcuflags = REQ_UNSET;
5461
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07005462/* The variable req_caseopt contains either the REQ_CASELESS bit or zero,
Elliott Hughes5b808042021-10-01 10:56:10 -07005463according to the current setting of the caseless flag. The REQ_CASELESS value
5464leaves the lower 28 bit empty. It is added into the firstcu or reqcu variables
5465to record the case status of the value. This is used only for ASCII characters.
5466*/
5467
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07005468req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0;
Elliott Hughes5b808042021-10-01 10:56:10 -07005469
5470/* Switch on next META item until the end of the branch */
5471
5472for (;; pptr++)
5473 {
5474#ifdef SUPPORT_WIDE_CHARS
5475 BOOL xclass_has_prop;
5476#endif
5477 BOOL negate_class;
5478 BOOL should_flip_negation;
5479 BOOL match_all_or_no_wide_chars;
5480 BOOL possessive_quantifier;
5481 BOOL note_group_empty;
5482 int class_has_8bitchar;
Elliott Hughes5b808042021-10-01 10:56:10 -07005483 uint32_t mclength;
5484 uint32_t skipunits;
5485 uint32_t subreqcu, subfirstcu;
5486 uint32_t groupnumber;
5487 uint32_t verbarglen, verbculen;
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07005488 uint32_t subreqcuflags, subfirstcuflags;
Elliott Hughes5b808042021-10-01 10:56:10 -07005489 open_capitem *oc;
5490 PCRE2_UCHAR mcbuffer[8];
5491
5492 /* Get next META item in the pattern and its potential argument. */
5493
5494 meta = META_CODE(*pptr);
5495 meta_arg = META_DATA(*pptr);
5496
5497 /* If we are in the pre-compile phase, accumulate the length used for the
5498 previous cycle of this loop, unless the next item is a quantifier. */
5499
5500 if (lengthptr != NULL)
5501 {
5502 if (code > cb->start_workspace + cb->workspace_size -
5503 WORK_SIZE_SAFETY_MARGIN) /* Check for overrun */
5504 {
5505 *errorcodeptr = (code >= cb->start_workspace + cb->workspace_size)?
5506 ERR52 : ERR86;
5507 return 0;
5508 }
5509
5510 /* There is at least one situation where code goes backwards: this is the
5511 case of a zero quantifier after a class (e.g. [ab]{0}). When the quantifier
5512 is processed, the whole class is eliminated. However, it is created first,
5513 so we have to allow memory for it. Therefore, don't ever reduce the length
5514 at this point. */
5515
5516 if (code < last_code) code = last_code;
5517
5518 /* If the next thing is not a quantifier, we add the length of the previous
5519 item into the total, and reset the code pointer to the start of the
5520 workspace. Otherwise leave the previous item available to be quantified. */
5521
5522 if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
5523 {
5524 if (OFLOW_MAX - *lengthptr < (PCRE2_SIZE)(code - orig_code))
5525 {
5526 *errorcodeptr = ERR20; /* Integer overflow */
5527 return 0;
5528 }
5529 *lengthptr += (PCRE2_SIZE)(code - orig_code);
5530 if (*lengthptr > MAX_PATTERN_SIZE)
5531 {
5532 *errorcodeptr = ERR20; /* Pattern is too large */
5533 return 0;
5534 }
5535 code = orig_code;
5536 }
5537
5538 /* Remember where this code item starts so we can catch the "backwards"
5539 case above next time round. */
5540
5541 last_code = code;
5542 }
5543
5544 /* Process the next parsed pattern item. If it is not a quantifier, remember
5545 where it starts so that it can be quantified when a quantifier follows.
5546 Checking for the legality of quantifiers happens in parse_regex(), except for
5547 a quantifier after an assertion that is a condition. */
5548
5549 if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
5550 {
5551 previous = code;
5552 if (matched_char && !had_accept) okreturn = 1;
5553 }
5554
5555 previous_matched_char = matched_char;
5556 matched_char = FALSE;
5557 note_group_empty = FALSE;
5558 skipunits = 0; /* Default value for most subgroups */
5559
5560 switch(meta)
5561 {
5562 /* ===================================================================*/
5563 /* The branch terminates at pattern end or | or ) */
5564
5565 case META_END:
5566 case META_ALT:
5567 case META_KET:
5568 *firstcuptr = firstcu;
5569 *firstcuflagsptr = firstcuflags;
5570 *reqcuptr = reqcu;
5571 *reqcuflagsptr = reqcuflags;
5572 *codeptr = code;
5573 *pptrptr = pptr;
5574 return okreturn;
5575
5576
5577 /* ===================================================================*/
5578 /* Handle single-character metacharacters. In multiline mode, ^ disables
5579 the setting of any following char as a first character. */
5580
5581 case META_CIRCUMFLEX:
5582 if ((options & PCRE2_MULTILINE) != 0)
5583 {
5584 if (firstcuflags == REQ_UNSET)
5585 zerofirstcuflags = firstcuflags = REQ_NONE;
5586 *code++ = OP_CIRCM;
5587 }
5588 else *code++ = OP_CIRC;
5589 break;
5590
5591 case META_DOLLAR:
5592 *code++ = ((options & PCRE2_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
5593 break;
5594
5595 /* There can never be a first char if '.' is first, whatever happens about
5596 repeats. The value of reqcu doesn't change either. */
5597
5598 case META_DOT:
5599 matched_char = TRUE;
5600 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5601 zerofirstcu = firstcu;
5602 zerofirstcuflags = firstcuflags;
5603 zeroreqcu = reqcu;
5604 zeroreqcuflags = reqcuflags;
5605 *code++ = ((options & PCRE2_DOTALL) != 0)? OP_ALLANY: OP_ANY;
5606 break;
5607
5608
5609 /* ===================================================================*/
5610 /* Empty character classes are allowed if PCRE2_ALLOW_EMPTY_CLASS is set.
5611 Otherwise, an initial ']' is taken as a data character. When empty classes
5612 are allowed, [] must always fail, so generate OP_FAIL, whereas [^] must
5613 match any character, so generate OP_ALLANY. */
5614
5615 case META_CLASS_EMPTY:
5616 case META_CLASS_EMPTY_NOT:
5617 matched_char = TRUE;
5618 *code++ = (meta == META_CLASS_EMPTY_NOT)? OP_ALLANY : OP_FAIL;
5619 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5620 zerofirstcu = firstcu;
5621 zerofirstcuflags = firstcuflags;
5622 break;
5623
5624
5625 /* ===================================================================*/
5626 /* Non-empty character class. If the included characters are all < 256, we
5627 build a 32-byte bitmap of the permitted characters, except in the special
5628 case where there is only one such character. For negated classes, we build
5629 the map as usual, then invert it at the end. However, we use a different
5630 opcode so that data characters > 255 can be handled correctly.
5631
5632 If the class contains characters outside the 0-255 range, a different
5633 opcode is compiled. It may optionally have a bit map for characters < 256,
5634 but those above are are explicitly listed afterwards. A flag code unit
5635 tells whether the bitmap is present, and whether this is a negated class or
5636 not. */
5637
5638 case META_CLASS_NOT:
5639 case META_CLASS:
5640 matched_char = TRUE;
5641 negate_class = meta == META_CLASS_NOT;
5642
5643 /* We can optimize the case of a single character in a class by generating
5644 OP_CHAR or OP_CHARI if it's positive, or OP_NOT or OP_NOTI if it's
5645 negative. In the negative case there can be no first char if this item is
5646 first, whatever repeat count may follow. In the case of reqcu, save the
5647 previous value for reinstating. */
5648
5649 /* NOTE: at present this optimization is not effective if the only
5650 character in a class in 32-bit, non-UCP mode has its top bit set. */
5651
5652 if (pptr[1] < META_END && pptr[2] == META_CLASS_END)
5653 {
5654#ifdef SUPPORT_UNICODE
5655 uint32_t d;
5656#endif
5657 uint32_t c = pptr[1];
5658
5659 pptr += 2; /* Move on to class end */
5660 if (meta == META_CLASS) /* A positive one-char class can be */
5661 { /* handled as a normal literal character. */
5662 meta = c; /* Set up the character */
5663 goto NORMAL_CHAR_SET;
5664 }
5665
5666 /* Handle a negative one-character class */
5667
5668 zeroreqcu = reqcu;
5669 zeroreqcuflags = reqcuflags;
5670 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5671 zerofirstcu = firstcu;
5672 zerofirstcuflags = firstcuflags;
5673
5674 /* For caseless UTF or UCP mode, check whether this character has more
5675 than one other case. If so, generate a special OP_NOTPROP item instead of
5676 OP_NOTI. */
5677
5678#ifdef SUPPORT_UNICODE
5679 if ((utf||ucp) && (options & PCRE2_CASELESS) != 0 &&
5680 (d = UCD_CASESET(c)) != 0)
5681 {
5682 *code++ = OP_NOTPROP;
5683 *code++ = PT_CLIST;
5684 *code++ = d;
5685 break; /* We are finished with this class */
5686 }
5687#endif
5688 /* Char has only one other case, or UCP not available */
5689
5690 *code++ = ((options & PCRE2_CASELESS) != 0)? OP_NOTI: OP_NOT;
5691 code += PUTCHAR(c, code);
5692 break; /* We are finished with this class */
5693 } /* End of 1-char optimization */
5694
5695 /* Handle character classes that contain more than just one literal
5696 character. If there are exactly two characters in a positive class, see if
5697 they are case partners. This can be optimized to generate a caseless single
5698 character match (which also sets first/required code units if relevant). */
5699
5700 if (meta == META_CLASS && pptr[1] < META_END && pptr[2] < META_END &&
5701 pptr[3] == META_CLASS_END)
5702 {
5703 uint32_t c = pptr[1];
5704
5705#ifdef SUPPORT_UNICODE
5706 if (UCD_CASESET(c) == 0)
5707#endif
5708 {
5709 uint32_t d;
5710
5711#ifdef SUPPORT_UNICODE
5712 if ((utf || ucp) && c > 127) d = UCD_OTHERCASE(c); else
5713#endif
5714 {
5715#if PCRE2_CODE_UNIT_WIDTH != 8
5716 if (c > 255) d = c; else
5717#endif
5718 d = TABLE_GET(c, cb->fcc, c);
5719 }
5720
5721 if (c != d && pptr[2] == d)
5722 {
5723 pptr += 3; /* Move on to class end */
5724 meta = c;
5725 if ((options & PCRE2_CASELESS) == 0)
5726 {
5727 reset_caseful = TRUE;
5728 options |= PCRE2_CASELESS;
5729 req_caseopt = REQ_CASELESS;
5730 }
5731 goto CLASS_CASELESS_CHAR;
5732 }
5733 }
5734 }
5735
5736 /* If a non-extended class contains a negative special such as \S, we need
5737 to flip the negation flag at the end, so that support for characters > 255
5738 works correctly (they are all included in the class). An extended class may
5739 need to insert specific matching or non-matching code for wide characters.
5740 */
5741
5742 should_flip_negation = match_all_or_no_wide_chars = FALSE;
5743
5744 /* Extended class (xclass) will be used when characters > 255
5745 might match. */
5746
5747#ifdef SUPPORT_WIDE_CHARS
5748 xclass = FALSE;
5749 class_uchardata = code + LINK_SIZE + 2; /* For XCLASS items */
5750 class_uchardata_base = class_uchardata; /* Save the start */
5751#endif
5752
5753 /* For optimization purposes, we track some properties of the class:
5754 class_has_8bitchar will be non-zero if the class contains at least one
5755 character with a code point less than 256; xclass_has_prop will be TRUE if
5756 Unicode property checks are present in the class. */
5757
5758 class_has_8bitchar = 0;
5759#ifdef SUPPORT_WIDE_CHARS
5760 xclass_has_prop = FALSE;
5761#endif
5762
5763 /* Initialize the 256-bit (32-byte) bit map to all zeros. We build the map
5764 in a temporary bit of memory, in case the class contains fewer than two
5765 8-bit characters because in that case the compiled code doesn't use the bit
5766 map. */
5767
5768 memset(classbits, 0, 32 * sizeof(uint8_t));
5769
5770 /* Process items until META_CLASS_END is reached. */
5771
5772 while ((meta = *(++pptr)) != META_CLASS_END)
5773 {
5774 /* Handle POSIX classes such as [:alpha:] etc. */
5775
5776 if (meta == META_POSIX || meta == META_POSIX_NEG)
5777 {
5778 BOOL local_negate = (meta == META_POSIX_NEG);
5779 int posix_class = *(++pptr);
5780 int taboffset, tabopt;
5781 uint8_t pbits[32];
5782
5783 should_flip_negation = local_negate; /* Note negative special */
5784
5785 /* If matching is caseless, upper and lower are converted to alpha.
5786 This relies on the fact that the class table starts with alpha,
5787 lower, upper as the first 3 entries. */
5788
5789 if ((options & PCRE2_CASELESS) != 0 && posix_class <= 2)
5790 posix_class = 0;
5791
5792 /* When PCRE2_UCP is set, some of the POSIX classes are converted to
5793 different escape sequences that use Unicode properties \p or \P.
5794 Others that are not available via \p or \P have to generate
5795 XCL_PROP/XCL_NOTPROP directly, which is done here. */
5796
5797#ifdef SUPPORT_UNICODE
5798 if ((options & PCRE2_UCP) != 0) switch(posix_class)
5799 {
5800 case PC_GRAPH:
5801 case PC_PRINT:
5802 case PC_PUNCT:
5803 *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
5804 *class_uchardata++ = (PCRE2_UCHAR)
5805 ((posix_class == PC_GRAPH)? PT_PXGRAPH :
5806 (posix_class == PC_PRINT)? PT_PXPRINT : PT_PXPUNCT);
5807 *class_uchardata++ = 0;
5808 xclass_has_prop = TRUE;
5809 goto CONTINUE_CLASS;
5810
5811 /* For the other POSIX classes (ascii, xdigit) we are going to
5812 fall through to the non-UCP case and build a bit map for
5813 characters with code points less than 256. However, if we are in
5814 a negated POSIX class, characters with code points greater than
5815 255 must either all match or all not match, depending on whether
5816 the whole class is not or is negated. For example, for
5817 [[:^ascii:]... they must all match, whereas for [^[:^xdigit:]...
5818 they must not.
5819
5820 In the special case where there are no xclass items, this is
5821 automatically handled by the use of OP_CLASS or OP_NCLASS, but an
5822 explicit range is needed for OP_XCLASS. Setting a flag here
5823 causes the range to be generated later when it is known that
5824 OP_XCLASS is required. In the 8-bit library this is relevant only in
5825 utf mode, since no wide characters can exist otherwise. */
5826
5827 default:
5828#if PCRE2_CODE_UNIT_WIDTH == 8
5829 if (utf)
5830#endif
5831 match_all_or_no_wide_chars |= local_negate;
5832 break;
5833 }
5834#endif /* SUPPORT_UNICODE */
5835
5836 /* In the non-UCP case, or when UCP makes no difference, we build the
5837 bit map for the POSIX class in a chunk of local store because we may
5838 be adding and subtracting from it, and we don't want to subtract bits
5839 that may be in the main map already. At the end we or the result into
5840 the bit map that is being built. */
5841
5842 posix_class *= 3;
5843
5844 /* Copy in the first table (always present) */
5845
5846 memcpy(pbits, cbits + posix_class_maps[posix_class],
5847 32 * sizeof(uint8_t));
5848
5849 /* If there is a second table, add or remove it as required. */
5850
5851 taboffset = posix_class_maps[posix_class + 1];
5852 tabopt = posix_class_maps[posix_class + 2];
5853
5854 if (taboffset >= 0)
5855 {
5856 if (tabopt >= 0)
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07005857 for (int i = 0; i < 32; i++) pbits[i] |= cbits[(int)i + taboffset];
Elliott Hughes5b808042021-10-01 10:56:10 -07005858 else
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07005859 for (int i = 0; i < 32; i++) pbits[i] &= ~cbits[(int)i + taboffset];
Elliott Hughes5b808042021-10-01 10:56:10 -07005860 }
5861
5862 /* Now see if we need to remove any special characters. An option
5863 value of 1 removes vertical space and 2 removes underscore. */
5864
5865 if (tabopt < 0) tabopt = -tabopt;
5866 if (tabopt == 1) pbits[1] &= ~0x3c;
5867 else if (tabopt == 2) pbits[11] &= 0x7f;
5868
5869 /* Add the POSIX table or its complement into the main table that is
5870 being built and we are done. */
5871
5872 if (local_negate)
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07005873 for (int i = 0; i < 32; i++) classbits[i] |= (uint8_t)(~pbits[i]);
Elliott Hughes5b808042021-10-01 10:56:10 -07005874 else
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07005875 for (int i = 0; i < 32; i++) classbits[i] |= pbits[i];
Elliott Hughes5b808042021-10-01 10:56:10 -07005876
5877 /* Every class contains at least one < 256 character. */
5878
5879 class_has_8bitchar = 1;
5880 goto CONTINUE_CLASS; /* End of POSIX handling */
5881 }
5882
5883 /* Other than POSIX classes, the only items we should encounter are
5884 \d-type escapes and literal characters (possibly as ranges). */
5885
5886 if (meta == META_BIGVALUE)
5887 {
5888 meta = *(++pptr);
5889 goto CLASS_LITERAL;
5890 }
5891
5892 /* Any other non-literal must be an escape */
5893
5894 if (meta >= META_END)
5895 {
5896 if (META_CODE(meta) != META_ESCAPE)
5897 {
5898#ifdef DEBUG_SHOW_PARSED
5899 fprintf(stderr, "** Unrecognized parsed pattern item 0x%.8x "
5900 "in character class\n", meta);
5901#endif
5902 *errorcodeptr = ERR89; /* Internal error - unrecognized. */
5903 return 0;
5904 }
5905 escape = META_DATA(meta);
5906
5907 /* Every class contains at least one < 256 character. */
5908
5909 class_has_8bitchar++;
5910
5911 switch(escape)
5912 {
5913 case ESC_d:
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07005914 for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_digit];
Elliott Hughes5b808042021-10-01 10:56:10 -07005915 break;
5916
5917 case ESC_D:
5918 should_flip_negation = TRUE;
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07005919 for (int i = 0; i < 32; i++)
5920 classbits[i] |= (uint8_t)(~cbits[i+cbit_digit]);
Elliott Hughes5b808042021-10-01 10:56:10 -07005921 break;
5922
5923 case ESC_w:
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07005924 for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_word];
Elliott Hughes5b808042021-10-01 10:56:10 -07005925 break;
5926
5927 case ESC_W:
5928 should_flip_negation = TRUE;
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07005929 for (int i = 0; i < 32; i++)
5930 classbits[i] |= (uint8_t)(~cbits[i+cbit_word]);
Elliott Hughes5b808042021-10-01 10:56:10 -07005931 break;
5932
5933 /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
5934 5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
5935 previously set by something earlier in the character class.
5936 Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
5937 we could just adjust the appropriate bit. From PCRE 8.34 we no
5938 longer treat \s and \S specially. */
5939
5940 case ESC_s:
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07005941 for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_space];
Elliott Hughes5b808042021-10-01 10:56:10 -07005942 break;
5943
5944 case ESC_S:
5945 should_flip_negation = TRUE;
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07005946 for (int i = 0; i < 32; i++)
5947 classbits[i] |= (uint8_t)(~cbits[i+cbit_space]);
Elliott Hughes5b808042021-10-01 10:56:10 -07005948 break;
5949
5950 /* When adding the horizontal or vertical space lists to a class, or
5951 their complements, disable PCRE2_CASELESS, because it just wastes
5952 time, and in the "not-x" UTF cases can create unwanted duplicates in
5953 the XCLASS list (provoked by characters that have more than one other
5954 case and by both cases being in the same "not-x" sublist). */
5955
5956 case ESC_h:
5957 (void)add_list_to_class(classbits, &class_uchardata,
5958 options & ~PCRE2_CASELESS, cb, PRIV(hspace_list), NOTACHAR);
5959 break;
5960
5961 case ESC_H:
5962 (void)add_not_list_to_class(classbits, &class_uchardata,
5963 options & ~PCRE2_CASELESS, cb, PRIV(hspace_list));
5964 break;
5965
5966 case ESC_v:
5967 (void)add_list_to_class(classbits, &class_uchardata,
5968 options & ~PCRE2_CASELESS, cb, PRIV(vspace_list), NOTACHAR);
5969 break;
5970
5971 case ESC_V:
5972 (void)add_not_list_to_class(classbits, &class_uchardata,
5973 options & ~PCRE2_CASELESS, cb, PRIV(vspace_list));
5974 break;
5975
5976 /* If Unicode is not supported, \P and \p are not allowed and are
5977 faulted at parse time, so will never appear here. */
5978
5979#ifdef SUPPORT_UNICODE
5980 case ESC_p:
5981 case ESC_P:
5982 {
5983 uint32_t ptype = *(++pptr) >> 16;
5984 uint32_t pdata = *pptr & 0xffff;
5985 *class_uchardata++ = (escape == ESC_p)? XCL_PROP : XCL_NOTPROP;
5986 *class_uchardata++ = ptype;
5987 *class_uchardata++ = pdata;
5988 xclass_has_prop = TRUE;
5989 class_has_8bitchar--; /* Undo! */
5990 }
5991 break;
5992#endif
5993 }
5994
5995 goto CONTINUE_CLASS;
5996 } /* End handling \d-type escapes */
5997
5998 /* A literal character may be followed by a range meta. At parse time
5999 there are checks for out-of-order characters, for ranges where the two
6000 characters are equal, and for hyphens that cannot indicate a range. At
6001 this point, therefore, no checking is needed. */
6002
6003 else
6004 {
6005 uint32_t c, d;
6006
6007 CLASS_LITERAL:
6008 c = d = meta;
6009
6010 /* Remember if \r or \n were explicitly used */
6011
6012 if (c == CHAR_CR || c == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
6013
6014 /* Process a character range */
6015
6016 if (pptr[1] == META_RANGE_LITERAL || pptr[1] == META_RANGE_ESCAPED)
6017 {
6018#ifdef EBCDIC
6019 BOOL range_is_literal = (pptr[1] == META_RANGE_LITERAL);
6020#endif
6021 pptr += 2;
6022 d = *pptr;
6023 if (d == META_BIGVALUE) d = *(++pptr);
6024
6025 /* Remember an explicit \r or \n, and add the range to the class. */
6026
6027 if (d == CHAR_CR || d == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
6028
6029 /* In an EBCDIC environment, Perl treats alphabetic ranges specially
6030 because there are holes in the encoding, and simply using the range
6031 A-Z (for example) would include the characters in the holes. This
6032 applies only to literal ranges; [\xC1-\xE9] is different to [A-Z]. */
6033
6034#ifdef EBCDIC
6035 if (range_is_literal &&
6036 (cb->ctypes[c] & ctype_letter) != 0 &&
6037 (cb->ctypes[d] & ctype_letter) != 0 &&
6038 (c <= CHAR_z) == (d <= CHAR_z))
6039 {
6040 uint32_t uc = (d <= CHAR_z)? 0 : 64;
6041 uint32_t C = c - uc;
6042 uint32_t D = d - uc;
6043
6044 if (C <= CHAR_i)
6045 {
6046 class_has_8bitchar +=
6047 add_to_class(classbits, &class_uchardata, options, cb, C + uc,
6048 ((D < CHAR_i)? D : CHAR_i) + uc);
6049 C = CHAR_j;
6050 }
6051
6052 if (C <= D && C <= CHAR_r)
6053 {
6054 class_has_8bitchar +=
6055 add_to_class(classbits, &class_uchardata, options, cb, C + uc,
6056 ((D < CHAR_r)? D : CHAR_r) + uc);
6057 C = CHAR_s;
6058 }
6059
6060 if (C <= D)
6061 {
6062 class_has_8bitchar +=
6063 add_to_class(classbits, &class_uchardata, options, cb, C + uc,
6064 D + uc);
6065 }
6066 }
6067 else
6068#endif
6069 /* Not an EBCDIC special range */
6070
6071 class_has_8bitchar +=
6072 add_to_class(classbits, &class_uchardata, options, cb, c, d);
6073 goto CONTINUE_CLASS; /* Go get the next char in the class */
6074 } /* End of range handling */
6075
6076
6077 /* Handle a single character. */
6078
6079 class_has_8bitchar +=
6080 add_to_class(classbits, &class_uchardata, options, cb, meta, meta);
6081 }
6082
6083 /* Continue to the next item in the class. */
6084
6085 CONTINUE_CLASS:
6086
6087#ifdef SUPPORT_WIDE_CHARS
6088 /* If any wide characters or Unicode properties have been encountered,
6089 set xclass = TRUE. Then, in the pre-compile phase, accumulate the length
6090 of the extra data and reset the pointer. This is so that very large
6091 classes that contain a zillion wide characters or Unicode property tests
6092 do not overwrite the workspace (which is on the stack). */
6093
6094 if (class_uchardata > class_uchardata_base)
6095 {
6096 xclass = TRUE;
6097 if (lengthptr != NULL)
6098 {
6099 *lengthptr += class_uchardata - class_uchardata_base;
6100 class_uchardata = class_uchardata_base;
6101 }
6102 }
6103#endif
6104
6105 continue; /* Needed to avoid error when not supporting wide chars */
6106 } /* End of main class-processing loop */
6107
6108 /* If this class is the first thing in the branch, there can be no first
6109 char setting, whatever the repeat count. Any reqcu setting must remain
6110 unchanged after any kind of repeat. */
6111
6112 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6113 zerofirstcu = firstcu;
6114 zerofirstcuflags = firstcuflags;
6115 zeroreqcu = reqcu;
6116 zeroreqcuflags = reqcuflags;
6117
6118 /* If there are characters with values > 255, or Unicode property settings
6119 (\p or \P), we have to compile an extended class, with its own opcode,
6120 unless there were no property settings and there was a negated special such
6121 as \S in the class, and PCRE2_UCP is not set, because in that case all
6122 characters > 255 are in or not in the class, so any that were explicitly
6123 given as well can be ignored.
6124
6125 In the UCP case, if certain negated POSIX classes ([:^ascii:] or
6126 [:^xdigit:]) were present in a class, we either have to match or not match
6127 all wide characters (depending on whether the whole class is or is not
6128 negated). This requirement is indicated by match_all_or_no_wide_chars being
6129 true. We do this by including an explicit range, which works in both cases.
6130 This applies only in UTF and 16-bit and 32-bit non-UTF modes, since there
6131 cannot be any wide characters in 8-bit non-UTF mode.
6132
6133 When there *are* properties in a positive UTF-8 or any 16-bit or 32-bit
6134 class where \S etc is present without PCRE2_UCP, causing an extended class
6135 to be compiled, we make sure that all characters > 255 are included by
6136 forcing match_all_or_no_wide_chars to be true.
6137
6138 If, when generating an xclass, there are no characters < 256, we can omit
6139 the bitmap in the actual compiled code. */
6140
6141#ifdef SUPPORT_WIDE_CHARS /* Defined for 16/32 bits, or 8-bit with Unicode */
6142 if (xclass && (
6143#ifdef SUPPORT_UNICODE
6144 (options & PCRE2_UCP) != 0 ||
6145#endif
6146 xclass_has_prop || !should_flip_negation))
6147 {
6148 if (match_all_or_no_wide_chars || (
6149#if PCRE2_CODE_UNIT_WIDTH == 8
6150 utf &&
6151#endif
6152 should_flip_negation && !negate_class && (options & PCRE2_UCP) == 0))
6153 {
6154 *class_uchardata++ = XCL_RANGE;
6155 if (utf) /* Will always be utf in the 8-bit library */
6156 {
6157 class_uchardata += PRIV(ord2utf)(0x100, class_uchardata);
6158 class_uchardata += PRIV(ord2utf)(MAX_UTF_CODE_POINT, class_uchardata);
6159 }
6160 else /* Can only happen for the 16-bit & 32-bit libraries */
6161 {
6162#if PCRE2_CODE_UNIT_WIDTH == 16
6163 *class_uchardata++ = 0x100;
6164 *class_uchardata++ = 0xffffu;
6165#elif PCRE2_CODE_UNIT_WIDTH == 32
6166 *class_uchardata++ = 0x100;
6167 *class_uchardata++ = 0xffffffffu;
6168#endif
6169 }
6170 }
6171 *class_uchardata++ = XCL_END; /* Marks the end of extra data */
6172 *code++ = OP_XCLASS;
6173 code += LINK_SIZE;
6174 *code = negate_class? XCL_NOT:0;
6175 if (xclass_has_prop) *code |= XCL_HASPROP;
6176
6177 /* If the map is required, move up the extra data to make room for it;
6178 otherwise just move the code pointer to the end of the extra data. */
6179
6180 if (class_has_8bitchar > 0)
6181 {
6182 *code++ |= XCL_MAP;
6183 (void)memmove(code + (32 / sizeof(PCRE2_UCHAR)), code,
6184 CU2BYTES(class_uchardata - code));
6185 if (negate_class && !xclass_has_prop)
6186 {
6187 /* Using 255 ^ instead of ~ avoids clang sanitize warning. */
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07006188 for (int i = 0; i < 32; i++) classbits[i] = 255 ^ classbits[i];
Elliott Hughes5b808042021-10-01 10:56:10 -07006189 }
6190 memcpy(code, classbits, 32);
6191 code = class_uchardata + (32 / sizeof(PCRE2_UCHAR));
6192 }
6193 else code = class_uchardata;
6194
6195 /* Now fill in the complete length of the item */
6196
6197 PUT(previous, 1, (int)(code - previous));
6198 break; /* End of class handling */
6199 }
6200#endif /* SUPPORT_WIDE_CHARS */
6201
6202 /* If there are no characters > 255, or they are all to be included or
6203 excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
6204 whole class was negated and whether there were negative specials such as \S
6205 (non-UCP) in the class. Then copy the 32-byte map into the code vector,
6206 negating it if necessary. */
6207
6208 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
6209 if (lengthptr == NULL) /* Save time in the pre-compile phase */
6210 {
6211 if (negate_class)
6212 {
6213 /* Using 255 ^ instead of ~ avoids clang sanitize warning. */
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07006214 for (int i = 0; i < 32; i++) classbits[i] = 255 ^ classbits[i];
Elliott Hughes5b808042021-10-01 10:56:10 -07006215 }
6216 memcpy(code, classbits, 32);
6217 }
6218 code += 32 / sizeof(PCRE2_UCHAR);
6219 break; /* End of class processing */
6220
6221
6222 /* ===================================================================*/
6223 /* Deal with (*VERB)s. */
6224
6225 /* Check for open captures before ACCEPT and close those that are within
6226 the same assertion level, also converting ACCEPT to ASSERT_ACCEPT in an
6227 assertion. In the first pass, just accumulate the length required;
6228 otherwise hitting (*ACCEPT) inside many nested parentheses can cause
6229 workspace overflow. Do not set firstcu after *ACCEPT. */
6230
6231 case META_ACCEPT:
6232 cb->had_accept = had_accept = TRUE;
6233 for (oc = cb->open_caps;
6234 oc != NULL && oc->assert_depth >= cb->assert_depth;
6235 oc = oc->next)
6236 {
6237 if (lengthptr != NULL)
6238 {
6239 *lengthptr += CU2BYTES(1) + IMM2_SIZE;
6240 }
6241 else
6242 {
6243 *code++ = OP_CLOSE;
6244 PUT2INC(code, 0, oc->number);
6245 }
6246 }
6247 *code++ = (cb->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
6248 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6249 break;
6250
6251 case META_PRUNE:
6252 case META_SKIP:
6253 cb->had_pruneorskip = TRUE;
6254 /* Fall through */
6255 case META_COMMIT:
6256 case META_FAIL:
6257 *code++ = verbops[(meta - META_MARK) >> 16];
6258 break;
6259
6260 case META_THEN:
6261 cb->external_flags |= PCRE2_HASTHEN;
6262 *code++ = OP_THEN;
6263 break;
6264
6265 /* Handle verbs with arguments. Arguments can be very long, especially in
6266 16- and 32-bit modes, and can overflow the workspace in the first pass.
6267 However, the argument length is constrained to be small enough to fit in
6268 one code unit. This check happens in parse_regex(). In the first pass,
6269 instead of putting the argument into memory, we just update the length
6270 counter and set up an empty argument. */
6271
6272 case META_THEN_ARG:
6273 cb->external_flags |= PCRE2_HASTHEN;
6274 goto VERB_ARG;
6275
6276 case META_PRUNE_ARG:
6277 case META_SKIP_ARG:
6278 cb->had_pruneorskip = TRUE;
6279 /* Fall through */
6280 case META_MARK:
6281 case META_COMMIT_ARG:
6282 VERB_ARG:
6283 *code++ = verbops[(meta - META_MARK) >> 16];
6284 /* The length is in characters. */
6285 verbarglen = *(++pptr);
6286 verbculen = 0;
6287 tempcode = code++;
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07006288 for (int i = 0; i < (int)verbarglen; i++)
Elliott Hughes5b808042021-10-01 10:56:10 -07006289 {
6290 meta = *(++pptr);
6291#ifdef SUPPORT_UNICODE
6292 if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
6293#endif
6294 {
6295 mclength = 1;
6296 mcbuffer[0] = meta;
6297 }
6298 if (lengthptr != NULL) *lengthptr += mclength; else
6299 {
6300 memcpy(code, mcbuffer, CU2BYTES(mclength));
6301 code += mclength;
6302 verbculen += mclength;
6303 }
6304 }
6305
6306 *tempcode = verbculen; /* Fill in the code unit length */
6307 *code++ = 0; /* Terminating zero */
6308 break;
6309
6310
6311 /* ===================================================================*/
6312 /* Handle options change. The new setting must be passed back for use in
6313 subsequent branches. Reset the greedy defaults and the case value for
6314 firstcu and reqcu. */
6315
6316 case META_OPTIONS:
6317 *optionsptr = options = *(++pptr);
6318 greedy_default = ((options & PCRE2_UNGREEDY) != 0);
6319 greedy_non_default = greedy_default ^ 1;
6320 req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0;
6321 break;
6322
6323
6324 /* ===================================================================*/
6325 /* Handle conditional subpatterns. The case of (?(Rdigits) is ambiguous
6326 because it could be a numerical check on recursion, or a name check on a
6327 group's being set. The pre-pass sets up META_COND_RNUMBER as a name so that
6328 we can handle it either way. We first try for a name; if not found, process
6329 the number. */
6330
6331 case META_COND_RNUMBER: /* (?(Rdigits) */
6332 case META_COND_NAME: /* (?(name) or (?'name') or ?(<name>) */
6333 case META_COND_RNAME: /* (?(R&name) - test for recursion */
6334 bravalue = OP_COND;
6335 {
6336 int count, index;
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07006337 unsigned int i;
Elliott Hughes5b808042021-10-01 10:56:10 -07006338 PCRE2_SPTR name;
6339 named_group *ng = cb->named_groups;
6340 uint32_t length = *(++pptr);
6341
6342 GETPLUSOFFSET(offset, pptr);
6343 name = cb->start_pattern + offset;
6344
6345 /* In the first pass, the names generated in the pre-pass are available,
6346 but the main name table has not yet been created. Scan the list of names
6347 generated in the pre-pass in order to get a number and whether or not
6348 this name is duplicated. If it is not duplicated, we can handle it as a
6349 numerical group. */
6350
6351 for (i = 0; i < cb->names_found; i++, ng++)
6352 {
6353 if (length == ng->length &&
6354 PRIV(strncmp)(name, ng->name, length) == 0)
6355 {
6356 if (!ng->isdup)
6357 {
6358 code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF;
6359 PUT2(code, 2+LINK_SIZE, ng->number);
6360 if (ng->number > cb->top_backref) cb->top_backref = ng->number;
6361 skipunits = 1+IMM2_SIZE;
6362 goto GROUP_PROCESS_NOTE_EMPTY;
6363 }
6364 break; /* Found a duplicated name */
6365 }
6366 }
6367
6368 /* If the name was not found we have a bad reference, unless we are
6369 dealing with R<digits>, which is treated as a recursion test by number.
6370 */
6371
6372 if (i >= cb->names_found)
6373 {
6374 groupnumber = 0;
6375 if (meta == META_COND_RNUMBER)
6376 {
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07006377 for (i = 1; i < length; i++)
Elliott Hughes5b808042021-10-01 10:56:10 -07006378 {
6379 groupnumber = groupnumber * 10 + name[i] - CHAR_0;
6380 if (groupnumber > MAX_GROUP_NUMBER)
6381 {
6382 *errorcodeptr = ERR61;
6383 cb->erroroffset = offset + i;
6384 return 0;
6385 }
6386 }
6387 }
6388
6389 if (meta != META_COND_RNUMBER || groupnumber > cb->bracount)
6390 {
6391 *errorcodeptr = ERR15;
6392 cb->erroroffset = offset;
6393 return 0;
6394 }
6395
6396 /* (?Rdigits) treated as a recursion reference by number. A value of
6397 zero (which is the result of both (?R) and (?R0)) means "any", and is
6398 translated into RREF_ANY (which is 0xffff). */
6399
6400 if (groupnumber == 0) groupnumber = RREF_ANY;
6401 code[1+LINK_SIZE] = OP_RREF;
6402 PUT2(code, 2+LINK_SIZE, groupnumber);
6403 skipunits = 1+IMM2_SIZE;
6404 goto GROUP_PROCESS_NOTE_EMPTY;
6405 }
6406
6407 /* A duplicated name was found. Note that if an R<digits> name is found
6408 (META_COND_RNUMBER), it is a reference test, not a recursion test. */
6409
6410 code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF;
6411
6412 /* We have a duplicated name. In the compile pass we have to search the
6413 main table in order to get the index and count values. */
6414
6415 count = 0; /* Values for first pass (avoids compiler warning) */
6416 index = 0;
6417 if (lengthptr == NULL && !find_dupname_details(name, length, &index,
6418 &count, errorcodeptr, cb)) return 0;
6419
6420 /* Add one to the opcode to change CREF/RREF into DNCREF/DNRREF and
6421 insert appropriate data values. */
6422
6423 code[1+LINK_SIZE]++;
6424 skipunits = 1+2*IMM2_SIZE;
6425 PUT2(code, 2+LINK_SIZE, index);
6426 PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
6427 }
6428 goto GROUP_PROCESS_NOTE_EMPTY;
6429
6430 /* The DEFINE condition is always false. Its internal groups may never
6431 be called, so matched_char must remain false, hence the jump to
6432 GROUP_PROCESS rather than GROUP_PROCESS_NOTE_EMPTY. */
6433
6434 case META_COND_DEFINE:
6435 bravalue = OP_COND;
6436 GETPLUSOFFSET(offset, pptr);
6437 code[1+LINK_SIZE] = OP_DEFINE;
6438 skipunits = 1;
6439 goto GROUP_PROCESS;
6440
6441 /* Conditional test of a group's being set. */
6442
6443 case META_COND_NUMBER:
6444 bravalue = OP_COND;
6445 GETPLUSOFFSET(offset, pptr);
6446 groupnumber = *(++pptr);
6447 if (groupnumber > cb->bracount)
6448 {
6449 *errorcodeptr = ERR15;
6450 cb->erroroffset = offset;
6451 return 0;
6452 }
6453 if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
6454 offset -= 2; /* Point at initial ( for too many branches error */
6455 code[1+LINK_SIZE] = OP_CREF;
6456 skipunits = 1+IMM2_SIZE;
6457 PUT2(code, 2+LINK_SIZE, groupnumber);
6458 goto GROUP_PROCESS_NOTE_EMPTY;
6459
6460 /* Test for the PCRE2 version. */
6461
6462 case META_COND_VERSION:
6463 bravalue = OP_COND;
6464 if (pptr[1] > 0)
6465 code[1+LINK_SIZE] = ((PCRE2_MAJOR > pptr[2]) ||
6466 (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR >= pptr[3]))?
6467 OP_TRUE : OP_FALSE;
6468 else
6469 code[1+LINK_SIZE] = (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR == pptr[3])?
6470 OP_TRUE : OP_FALSE;
6471 skipunits = 1;
6472 pptr += 3;
6473 goto GROUP_PROCESS_NOTE_EMPTY;
6474
6475 /* The condition is an assertion, possibly preceded by a callout. */
6476
6477 case META_COND_ASSERT:
6478 bravalue = OP_COND;
6479 goto GROUP_PROCESS_NOTE_EMPTY;
6480
6481
6482 /* ===================================================================*/
6483 /* Handle all kinds of nested bracketed groups. The non-capturing,
6484 non-conditional cases are here; others come to GROUP_PROCESS via goto. */
6485
6486 case META_LOOKAHEAD:
6487 bravalue = OP_ASSERT;
6488 cb->assert_depth += 1;
6489 goto GROUP_PROCESS;
6490
6491 case META_LOOKAHEAD_NA:
6492 bravalue = OP_ASSERT_NA;
6493 cb->assert_depth += 1;
6494 goto GROUP_PROCESS;
6495
6496 /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird
6497 thing to do, but Perl allows all assertions to be quantified, and when
6498 they contain capturing parentheses there may be a potential use for
6499 this feature. Not that that applies to a quantified (?!) but we allow
6500 it for uniformity. */
6501
6502 case META_LOOKAHEADNOT:
6503 if (pptr[1] == META_KET &&
6504 (pptr[2] < META_ASTERISK || pptr[2] > META_MINMAX_QUERY))
6505 {
6506 *code++ = OP_FAIL;
6507 pptr++;
6508 }
6509 else
6510 {
6511 bravalue = OP_ASSERT_NOT;
6512 cb->assert_depth += 1;
6513 goto GROUP_PROCESS;
6514 }
6515 break;
6516
6517 case META_LOOKBEHIND:
6518 bravalue = OP_ASSERTBACK;
6519 cb->assert_depth += 1;
6520 goto GROUP_PROCESS;
6521
6522 case META_LOOKBEHINDNOT:
6523 bravalue = OP_ASSERTBACK_NOT;
6524 cb->assert_depth += 1;
6525 goto GROUP_PROCESS;
6526
6527 case META_LOOKBEHIND_NA:
6528 bravalue = OP_ASSERTBACK_NA;
6529 cb->assert_depth += 1;
6530 goto GROUP_PROCESS;
6531
6532 case META_ATOMIC:
6533 bravalue = OP_ONCE;
6534 goto GROUP_PROCESS_NOTE_EMPTY;
6535
6536 case META_SCRIPT_RUN:
6537 bravalue = OP_SCRIPT_RUN;
6538 goto GROUP_PROCESS_NOTE_EMPTY;
6539
6540 case META_NOCAPTURE:
6541 bravalue = OP_BRA;
6542 /* Fall through */
6543
6544 /* Process nested bracketed regex. The nesting depth is maintained for the
6545 benefit of the stackguard function. The test for too deep nesting is now
6546 done in parse_regex(). Assertion and DEFINE groups come to GROUP_PROCESS;
6547 others come to GROUP_PROCESS_NOTE_EMPTY, to indicate that we need to take
6548 note of whether or not they may match an empty string. */
6549
6550 GROUP_PROCESS_NOTE_EMPTY:
6551 note_group_empty = TRUE;
6552
6553 GROUP_PROCESS:
6554 cb->parens_depth += 1;
6555 *code = bravalue;
6556 pptr++;
6557 tempcode = code;
6558 tempreqvary = cb->req_varyopt; /* Save value before group */
6559 length_prevgroup = 0; /* Initialize for pre-compile phase */
6560
6561 if ((group_return =
6562 compile_regex(
6563 options, /* The option state */
6564 &tempcode, /* Where to put code (updated) */
6565 &pptr, /* Input pointer (updated) */
6566 errorcodeptr, /* Where to put an error message */
6567 skipunits, /* Skip over bracket number */
6568 &subfirstcu, /* For possible first char */
6569 &subfirstcuflags,
6570 &subreqcu, /* For possible last char */
6571 &subreqcuflags,
6572 bcptr, /* Current branch chain */
6573 cb, /* Compile data block */
6574 (lengthptr == NULL)? NULL : /* Actual compile phase */
6575 &length_prevgroup /* Pre-compile phase */
6576 )) == 0)
6577 return 0; /* Error */
6578
6579 cb->parens_depth -= 1;
6580
6581 /* If that was a non-conditional significant group (not an assertion, not a
6582 DEFINE) that matches at least one character, then the current item matches
6583 a character. Conditionals are handled below. */
6584
6585 if (note_group_empty && bravalue != OP_COND && group_return > 0)
6586 matched_char = TRUE;
6587
6588 /* If we've just compiled an assertion, pop the assert depth. */
6589
6590 if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NA)
6591 cb->assert_depth -= 1;
6592
6593 /* At the end of compiling, code is still pointing to the start of the
6594 group, while tempcode has been updated to point past the end of the group.
6595 The parsed pattern pointer (pptr) is on the closing META_KET.
6596
6597 If this is a conditional bracket, check that there are no more than
6598 two branches in the group, or just one if it's a DEFINE group. We do this
6599 in the real compile phase, not in the pre-pass, where the whole group may
6600 not be available. */
6601
6602 if (bravalue == OP_COND && lengthptr == NULL)
6603 {
6604 PCRE2_UCHAR *tc = code;
6605 int condcount = 0;
6606
6607 do {
6608 condcount++;
6609 tc += GET(tc,1);
6610 }
6611 while (*tc != OP_KET);
6612
6613 /* A DEFINE group is never obeyed inline (the "condition" is always
6614 false). It must have only one branch. Having checked this, change the
6615 opcode to OP_FALSE. */
6616
6617 if (code[LINK_SIZE+1] == OP_DEFINE)
6618 {
6619 if (condcount > 1)
6620 {
6621 cb->erroroffset = offset;
6622 *errorcodeptr = ERR54;
6623 return 0;
6624 }
6625 code[LINK_SIZE+1] = OP_FALSE;
6626 bravalue = OP_DEFINE; /* A flag to suppress char handling below */
6627 }
6628
6629 /* A "normal" conditional group. If there is just one branch, we must not
6630 make use of its firstcu or reqcu, because this is equivalent to an
6631 empty second branch. Also, it may match an empty string. If there are two
6632 branches, this item must match a character if the group must. */
6633
6634 else
6635 {
6636 if (condcount > 2)
6637 {
6638 cb->erroroffset = offset;
6639 *errorcodeptr = ERR27;
6640 return 0;
6641 }
6642 if (condcount == 1) subfirstcuflags = subreqcuflags = REQ_NONE;
6643 else if (group_return > 0) matched_char = TRUE;
6644 }
6645 }
6646
6647 /* In the pre-compile phase, update the length by the length of the group,
6648 less the brackets at either end. Then reduce the compiled code to just a
6649 set of non-capturing brackets so that it doesn't use much memory if it is
6650 duplicated by a quantifier.*/
6651
6652 if (lengthptr != NULL)
6653 {
6654 if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
6655 {
6656 *errorcodeptr = ERR20;
6657 return 0;
6658 }
6659 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
6660 code++; /* This already contains bravalue */
6661 PUTINC(code, 0, 1 + LINK_SIZE);
6662 *code++ = OP_KET;
6663 PUTINC(code, 0, 1 + LINK_SIZE);
6664 break; /* No need to waste time with special character handling */
6665 }
6666
6667 /* Otherwise update the main code pointer to the end of the group. */
6668
6669 code = tempcode;
6670
6671 /* For a DEFINE group, required and first character settings are not
6672 relevant. */
6673
6674 if (bravalue == OP_DEFINE) break;
6675
6676 /* Handle updating of the required and first code units for other types of
6677 group. Update for normal brackets of all kinds, and conditions with two
6678 branches (see code above). If the bracket is followed by a quantifier with
6679 zero repeat, we have to back off. Hence the definition of zeroreqcu and
6680 zerofirstcu outside the main loop so that they can be accessed for the back
6681 off. */
6682
6683 zeroreqcu = reqcu;
6684 zeroreqcuflags = reqcuflags;
6685 zerofirstcu = firstcu;
6686 zerofirstcuflags = firstcuflags;
6687 groupsetfirstcu = FALSE;
6688
6689 if (bravalue >= OP_ONCE) /* Not an assertion */
6690 {
6691 /* If we have not yet set a firstcu in this branch, take it from the
6692 subpattern, remembering that it was set here so that a repeat of more
6693 than one can replicate it as reqcu if necessary. If the subpattern has
6694 no firstcu, set "none" for the whole branch. In both cases, a zero
6695 repeat forces firstcu to "none". */
6696
6697 if (firstcuflags == REQ_UNSET && subfirstcuflags != REQ_UNSET)
6698 {
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07006699 if (subfirstcuflags < REQ_NONE)
Elliott Hughes5b808042021-10-01 10:56:10 -07006700 {
6701 firstcu = subfirstcu;
6702 firstcuflags = subfirstcuflags;
6703 groupsetfirstcu = TRUE;
6704 }
6705 else firstcuflags = REQ_NONE;
6706 zerofirstcuflags = REQ_NONE;
6707 }
6708
6709 /* If firstcu was previously set, convert the subpattern's firstcu
6710 into reqcu if there wasn't one, using the vary flag that was in
6711 existence beforehand. */
6712
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07006713 else if (subfirstcuflags < REQ_NONE && subreqcuflags >= REQ_NONE)
Elliott Hughes5b808042021-10-01 10:56:10 -07006714 {
6715 subreqcu = subfirstcu;
6716 subreqcuflags = subfirstcuflags | tempreqvary;
6717 }
6718
6719 /* If the subpattern set a required code unit (or set a first code unit
6720 that isn't really the first code unit - see above), set it. */
6721
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07006722 if (subreqcuflags < REQ_NONE)
Elliott Hughes5b808042021-10-01 10:56:10 -07006723 {
6724 reqcu = subreqcu;
6725 reqcuflags = subreqcuflags;
6726 }
6727 }
6728
6729 /* For a forward assertion, we take the reqcu, if set, provided that the
6730 group has also set a firstcu. This can be helpful if the pattern that
6731 follows the assertion doesn't set a different char. For example, it's
6732 useful for /(?=abcde).+/. We can't set firstcu for an assertion, however
6733 because it leads to incorrect effect for patterns such as /(?=a)a.+/ when
6734 the "real" "a" would then become a reqcu instead of a firstcu. This is
6735 overcome by a scan at the end if there's no firstcu, looking for an
6736 asserted first char. A similar effect for patterns like /(?=.*X)X$/ means
6737 we must only take the reqcu when the group also set a firstcu. Otherwise,
6738 in that example, 'X' ends up set for both. */
6739
6740 else if ((bravalue == OP_ASSERT || bravalue == OP_ASSERT_NA) &&
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07006741 subreqcuflags < REQ_NONE && subfirstcuflags < REQ_NONE)
Elliott Hughes5b808042021-10-01 10:56:10 -07006742 {
6743 reqcu = subreqcu;
6744 reqcuflags = subreqcuflags;
6745 }
6746
6747 break; /* End of nested group handling */
6748
6749
6750 /* ===================================================================*/
6751 /* Handle named backreferences and recursions. */
6752
6753 case META_BACKREF_BYNAME:
6754 case META_RECURSE_BYNAME:
6755 {
6756 int count, index;
6757 PCRE2_SPTR name;
6758 BOOL is_dupname = FALSE;
6759 named_group *ng = cb->named_groups;
6760 uint32_t length = *(++pptr);
6761
6762 GETPLUSOFFSET(offset, pptr);
6763 name = cb->start_pattern + offset;
6764
6765 /* In the first pass, the names generated in the pre-pass are available,
6766 but the main name table has not yet been created. Scan the list of names
6767 generated in the pre-pass in order to get a number and whether or not
6768 this name is duplicated. */
6769
6770 groupnumber = 0;
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07006771 for (unsigned int i = 0; i < cb->names_found; i++, ng++)
Elliott Hughes5b808042021-10-01 10:56:10 -07006772 {
6773 if (length == ng->length &&
6774 PRIV(strncmp)(name, ng->name, length) == 0)
6775 {
6776 is_dupname = ng->isdup;
6777 groupnumber = ng->number;
6778
6779 /* For a recursion, that's all that is needed. We can now go to
6780 the code that handles numerical recursion, applying it to the first
6781 group with the given name. */
6782
6783 if (meta == META_RECURSE_BYNAME)
6784 {
6785 meta_arg = groupnumber;
6786 goto HANDLE_NUMERICAL_RECURSION;
6787 }
6788
6789 /* For a back reference, update the back reference map and the
6790 maximum back reference. */
6791
6792 cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;
6793 if (groupnumber > cb->top_backref)
6794 cb->top_backref = groupnumber;
6795 }
6796 }
6797
6798 /* If the name was not found we have a bad reference. */
6799
6800 if (groupnumber == 0)
6801 {
6802 *errorcodeptr = ERR15;
6803 cb->erroroffset = offset;
6804 return 0;
6805 }
6806
6807 /* If a back reference name is not duplicated, we can handle it as
6808 a numerical reference. */
6809
6810 if (!is_dupname)
6811 {
6812 meta_arg = groupnumber;
6813 goto HANDLE_SINGLE_REFERENCE;
6814 }
6815
6816 /* If a back reference name is duplicated, we generate a different
6817 opcode to a numerical back reference. In the second pass we must
6818 search for the index and count in the final name table. */
6819
6820 count = 0; /* Values for first pass (avoids compiler warning) */
6821 index = 0;
6822 if (lengthptr == NULL && !find_dupname_details(name, length, &index,
6823 &count, errorcodeptr, cb)) return 0;
6824
6825 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6826 *code++ = ((options & PCRE2_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
6827 PUT2INC(code, 0, index);
6828 PUT2INC(code, 0, count);
6829 }
6830 break;
6831
6832
6833 /* ===================================================================*/
6834 /* Handle a numerical callout. */
6835
6836 case META_CALLOUT_NUMBER:
6837 code[0] = OP_CALLOUT;
6838 PUT(code, 1, pptr[1]); /* Offset to next pattern item */
6839 PUT(code, 1 + LINK_SIZE, pptr[2]); /* Length of next pattern item */
6840 code[1 + 2*LINK_SIZE] = pptr[3];
6841 pptr += 3;
6842 code += PRIV(OP_lengths)[OP_CALLOUT];
6843 break;
6844
6845
6846 /* ===================================================================*/
6847 /* Handle a callout with a string argument. In the pre-pass we just compute
6848 the length without generating anything. The length in pptr[3] includes both
6849 delimiters; in the actual compile only the first one is copied, but a
6850 terminating zero is added. Any doubled delimiters within the string make
6851 this an overestimate, but it is not worth bothering about. */
6852
6853 case META_CALLOUT_STRING:
6854 if (lengthptr != NULL)
6855 {
6856 *lengthptr += pptr[3] + (1 + 4*LINK_SIZE);
6857 pptr += 3;
6858 SKIPOFFSET(pptr);
6859 }
6860
6861 /* In the real compile we can copy the string. The starting delimiter is
6862 included so that the client can discover it if they want. We also pass the
6863 start offset to help a script language give better error messages. */
6864
6865 else
6866 {
6867 PCRE2_SPTR pp;
6868 uint32_t delimiter;
6869 uint32_t length = pptr[3];
6870 PCRE2_UCHAR *callout_string = code + (1 + 4*LINK_SIZE);
6871
6872 code[0] = OP_CALLOUT_STR;
6873 PUT(code, 1, pptr[1]); /* Offset to next pattern item */
6874 PUT(code, 1 + LINK_SIZE, pptr[2]); /* Length of next pattern item */
6875
6876 pptr += 3;
6877 GETPLUSOFFSET(offset, pptr); /* Offset to string in pattern */
6878 pp = cb->start_pattern + offset;
6879 delimiter = *callout_string++ = *pp++;
6880 if (delimiter == CHAR_LEFT_CURLY_BRACKET)
6881 delimiter = CHAR_RIGHT_CURLY_BRACKET;
6882 PUT(code, 1 + 3*LINK_SIZE, (int)(offset + 1)); /* One after delimiter */
6883
6884 /* The syntax of the pattern was checked in the parsing scan. The length
6885 includes both delimiters, but we have passed the opening one just above,
6886 so we reduce length before testing it. The test is for > 1 because we do
6887 not want to copy the final delimiter. This also ensures that pp[1] is
6888 accessible. */
6889
6890 while (--length > 1)
6891 {
6892 if (*pp == delimiter && pp[1] == delimiter)
6893 {
6894 *callout_string++ = delimiter;
6895 pp += 2;
6896 length--;
6897 }
6898 else *callout_string++ = *pp++;
6899 }
6900 *callout_string++ = CHAR_NUL;
6901
6902 /* Set the length of the entire item, then advance to its end. */
6903
6904 PUT(code, 1 + 2*LINK_SIZE, (int)(callout_string - code));
6905 code = callout_string;
6906 }
6907 break;
6908
6909
6910 /* ===================================================================*/
6911 /* Handle repetition. The different types are all sorted out in the parsing
6912 pass. */
6913
6914 case META_MINMAX_PLUS:
6915 case META_MINMAX_QUERY:
6916 case META_MINMAX:
6917 repeat_min = *(++pptr);
6918 repeat_max = *(++pptr);
6919 goto REPEAT;
6920
6921 case META_ASTERISK:
6922 case META_ASTERISK_PLUS:
6923 case META_ASTERISK_QUERY:
6924 repeat_min = 0;
6925 repeat_max = REPEAT_UNLIMITED;
6926 goto REPEAT;
6927
6928 case META_PLUS:
6929 case META_PLUS_PLUS:
6930 case META_PLUS_QUERY:
6931 repeat_min = 1;
6932 repeat_max = REPEAT_UNLIMITED;
6933 goto REPEAT;
6934
6935 case META_QUERY:
6936 case META_QUERY_PLUS:
6937 case META_QUERY_QUERY:
6938 repeat_min = 0;
6939 repeat_max = 1;
6940
6941 REPEAT:
6942 if (previous_matched_char && repeat_min > 0) matched_char = TRUE;
6943
6944 /* Remember whether this is a variable length repeat, and default to
6945 single-char opcodes. */
6946
6947 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
6948 op_type = 0;
6949
6950 /* Adjust first and required code units for a zero repeat. */
6951
6952 if (repeat_min == 0)
6953 {
6954 firstcu = zerofirstcu;
6955 firstcuflags = zerofirstcuflags;
6956 reqcu = zeroreqcu;
6957 reqcuflags = zeroreqcuflags;
6958 }
6959
6960 /* Note the greediness and possessiveness. */
6961
6962 switch (meta)
6963 {
6964 case META_MINMAX_PLUS:
6965 case META_ASTERISK_PLUS:
6966 case META_PLUS_PLUS:
6967 case META_QUERY_PLUS:
6968 repeat_type = 0; /* Force greedy */
6969 possessive_quantifier = TRUE;
6970 break;
6971
6972 case META_MINMAX_QUERY:
6973 case META_ASTERISK_QUERY:
6974 case META_PLUS_QUERY:
6975 case META_QUERY_QUERY:
6976 repeat_type = greedy_non_default;
6977 possessive_quantifier = FALSE;
6978 break;
6979
6980 default:
6981 repeat_type = greedy_default;
6982 possessive_quantifier = FALSE;
6983 break;
6984 }
6985
6986 /* Save start of previous item, in case we have to move it up in order to
6987 insert something before it, and remember what it was. */
6988
6989 tempcode = previous;
6990 op_previous = *previous;
6991
6992 /* Now handle repetition for the different types of item. If the repeat
6993 minimum and the repeat maximum are both 1, we can ignore the quantifier for
6994 non-parenthesized items, as they have only one alternative. For anything in
6995 parentheses, we must not ignore if {1} is possessive. */
6996
6997 switch (op_previous)
6998 {
6999 /* If previous was a character or negated character match, abolish the
7000 item and generate a repeat item instead. If a char item has a minimum of
7001 more than one, ensure that it is set in reqcu - it might not be if a
7002 sequence such as x{3} is the first thing in a branch because the x will
7003 have gone into firstcu instead. */
7004
7005 case OP_CHAR:
7006 case OP_CHARI:
7007 case OP_NOT:
7008 case OP_NOTI:
7009 if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7010 op_type = chartypeoffset[op_previous - OP_CHAR];
7011
7012 /* Deal with UTF characters that take up more than one code unit. */
7013
7014#ifdef MAYBE_UTF_MULTI
7015 if (utf && NOT_FIRSTCU(code[-1]))
7016 {
7017 PCRE2_UCHAR *lastchar = code - 1;
7018 BACKCHAR(lastchar);
7019 mclength = (uint32_t)(code - lastchar); /* Length of UTF character */
7020 memcpy(mcbuffer, lastchar, CU2BYTES(mclength)); /* Save the char */
7021 }
7022 else
7023#endif /* MAYBE_UTF_MULTI */
7024
7025 /* Handle the case of a single code unit - either with no UTF support, or
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07007026 with UTF disabled, or for a single-code-unit UTF character. In the latter
7027 case, for a repeated positive match, get the caseless flag for the
7028 required code unit from the previous character, because a class like [Aa]
7029 sets a caseless A but by now the req_caseopt flag has been reset. */
7030
Elliott Hughes5b808042021-10-01 10:56:10 -07007031 {
7032 mcbuffer[0] = code[-1];
7033 mclength = 1;
7034 if (op_previous <= OP_CHARI && repeat_min > 1)
7035 {
7036 reqcu = mcbuffer[0];
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07007037 reqcuflags = cb->req_varyopt;
7038 if (op_previous == OP_CHARI) reqcuflags |= REQ_CASELESS;
Elliott Hughes5b808042021-10-01 10:56:10 -07007039 }
7040 }
7041 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
7042
7043 /* If previous was a character class or a back reference, we put the
7044 repeat stuff after it, but just skip the item if the repeat was {0,0}. */
7045
7046#ifdef SUPPORT_WIDE_CHARS
7047 case OP_XCLASS:
7048#endif
7049 case OP_CLASS:
7050 case OP_NCLASS:
7051 case OP_REF:
7052 case OP_REFI:
7053 case OP_DNREF:
7054 case OP_DNREFI:
7055
7056 if (repeat_max == 0)
7057 {
7058 code = previous;
7059 goto END_REPEAT;
7060 }
7061 if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7062
7063 if (repeat_min == 0 && repeat_max == REPEAT_UNLIMITED)
7064 *code++ = OP_CRSTAR + repeat_type;
7065 else if (repeat_min == 1 && repeat_max == REPEAT_UNLIMITED)
7066 *code++ = OP_CRPLUS + repeat_type;
7067 else if (repeat_min == 0 && repeat_max == 1)
7068 *code++ = OP_CRQUERY + repeat_type;
7069 else
7070 {
7071 *code++ = OP_CRRANGE + repeat_type;
7072 PUT2INC(code, 0, repeat_min);
7073 if (repeat_max == REPEAT_UNLIMITED) repeat_max = 0; /* 2-byte encoding for max */
7074 PUT2INC(code, 0, repeat_max);
7075 }
7076 break;
7077
7078 /* If previous is OP_FAIL, it was generated by an empty class []
7079 (PCRE2_ALLOW_EMPTY_CLASS is set). The other ways in which OP_FAIL can be
7080 generated, that is by (*FAIL) or (?!), disallow a quantifier at parse
7081 time. We can just ignore this repeat. */
7082
7083 case OP_FAIL:
7084 goto END_REPEAT;
7085
7086 /* Prior to 10.30, repeated recursions were wrapped in OP_ONCE brackets
7087 because pcre2_match() could not handle backtracking into recursively
7088 called groups. Now that this backtracking is available, we no longer need
7089 to do this. However, we still need to replicate recursions as we do for
7090 groups so as to have independent backtracking points. We can replicate
7091 for the minimum number of repeats directly. For optional repeats we now
7092 wrap the recursion in OP_BRA brackets and make use of the bracket
7093 repetition. */
7094
7095 case OP_RECURSE:
7096 if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier)
7097 goto END_REPEAT;
7098
7099 /* Generate unwrapped repeats for a non-zero minimum, except when the
7100 minimum is 1 and the maximum unlimited, because that can be handled with
7101 OP_BRA terminated by OP_KETRMAX/MIN. When the maximum is equal to the
7102 minimum, we just need to generate the appropriate additional copies.
7103 Otherwise we need to generate one more, to simulate the situation when
7104 the minimum is zero. */
7105
7106 if (repeat_min > 0 && (repeat_min != 1 || repeat_max != REPEAT_UNLIMITED))
7107 {
7108 int replicate = repeat_min;
7109 if (repeat_min == repeat_max) replicate--;
7110
7111 /* In the pre-compile phase, we don't actually do the replication. We
7112 just adjust the length as if we had. Do some paranoid checks for
7113 potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
7114 integer type when available, otherwise double. */
7115
7116 if (lengthptr != NULL)
7117 {
7118 PCRE2_SIZE delta = replicate*(1 + LINK_SIZE);
7119 if ((INT64_OR_DOUBLE)replicate*
7120 (INT64_OR_DOUBLE)(1 + LINK_SIZE) >
7121 (INT64_OR_DOUBLE)INT_MAX ||
7122 OFLOW_MAX - *lengthptr < delta)
7123 {
7124 *errorcodeptr = ERR20;
7125 return 0;
7126 }
7127 *lengthptr += delta;
7128 }
7129
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07007130 else for (int i = 0; i < replicate; i++)
Elliott Hughes5b808042021-10-01 10:56:10 -07007131 {
7132 memcpy(code, previous, CU2BYTES(1 + LINK_SIZE));
7133 previous = code;
7134 code += 1 + LINK_SIZE;
7135 }
7136
7137 /* If the number of repeats is fixed, we are done. Otherwise, adjust
7138 the counts and fall through. */
7139
7140 if (repeat_min == repeat_max) break;
7141 if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
7142 repeat_min = 0;
7143 }
7144
7145 /* Wrap the recursion call in OP_BRA brackets. */
7146
7147 (void)memmove(previous + 1 + LINK_SIZE, previous, CU2BYTES(1 + LINK_SIZE));
7148 op_previous = *previous = OP_BRA;
7149 PUT(previous, 1, 2 + 2*LINK_SIZE);
7150 previous[2 + 2*LINK_SIZE] = OP_KET;
7151 PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
7152 code += 2 + 2 * LINK_SIZE;
7153 length_prevgroup = 3 + 3*LINK_SIZE;
7154 group_return = -1; /* Set "may match empty string" */
7155
7156 /* Now treat as a repeated OP_BRA. */
7157 /* Fall through */
7158
7159 /* If previous was a bracket group, we may have to replicate it in
7160 certain cases. Note that at this point we can encounter only the "basic"
7161 bracket opcodes such as BRA and CBRA, as this is the place where they get
7162 converted into the more special varieties such as BRAPOS and SBRA.
7163 Originally, PCRE did not allow repetition of assertions, but now it does,
7164 for Perl compatibility. */
7165
7166 case OP_ASSERT:
7167 case OP_ASSERT_NOT:
7168 case OP_ASSERT_NA:
7169 case OP_ASSERTBACK:
7170 case OP_ASSERTBACK_NOT:
7171 case OP_ASSERTBACK_NA:
7172 case OP_ONCE:
7173 case OP_SCRIPT_RUN:
7174 case OP_BRA:
7175 case OP_CBRA:
7176 case OP_COND:
7177 {
7178 int len = (int)(code - previous);
7179 PCRE2_UCHAR *bralink = NULL;
7180 PCRE2_UCHAR *brazeroptr = NULL;
7181
7182 if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier)
7183 goto END_REPEAT;
7184
7185 /* Repeating a DEFINE group (or any group where the condition is always
7186 FALSE and there is only one branch) is pointless, but Perl allows the
7187 syntax, so we just ignore the repeat. */
7188
7189 if (op_previous == OP_COND && previous[LINK_SIZE+1] == OP_FALSE &&
7190 previous[GET(previous, 1)] != OP_ALT)
7191 goto END_REPEAT;
7192
7193 /* Perl allows all assertions to be quantified, and when they contain
7194 capturing parentheses and/or are optional there are potential uses for
7195 this feature. PCRE2 used to force the maximum quantifier to 1 on the
7196 invalid grounds that further repetition was never useful. This was
7197 always a bit pointless, since an assertion could be wrapped with a
7198 repeated group to achieve the effect. General repetition is now
7199 permitted, but if the maximum is unlimited it is set to one more than
7200 the minimum. */
7201
7202 if (op_previous < OP_ONCE) /* Assertion */
7203 {
7204 if (repeat_max == REPEAT_UNLIMITED) repeat_max = repeat_min + 1;
7205 }
7206
7207 /* The case of a zero minimum is special because of the need to stick
7208 OP_BRAZERO in front of it, and because the group appears once in the
7209 data, whereas in other cases it appears the minimum number of times. For
7210 this reason, it is simplest to treat this case separately, as otherwise
7211 the code gets far too messy. There are several special subcases when the
7212 minimum is zero. */
7213
7214 if (repeat_min == 0)
7215 {
7216 /* If the maximum is also zero, we used to just omit the group from
7217 the output altogether, like this:
7218
7219 ** if (repeat_max == 0)
7220 ** {
7221 ** code = previous;
7222 ** goto END_REPEAT;
7223 ** }
7224
7225 However, that fails when a group or a subgroup within it is
7226 referenced as a subroutine from elsewhere in the pattern, so now we
7227 stick in OP_SKIPZERO in front of it so that it is skipped on
7228 execution. As we don't have a list of which groups are referenced, we
7229 cannot do this selectively.
7230
7231 If the maximum is 1 or unlimited, we just have to stick in the
7232 BRAZERO and do no more at this point. */
7233
7234 if (repeat_max <= 1 || repeat_max == REPEAT_UNLIMITED)
7235 {
7236 (void)memmove(previous + 1, previous, CU2BYTES(len));
7237 code++;
7238 if (repeat_max == 0)
7239 {
7240 *previous++ = OP_SKIPZERO;
7241 goto END_REPEAT;
7242 }
7243 brazeroptr = previous; /* Save for possessive optimizing */
7244 *previous++ = OP_BRAZERO + repeat_type;
7245 }
7246
7247 /* If the maximum is greater than 1 and limited, we have to replicate
7248 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
7249 The first one has to be handled carefully because it's the original
7250 copy, which has to be moved up. The remainder can be handled by code
7251 that is common with the non-zero minimum case below. We have to
7252 adjust the value of repeat_max, since one less copy is required. */
7253
7254 else
7255 {
7256 int linkoffset;
7257 (void)memmove(previous + 2 + LINK_SIZE, previous, CU2BYTES(len));
7258 code += 2 + LINK_SIZE;
7259 *previous++ = OP_BRAZERO + repeat_type;
7260 *previous++ = OP_BRA;
7261
7262 /* We chain together the bracket link offset fields that have to be
7263 filled in later when the ends of the brackets are reached. */
7264
7265 linkoffset = (bralink == NULL)? 0 : (int)(previous - bralink);
7266 bralink = previous;
7267 PUTINC(previous, 0, linkoffset);
7268 }
7269
7270 if (repeat_max != REPEAT_UNLIMITED) repeat_max--;
7271 }
7272
7273 /* If the minimum is greater than zero, replicate the group as many
7274 times as necessary, and adjust the maximum to the number of subsequent
7275 copies that we need. */
7276
7277 else
7278 {
7279 if (repeat_min > 1)
7280 {
7281 /* In the pre-compile phase, we don't actually do the replication.
7282 We just adjust the length as if we had. Do some paranoid checks for
7283 potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
7284 integer type when available, otherwise double. */
7285
7286 if (lengthptr != NULL)
7287 {
7288 PCRE2_SIZE delta = (repeat_min - 1)*length_prevgroup;
7289 if ((INT64_OR_DOUBLE)(repeat_min - 1)*
7290 (INT64_OR_DOUBLE)length_prevgroup >
7291 (INT64_OR_DOUBLE)INT_MAX ||
7292 OFLOW_MAX - *lengthptr < delta)
7293 {
7294 *errorcodeptr = ERR20;
7295 return 0;
7296 }
7297 *lengthptr += delta;
7298 }
7299
7300 /* This is compiling for real. If there is a set first code unit
7301 for the group, and we have not yet set a "required code unit", set
7302 it. */
7303
7304 else
7305 {
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07007306 if (groupsetfirstcu && reqcuflags >= REQ_NONE)
Elliott Hughes5b808042021-10-01 10:56:10 -07007307 {
7308 reqcu = firstcu;
7309 reqcuflags = firstcuflags;
7310 }
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07007311 for (uint32_t i = 1; i < repeat_min; i++)
Elliott Hughes5b808042021-10-01 10:56:10 -07007312 {
7313 memcpy(code, previous, CU2BYTES(len));
7314 code += len;
7315 }
7316 }
7317 }
7318
7319 if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
7320 }
7321
7322 /* This code is common to both the zero and non-zero minimum cases. If
7323 the maximum is limited, it replicates the group in a nested fashion,
7324 remembering the bracket starts on a stack. In the case of a zero
7325 minimum, the first one was set up above. In all cases the repeat_max
7326 now specifies the number of additional copies needed. Again, we must
7327 remember to replicate entries on the forward reference list. */
7328
7329 if (repeat_max != REPEAT_UNLIMITED)
7330 {
7331 /* In the pre-compile phase, we don't actually do the replication. We
7332 just adjust the length as if we had. For each repetition we must add
7333 1 to the length for BRAZERO and for all but the last repetition we
7334 must add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
7335 paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type
7336 is a 64-bit integer type when available, otherwise double. */
7337
7338 if (lengthptr != NULL && repeat_max > 0)
7339 {
7340 PCRE2_SIZE delta = repeat_max*(length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
7341 2 - 2*LINK_SIZE; /* Last one doesn't nest */
7342 if ((INT64_OR_DOUBLE)repeat_max *
7343 (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
7344 > (INT64_OR_DOUBLE)INT_MAX ||
7345 OFLOW_MAX - *lengthptr < delta)
7346 {
7347 *errorcodeptr = ERR20;
7348 return 0;
7349 }
7350 *lengthptr += delta;
7351 }
7352
7353 /* This is compiling for real */
7354
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07007355 else for (uint32_t i = repeat_max; i >= 1; i--)
Elliott Hughes5b808042021-10-01 10:56:10 -07007356 {
7357 *code++ = OP_BRAZERO + repeat_type;
7358
7359 /* All but the final copy start a new nesting, maintaining the
7360 chain of brackets outstanding. */
7361
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07007362 if (i != 1)
Elliott Hughes5b808042021-10-01 10:56:10 -07007363 {
7364 int linkoffset;
7365 *code++ = OP_BRA;
7366 linkoffset = (bralink == NULL)? 0 : (int)(code - bralink);
7367 bralink = code;
7368 PUTINC(code, 0, linkoffset);
7369 }
7370
7371 memcpy(code, previous, CU2BYTES(len));
7372 code += len;
7373 }
7374
7375 /* Now chain through the pending brackets, and fill in their length
7376 fields (which are holding the chain links pro tem). */
7377
7378 while (bralink != NULL)
7379 {
7380 int oldlinkoffset;
7381 int linkoffset = (int)(code - bralink + 1);
7382 PCRE2_UCHAR *bra = code - linkoffset;
7383 oldlinkoffset = GET(bra, 1);
7384 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
7385 *code++ = OP_KET;
7386 PUTINC(code, 0, linkoffset);
7387 PUT(bra, 1, linkoffset);
7388 }
7389 }
7390
7391 /* If the maximum is unlimited, set a repeater in the final copy. For
7392 SCRIPT_RUN and ONCE brackets, that's all we need to do. However,
7393 possessively repeated ONCE brackets can be converted into non-capturing
7394 brackets, as the behaviour of (?:xx)++ is the same as (?>xx)++ and this
7395 saves having to deal with possessive ONCEs specially.
7396
7397 Otherwise, when we are doing the actual compile phase, check to see
7398 whether this group is one that could match an empty string. If so,
7399 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
7400 that runtime checking can be done. [This check is also applied to ONCE
7401 and SCRIPT_RUN groups at runtime, but in a different way.]
7402
7403 Then, if the quantifier was possessive and the bracket is not a
7404 conditional, we convert the BRA code to the POS form, and the KET code
7405 to KETRPOS. (It turns out to be convenient at runtime to detect this
7406 kind of subpattern at both the start and at the end.) The use of
7407 special opcodes makes it possible to reduce greatly the stack usage in
7408 pcre2_match(). If the group is preceded by OP_BRAZERO, convert this to
7409 OP_BRAPOSZERO.
7410
7411 Then, if the minimum number of matches is 1 or 0, cancel the possessive
7412 flag so that the default action below, of wrapping everything inside
7413 atomic brackets, does not happen. When the minimum is greater than 1,
7414 there will be earlier copies of the group, and so we still have to wrap
7415 the whole thing. */
7416
7417 else
7418 {
7419 PCRE2_UCHAR *ketcode = code - 1 - LINK_SIZE;
7420 PCRE2_UCHAR *bracode = ketcode - GET(ketcode, 1);
7421
7422 /* Convert possessive ONCE brackets to non-capturing */
7423
7424 if (*bracode == OP_ONCE && possessive_quantifier) *bracode = OP_BRA;
7425
7426 /* For non-possessive ONCE and for SCRIPT_RUN brackets, all we need
7427 to do is to set the KET. */
7428
7429 if (*bracode == OP_ONCE || *bracode == OP_SCRIPT_RUN)
7430 *ketcode = OP_KETRMAX + repeat_type;
7431
7432 /* Handle non-SCRIPT_RUN and non-ONCE brackets and possessive ONCEs
7433 (which have been converted to non-capturing above). */
7434
7435 else
7436 {
7437 /* In the compile phase, adjust the opcode if the group can match
7438 an empty string. For a conditional group with only one branch, the
7439 value of group_return will not show "could be empty", so we must
7440 check that separately. */
7441
7442 if (lengthptr == NULL)
7443 {
7444 if (group_return < 0) *bracode += OP_SBRA - OP_BRA;
7445 if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT)
7446 *bracode = OP_SCOND;
7447 }
7448
7449 /* Handle possessive quantifiers. */
7450
7451 if (possessive_quantifier)
7452 {
7453 /* For COND brackets, we wrap the whole thing in a possessively
7454 repeated non-capturing bracket, because we have not invented POS
7455 versions of the COND opcodes. */
7456
7457 if (*bracode == OP_COND || *bracode == OP_SCOND)
7458 {
7459 int nlen = (int)(code - bracode);
7460 (void)memmove(bracode + 1 + LINK_SIZE, bracode, CU2BYTES(nlen));
7461 code += 1 + LINK_SIZE;
7462 nlen += 1 + LINK_SIZE;
7463 *bracode = (*bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS;
7464 *code++ = OP_KETRPOS;
7465 PUTINC(code, 0, nlen);
7466 PUT(bracode, 1, nlen);
7467 }
7468
7469 /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
7470
7471 else
7472 {
7473 *bracode += 1; /* Switch to xxxPOS opcodes */
7474 *ketcode = OP_KETRPOS;
7475 }
7476
7477 /* If the minimum is zero, mark it as possessive, then unset the
7478 possessive flag when the minimum is 0 or 1. */
7479
7480 if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
7481 if (repeat_min < 2) possessive_quantifier = FALSE;
7482 }
7483
7484 /* Non-possessive quantifier */
7485
7486 else *ketcode = OP_KETRMAX + repeat_type;
7487 }
7488 }
7489 }
7490 break;
7491
7492 /* If previous was a character type match (\d or similar), abolish it and
7493 create a suitable repeat item. The code is shared with single-character
7494 repeats by setting op_type to add a suitable offset into repeat_type.
7495 Note that the Unicode property types will be present only when
7496 SUPPORT_UNICODE is defined, but we don't wrap the little bits of code
7497 here because it just makes it horribly messy. */
7498
7499 default:
7500 if (op_previous >= OP_EODN) /* Not a character type - internal error */
7501 {
7502 *errorcodeptr = ERR10;
7503 return 0;
7504 }
7505 else
7506 {
7507 int prop_type, prop_value;
7508 PCRE2_UCHAR *oldcode;
7509
7510 if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7511
7512 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
7513 mclength = 0; /* Not a character */
7514
7515 if (op_previous == OP_PROP || op_previous == OP_NOTPROP)
7516 {
7517 prop_type = previous[1];
7518 prop_value = previous[2];
7519 }
7520 else
7521 {
7522 /* Come here from just above with a character in mcbuffer/mclength. */
7523 OUTPUT_SINGLE_REPEAT:
7524 prop_type = prop_value = -1;
7525 }
7526
7527 /* At this point, if prop_type == prop_value == -1 we either have a
7528 character in mcbuffer when mclength is greater than zero, or we have
7529 mclength zero, in which case there is a non-property character type in
7530 op_previous. If prop_type/value are not negative, we have a property
7531 character type in op_previous. */
7532
7533 oldcode = code; /* Save where we were */
7534 code = previous; /* Usually overwrite previous item */
7535
7536 /* If the maximum is zero then the minimum must also be zero; Perl allows
7537 this case, so we do too - by simply omitting the item altogether. */
7538
7539 if (repeat_max == 0) goto END_REPEAT;
7540
7541 /* Combine the op_type with the repeat_type */
7542
7543 repeat_type += op_type;
7544
7545 /* A minimum of zero is handled either as the special case * or ?, or as
7546 an UPTO, with the maximum given. */
7547
7548 if (repeat_min == 0)
7549 {
7550 if (repeat_max == REPEAT_UNLIMITED) *code++ = OP_STAR + repeat_type;
7551 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
7552 else
7553 {
7554 *code++ = OP_UPTO + repeat_type;
7555 PUT2INC(code, 0, repeat_max);
7556 }
7557 }
7558
7559 /* A repeat minimum of 1 is optimized into some special cases. If the
7560 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
7561 left in place and, if the maximum is greater than 1, we use OP_UPTO with
7562 one less than the maximum. */
7563
7564 else if (repeat_min == 1)
7565 {
7566 if (repeat_max == REPEAT_UNLIMITED)
7567 *code++ = OP_PLUS + repeat_type;
7568 else
7569 {
7570 code = oldcode; /* Leave previous item in place */
7571 if (repeat_max == 1) goto END_REPEAT;
7572 *code++ = OP_UPTO + repeat_type;
7573 PUT2INC(code, 0, repeat_max - 1);
7574 }
7575 }
7576
7577 /* The case {n,n} is just an EXACT, while the general case {n,m} is
7578 handled as an EXACT followed by an UPTO or STAR or QUERY. */
7579
7580 else
7581 {
7582 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
7583 PUT2INC(code, 0, repeat_min);
7584
7585 /* Unless repeat_max equals repeat_min, fill in the data for EXACT,
7586 and then generate the second opcode. For a repeated Unicode property
7587 match, there are two extra values that define the required property,
7588 and mclength is set zero to indicate this. */
7589
7590 if (repeat_max != repeat_min)
7591 {
7592 if (mclength > 0)
7593 {
7594 memcpy(code, mcbuffer, CU2BYTES(mclength));
7595 code += mclength;
7596 }
7597 else
7598 {
7599 *code++ = op_previous;
7600 if (prop_type >= 0)
7601 {
7602 *code++ = prop_type;
7603 *code++ = prop_value;
7604 }
7605 }
7606
7607 /* Now set up the following opcode */
7608
7609 if (repeat_max == REPEAT_UNLIMITED)
7610 *code++ = OP_STAR + repeat_type;
7611 else
7612 {
7613 repeat_max -= repeat_min;
7614 if (repeat_max == 1)
7615 {
7616 *code++ = OP_QUERY + repeat_type;
7617 }
7618 else
7619 {
7620 *code++ = OP_UPTO + repeat_type;
7621 PUT2INC(code, 0, repeat_max);
7622 }
7623 }
7624 }
7625 }
7626
7627 /* Fill in the character or character type for the final opcode. */
7628
7629 if (mclength > 0)
7630 {
7631 memcpy(code, mcbuffer, CU2BYTES(mclength));
7632 code += mclength;
7633 }
7634 else
7635 {
7636 *code++ = op_previous;
7637 if (prop_type >= 0)
7638 {
7639 *code++ = prop_type;
7640 *code++ = prop_value;
7641 }
7642 }
7643 }
7644 break;
7645 } /* End of switch on different op_previous values */
7646
7647
7648 /* If the character following a repeat is '+', possessive_quantifier is
7649 TRUE. For some opcodes, there are special alternative opcodes for this
7650 case. For anything else, we wrap the entire repeated item inside OP_ONCE
7651 brackets. Logically, the '+' notation is just syntactic sugar, taken from
7652 Sun's Java package, but the special opcodes can optimize it.
7653
7654 Some (but not all) possessively repeated subpatterns have already been
7655 completely handled in the code just above. For them, possessive_quantifier
7656 is always FALSE at this stage. Note that the repeated item starts at
7657 tempcode, not at previous, which might be the first part of a string whose
7658 (former) last char we repeated. */
7659
7660 if (possessive_quantifier)
7661 {
7662 int len;
7663
7664 /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
7665 However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
7666 {5,}, or {5,10}). We skip over an EXACT item; if the length of what
7667 remains is greater than zero, there's a further opcode that can be
7668 handled. If not, do nothing, leaving the EXACT alone. */
7669
7670 switch(*tempcode)
7671 {
7672 case OP_TYPEEXACT:
7673 tempcode += PRIV(OP_lengths)[*tempcode] +
7674 ((tempcode[1 + IMM2_SIZE] == OP_PROP
7675 || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
7676 break;
7677
7678 /* CHAR opcodes are used for exacts whose count is 1. */
7679
7680 case OP_CHAR:
7681 case OP_CHARI:
7682 case OP_NOT:
7683 case OP_NOTI:
7684 case OP_EXACT:
7685 case OP_EXACTI:
7686 case OP_NOTEXACT:
7687 case OP_NOTEXACTI:
7688 tempcode += PRIV(OP_lengths)[*tempcode];
7689#ifdef SUPPORT_UNICODE
7690 if (utf && HAS_EXTRALEN(tempcode[-1]))
7691 tempcode += GET_EXTRALEN(tempcode[-1]);
7692#endif
7693 break;
7694
7695 /* For the class opcodes, the repeat operator appears at the end;
7696 adjust tempcode to point to it. */
7697
7698 case OP_CLASS:
7699 case OP_NCLASS:
7700 tempcode += 1 + 32/sizeof(PCRE2_UCHAR);
7701 break;
7702
7703#ifdef SUPPORT_WIDE_CHARS
7704 case OP_XCLASS:
7705 tempcode += GET(tempcode, 1);
7706 break;
7707#endif
7708 }
7709
7710 /* If tempcode is equal to code (which points to the end of the repeated
7711 item), it means we have skipped an EXACT item but there is no following
7712 QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
7713 all other cases, tempcode will be pointing to the repeat opcode, and will
7714 be less than code, so the value of len will be greater than 0. */
7715
7716 len = (int)(code - tempcode);
7717 if (len > 0)
7718 {
7719 unsigned int repcode = *tempcode;
7720
7721 /* There is a table for possessifying opcodes, all of which are less
7722 than OP_CALLOUT. A zero entry means there is no possessified version.
7723 */
7724
7725 if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
7726 *tempcode = opcode_possessify[repcode];
7727
7728 /* For opcode without a special possessified version, wrap the item in
7729 ONCE brackets. */
7730
7731 else
7732 {
7733 (void)memmove(tempcode + 1 + LINK_SIZE, tempcode, CU2BYTES(len));
7734 code += 1 + LINK_SIZE;
7735 len += 1 + LINK_SIZE;
7736 tempcode[0] = OP_ONCE;
7737 *code++ = OP_KET;
7738 PUTINC(code, 0, len);
7739 PUT(tempcode, 1, len);
7740 }
7741 }
7742 }
7743
7744 /* We set the "follows varying string" flag for subsequently encountered
7745 reqcus if it isn't already set and we have just passed a varying length
7746 item. */
7747
7748 END_REPEAT:
7749 cb->req_varyopt |= reqvary;
7750 break;
7751
7752
7753 /* ===================================================================*/
7754 /* Handle a 32-bit data character with a value greater than META_END. */
7755
7756 case META_BIGVALUE:
7757 pptr++;
7758 goto NORMAL_CHAR;
7759
7760
7761 /* ===============================================================*/
7762 /* Handle a back reference by number, which is the meta argument. The
7763 pattern offsets for back references to group numbers less than 10 are held
7764 in a special vector, to avoid using more than two parsed pattern elements
7765 in 64-bit environments. We only need the offset to the first occurrence,
7766 because if that doesn't fail, subsequent ones will also be OK. */
7767
7768 case META_BACKREF:
7769 if (meta_arg < 10) offset = cb->small_ref_offset[meta_arg];
7770 else GETPLUSOFFSET(offset, pptr);
7771
7772 if (meta_arg > cb->bracount)
7773 {
7774 cb->erroroffset = offset;
7775 *errorcodeptr = ERR15; /* Non-existent subpattern */
7776 return 0;
7777 }
7778
7779 /* Come here from named backref handling when the reference is to a
7780 single group (that is, not to a duplicated name). The back reference
7781 data will have already been updated. We must disable firstcu if not
7782 set, to cope with cases like (?=(\w+))\1: which would otherwise set ':'
7783 later. */
7784
7785 HANDLE_SINGLE_REFERENCE:
7786 if (firstcuflags == REQ_UNSET) zerofirstcuflags = firstcuflags = REQ_NONE;
7787 *code++ = ((options & PCRE2_CASELESS) != 0)? OP_REFI : OP_REF;
7788 PUT2INC(code, 0, meta_arg);
7789
7790 /* Update the map of back references, and keep the highest one. We
7791 could do this in parse_regex() for numerical back references, but not
7792 for named back references, because we don't know the numbers to which
7793 named back references refer. So we do it all in this function. */
7794
7795 cb->backref_map |= (meta_arg < 32)? (1u << meta_arg) : 1;
7796 if (meta_arg > cb->top_backref) cb->top_backref = meta_arg;
7797 break;
7798
7799
7800 /* ===============================================================*/
7801 /* Handle recursion by inserting the number of the called group (which is
7802 the meta argument) after OP_RECURSE. At the end of compiling the pattern is
7803 scanned and these numbers are replaced by offsets within the pattern. It is
7804 done like this to avoid problems with forward references and adjusting
7805 offsets when groups are duplicated and moved (as discovered in previous
7806 implementations). Note that a recursion does not have a set first
7807 character. */
7808
7809 case META_RECURSE:
7810 GETPLUSOFFSET(offset, pptr);
7811 if (meta_arg > cb->bracount)
7812 {
7813 cb->erroroffset = offset;
7814 *errorcodeptr = ERR15; /* Non-existent subpattern */
7815 return 0;
7816 }
7817 HANDLE_NUMERICAL_RECURSION:
7818 *code = OP_RECURSE;
7819 PUT(code, 1, meta_arg);
7820 code += 1 + LINK_SIZE;
7821 groupsetfirstcu = FALSE;
7822 cb->had_recurse = TRUE;
7823 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
7824 zerofirstcu = firstcu;
7825 zerofirstcuflags = firstcuflags;
7826 break;
7827
7828
7829 /* ===============================================================*/
7830 /* Handle capturing parentheses; the number is the meta argument. */
7831
7832 case META_CAPTURE:
7833 bravalue = OP_CBRA;
7834 skipunits = IMM2_SIZE;
7835 PUT2(code, 1+LINK_SIZE, meta_arg);
7836 cb->lastcapture = meta_arg;
7837 goto GROUP_PROCESS_NOTE_EMPTY;
7838
7839
7840 /* ===============================================================*/
7841 /* Handle escape sequence items. For ones like \d, the ESC_values are
7842 arranged to be the same as the corresponding OP_values in the default case
7843 when PCRE2_UCP is not set (which is the only case in which they will appear
7844 here).
7845
7846 Note: \Q and \E are never seen here, as they were dealt with in
7847 parse_pattern(). Neither are numerical back references or recursions, which
7848 were turned into META_BACKREF or META_RECURSE items, respectively. \k and
7849 \g, when followed by names, are turned into META_BACKREF_BYNAME or
7850 META_RECURSE_BYNAME. */
7851
7852 case META_ESCAPE:
7853
7854 /* We can test for escape sequences that consume a character because their
7855 values lie between ESC_b and ESC_Z; this may have to change if any new ones
7856 are ever created. For these sequences, we disable the setting of a first
7857 character if it hasn't already been set. */
7858
7859 if (meta_arg > ESC_b && meta_arg < ESC_Z)
7860 {
7861 matched_char = TRUE;
7862 if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
7863 }
7864
7865 /* Set values to reset to if this is followed by a zero repeat. */
7866
7867 zerofirstcu = firstcu;
7868 zerofirstcuflags = firstcuflags;
7869 zeroreqcu = reqcu;
7870 zeroreqcuflags = reqcuflags;
7871
7872 /* If Unicode is not supported, \P and \p are not allowed and are
7873 faulted at parse time, so will never appear here. */
7874
7875#ifdef SUPPORT_UNICODE
7876 if (meta_arg == ESC_P || meta_arg == ESC_p)
7877 {
7878 uint32_t ptype = *(++pptr) >> 16;
7879 uint32_t pdata = *pptr & 0xffff;
7880
7881 /* The special case of \p{Any} is compiled to OP_ALLANY so as to benefit
7882 from the auto-anchoring code. */
7883
7884 if (meta_arg == ESC_p && ptype == PT_ANY)
7885 {
7886 *code++ = OP_ALLANY;
7887 }
7888 else
7889 {
7890 *code++ = (meta_arg == ESC_p)? OP_PROP : OP_NOTPROP;
7891 *code++ = ptype;
7892 *code++ = pdata;
7893 }
7894 break; /* End META_ESCAPE */
7895 }
7896#endif
7897
7898 /* \K is forbidden in lookarounds since 10.38 because that's what Perl has
7899 done. However, there's an option, in case anyone was relying on it. */
7900
7901 if (cb->assert_depth > 0 && meta_arg == ESC_K &&
7902 (cb->cx->extra_options & PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK) == 0)
7903 {
7904 *errorcodeptr = ERR99;
7905 return 0;
7906 }
7907
7908 /* For the rest (including \X when Unicode is supported - if not it's
7909 faulted at parse time), the OP value is the escape value when PCRE2_UCP is
7910 not set; if it is set, these escapes do not show up here because they are
7911 converted into Unicode property tests in parse_regex(). Note that \b and \B
7912 do a one-character lookbehind, and \A also behaves as if it does. */
7913
7914 if (meta_arg == ESC_C) cb->external_flags |= PCRE2_HASBKC; /* Record */
7915 if ((meta_arg == ESC_b || meta_arg == ESC_B || meta_arg == ESC_A) &&
7916 cb->max_lookbehind == 0)
7917 cb->max_lookbehind = 1;
7918
7919 /* In non-UTF mode, and for both 32-bit modes, we turn \C into OP_ALLANY
7920 instead of OP_ANYBYTE so that it works in DFA mode and in lookbehinds. */
7921
7922#if PCRE2_CODE_UNIT_WIDTH == 32
7923 *code++ = (meta_arg == ESC_C)? OP_ALLANY : meta_arg;
7924#else
7925 *code++ = (!utf && meta_arg == ESC_C)? OP_ALLANY : meta_arg;
7926#endif
7927 break; /* End META_ESCAPE */
7928
7929
7930 /* ===================================================================*/
7931 /* Handle an unrecognized meta value. A parsed pattern value less than
7932 META_END is a literal. Otherwise we have a problem. */
7933
7934 default:
7935 if (meta >= META_END)
7936 {
7937#ifdef DEBUG_SHOW_PARSED
7938 fprintf(stderr, "** Unrecognized parsed pattern item 0x%.8x\n", *pptr);
7939#endif
7940 *errorcodeptr = ERR89; /* Internal error - unrecognized. */
7941 return 0;
7942 }
7943
7944 /* Handle a literal character. We come here by goto in the case of a
7945 32-bit, non-UTF character whose value is greater than META_END. */
7946
7947 NORMAL_CHAR:
7948 meta = *pptr; /* Get the full 32 bits */
7949 NORMAL_CHAR_SET: /* Character is already in meta */
7950 matched_char = TRUE;
7951
7952 /* For caseless UTF or UCP mode, check whether this character has more than
7953 one other case. If so, generate a special OP_PROP item instead of OP_CHARI.
7954 */
7955
7956#ifdef SUPPORT_UNICODE
7957 if ((utf||ucp) && (options & PCRE2_CASELESS) != 0)
7958 {
7959 uint32_t caseset = UCD_CASESET(meta);
7960 if (caseset != 0)
7961 {
7962 *code++ = OP_PROP;
7963 *code++ = PT_CLIST;
7964 *code++ = caseset;
7965 if (firstcuflags == REQ_UNSET)
7966 firstcuflags = zerofirstcuflags = REQ_NONE;
7967 break; /* End handling this meta item */
7968 }
7969 }
7970#endif
7971
7972 /* Caseful matches, or caseless and not one of the multicase characters. We
7973 come here by goto in the case of a positive class that contains only
7974 case-partners of a character with just two cases; matched_char has already
7975 been set TRUE and options fudged if necessary. */
7976
7977 CLASS_CASELESS_CHAR:
7978
7979 /* Get the character's code units into mcbuffer, with the length in
7980 mclength. When not in UTF mode, the length is always 1. */
7981
7982#ifdef SUPPORT_UNICODE
7983 if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
7984#endif
7985 {
7986 mclength = 1;
7987 mcbuffer[0] = meta;
7988 }
7989
7990 /* Generate the appropriate code */
7991
7992 *code++ = ((options & PCRE2_CASELESS) != 0)? OP_CHARI : OP_CHAR;
7993 memcpy(code, mcbuffer, CU2BYTES(mclength));
7994 code += mclength;
7995
7996 /* Remember if \r or \n were seen */
7997
7998 if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
7999 cb->external_flags |= PCRE2_HASCRORLF;
8000
8001 /* Set the first and required code units appropriately. If no previous
8002 first code unit, set it from this character, but revert to none on a zero
8003 repeat. Otherwise, leave the firstcu value alone, and don't change it on
8004 a zero repeat. */
8005
8006 if (firstcuflags == REQ_UNSET)
8007 {
8008 zerofirstcuflags = REQ_NONE;
8009 zeroreqcu = reqcu;
8010 zeroreqcuflags = reqcuflags;
8011
8012 /* If the character is more than one code unit long, we can set a single
8013 firstcu only if it is not to be matched caselessly. Multiple possible
8014 starting code units may be picked up later in the studying code. */
8015
8016 if (mclength == 1 || req_caseopt == 0)
8017 {
8018 firstcu = mcbuffer[0];
8019 firstcuflags = req_caseopt;
8020 if (mclength != 1)
8021 {
8022 reqcu = code[-1];
8023 reqcuflags = cb->req_varyopt;
8024 }
8025 }
8026 else firstcuflags = reqcuflags = REQ_NONE;
8027 }
8028
8029 /* firstcu was previously set; we can set reqcu only if the length is
8030 1 or the matching is caseful. */
8031
8032 else
8033 {
8034 zerofirstcu = firstcu;
8035 zerofirstcuflags = firstcuflags;
8036 zeroreqcu = reqcu;
8037 zeroreqcuflags = reqcuflags;
8038 if (mclength == 1 || req_caseopt == 0)
8039 {
8040 reqcu = code[-1];
8041 reqcuflags = req_caseopt | cb->req_varyopt;
8042 }
8043 }
8044
8045 /* If caselessness was temporarily instated, reset it. */
8046
8047 if (reset_caseful)
8048 {
8049 options &= ~PCRE2_CASELESS;
8050 req_caseopt = 0;
8051 reset_caseful = FALSE;
8052 }
8053
8054 break; /* End literal character handling */
8055 } /* End of big switch */
8056 } /* End of big loop */
8057
8058/* Control never reaches here. */
8059}
8060
8061
8062
8063/*************************************************
8064* Compile regex: a sequence of alternatives *
8065*************************************************/
8066
8067/* On entry, pptr is pointing past the bracket meta, but on return it points to
8068the closing bracket or META_END. The code variable is pointing at the code unit
8069into which the BRA operator has been stored. This function is used during the
8070pre-compile phase when we are trying to find out the amount of memory needed,
8071as well as during the real compile phase. The value of lengthptr distinguishes
8072the two phases.
8073
8074Arguments:
8075 options option bits, including any changes for this subpattern
8076 codeptr -> the address of the current code pointer
8077 pptrptr -> the address of the current parsed pattern pointer
8078 errorcodeptr -> pointer to error code variable
8079 skipunits skip this many code units at start (for brackets and OP_COND)
8080 firstcuptr place to put the first required code unit
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07008081 firstcuflagsptr place to put the first code unit flags
Elliott Hughes5b808042021-10-01 10:56:10 -07008082 reqcuptr place to put the last required code unit
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07008083 reqcuflagsptr place to put the last required code unit flags
Elliott Hughes5b808042021-10-01 10:56:10 -07008084 bcptr pointer to the chain of currently open branches
8085 cb points to the data block with tables pointers etc.
8086 lengthptr NULL during the real compile phase
8087 points to length accumulator during pre-compile phase
8088
8089Returns: 0 There has been an error
8090 +1 Success, this group must match at least one character
8091 -1 Success, this group may match an empty string
8092*/
8093
8094static int
8095compile_regex(uint32_t options, PCRE2_UCHAR **codeptr, uint32_t **pptrptr,
8096 int *errorcodeptr, uint32_t skipunits, uint32_t *firstcuptr,
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07008097 uint32_t *firstcuflagsptr, uint32_t *reqcuptr, uint32_t *reqcuflagsptr,
Elliott Hughes5b808042021-10-01 10:56:10 -07008098 branch_chain *bcptr, compile_block *cb, PCRE2_SIZE *lengthptr)
8099{
8100PCRE2_UCHAR *code = *codeptr;
8101PCRE2_UCHAR *last_branch = code;
8102PCRE2_UCHAR *start_bracket = code;
8103BOOL lookbehind;
8104open_capitem capitem;
8105int capnumber = 0;
8106int okreturn = 1;
8107uint32_t *pptr = *pptrptr;
8108uint32_t firstcu, reqcu;
8109uint32_t lookbehindlength;
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07008110uint32_t firstcuflags, reqcuflags;
Elliott Hughes5b808042021-10-01 10:56:10 -07008111uint32_t branchfirstcu, branchreqcu;
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07008112uint32_t branchfirstcuflags, branchreqcuflags;
Elliott Hughes5b808042021-10-01 10:56:10 -07008113PCRE2_SIZE length;
8114branch_chain bc;
8115
8116/* If set, call the external function that checks for stack availability. */
8117
8118if (cb->cx->stack_guard != NULL &&
8119 cb->cx->stack_guard(cb->parens_depth, cb->cx->stack_guard_data))
8120 {
8121 *errorcodeptr= ERR33;
8122 return 0;
8123 }
8124
8125/* Miscellaneous initialization */
8126
8127bc.outer = bcptr;
8128bc.current_branch = code;
8129
8130firstcu = reqcu = 0;
8131firstcuflags = reqcuflags = REQ_UNSET;
8132
8133/* Accumulate the length for use in the pre-compile phase. Start with the
8134length of the BRA and KET and any extra code units that are required at the
8135beginning. We accumulate in a local variable to save frequent testing of
8136lengthptr for NULL. We cannot do this by looking at the value of 'code' at the
8137start and end of each alternative, because compiled items are discarded during
8138the pre-compile phase so that the workspace is not exceeded. */
8139
8140length = 2 + 2*LINK_SIZE + skipunits;
8141
8142/* Remember if this is a lookbehind assertion, and if it is, save its length
8143and skip over the pattern offset. */
8144
8145lookbehind = *code == OP_ASSERTBACK ||
8146 *code == OP_ASSERTBACK_NOT ||
8147 *code == OP_ASSERTBACK_NA;
8148
8149if (lookbehind)
8150 {
8151 lookbehindlength = META_DATA(pptr[-1]);
8152 pptr += SIZEOFFSET;
8153 }
8154else lookbehindlength = 0;
8155
8156/* If this is a capturing subpattern, add to the chain of open capturing items
8157so that we can detect them if (*ACCEPT) is encountered. Note that only OP_CBRA
8158need be tested here; changing this opcode to one of its variants, e.g.
8159OP_SCBRAPOS, happens later, after the group has been compiled. */
8160
8161if (*code == OP_CBRA)
8162 {
8163 capnumber = GET2(code, 1 + LINK_SIZE);
8164 capitem.number = capnumber;
8165 capitem.next = cb->open_caps;
8166 capitem.assert_depth = cb->assert_depth;
8167 cb->open_caps = &capitem;
8168 }
8169
8170/* Offset is set zero to mark that this bracket is still open */
8171
8172PUT(code, 1, 0);
8173code += 1 + LINK_SIZE + skipunits;
8174
8175/* Loop for each alternative branch */
8176
8177for (;;)
8178 {
8179 int branch_return;
8180
8181 /* Insert OP_REVERSE if this is as lookbehind assertion. */
8182
8183 if (lookbehind && lookbehindlength > 0)
8184 {
8185 *code++ = OP_REVERSE;
8186 PUTINC(code, 0, lookbehindlength);
8187 length += 1 + LINK_SIZE;
8188 }
8189
8190 /* Now compile the branch; in the pre-compile phase its length gets added
8191 into the length. */
8192
8193 if ((branch_return =
8194 compile_branch(&options, &code, &pptr, errorcodeptr, &branchfirstcu,
8195 &branchfirstcuflags, &branchreqcu, &branchreqcuflags, &bc,
8196 cb, (lengthptr == NULL)? NULL : &length)) == 0)
8197 return 0;
8198
8199 /* If a branch can match an empty string, so can the whole group. */
8200
8201 if (branch_return < 0) okreturn = -1;
8202
8203 /* In the real compile phase, there is some post-processing to be done. */
8204
8205 if (lengthptr == NULL)
8206 {
8207 /* If this is the first branch, the firstcu and reqcu values for the
8208 branch become the values for the regex. */
8209
8210 if (*last_branch != OP_ALT)
8211 {
8212 firstcu = branchfirstcu;
8213 firstcuflags = branchfirstcuflags;
8214 reqcu = branchreqcu;
8215 reqcuflags = branchreqcuflags;
8216 }
8217
8218 /* If this is not the first branch, the first char and reqcu have to
8219 match the values from all the previous branches, except that if the
8220 previous value for reqcu didn't have REQ_VARY set, it can still match,
8221 and we set REQ_VARY for the group from this branch's value. */
8222
8223 else
8224 {
8225 /* If we previously had a firstcu, but it doesn't match the new branch,
8226 we have to abandon the firstcu for the regex, but if there was
8227 previously no reqcu, it takes on the value of the old firstcu. */
8228
8229 if (firstcuflags != branchfirstcuflags || firstcu != branchfirstcu)
8230 {
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07008231 if (firstcuflags < REQ_NONE)
Elliott Hughes5b808042021-10-01 10:56:10 -07008232 {
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07008233 if (reqcuflags >= REQ_NONE)
Elliott Hughes5b808042021-10-01 10:56:10 -07008234 {
8235 reqcu = firstcu;
8236 reqcuflags = firstcuflags;
8237 }
8238 }
8239 firstcuflags = REQ_NONE;
8240 }
8241
8242 /* If we (now or from before) have no firstcu, a firstcu from the
8243 branch becomes a reqcu if there isn't a branch reqcu. */
8244
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07008245 if (firstcuflags >= REQ_NONE && branchfirstcuflags < REQ_NONE &&
8246 branchreqcuflags >= REQ_NONE)
Elliott Hughes5b808042021-10-01 10:56:10 -07008247 {
8248 branchreqcu = branchfirstcu;
8249 branchreqcuflags = branchfirstcuflags;
8250 }
8251
8252 /* Now ensure that the reqcus match */
8253
8254 if (((reqcuflags & ~REQ_VARY) != (branchreqcuflags & ~REQ_VARY)) ||
8255 reqcu != branchreqcu)
8256 reqcuflags = REQ_NONE;
8257 else
8258 {
8259 reqcu = branchreqcu;
8260 reqcuflags |= branchreqcuflags; /* To "or" REQ_VARY if present */
8261 }
8262 }
8263 }
8264
8265 /* Handle reaching the end of the expression, either ')' or end of pattern.
8266 In the real compile phase, go back through the alternative branches and
8267 reverse the chain of offsets, with the field in the BRA item now becoming an
8268 offset to the first alternative. If there are no alternatives, it points to
8269 the end of the group. The length in the terminating ket is always the length
8270 of the whole bracketed item. Return leaving the pointer at the terminating
8271 char. */
8272
8273 if (META_CODE(*pptr) != META_ALT)
8274 {
8275 if (lengthptr == NULL)
8276 {
8277 PCRE2_SIZE branch_length = code - last_branch;
8278 do
8279 {
8280 PCRE2_SIZE prev_length = GET(last_branch, 1);
8281 PUT(last_branch, 1, branch_length);
8282 branch_length = prev_length;
8283 last_branch -= branch_length;
8284 }
8285 while (branch_length > 0);
8286 }
8287
8288 /* Fill in the ket */
8289
8290 *code = OP_KET;
8291 PUT(code, 1, (int)(code - start_bracket));
8292 code += 1 + LINK_SIZE;
8293
8294 /* If it was a capturing subpattern, remove the block from the chain. */
8295
8296 if (capnumber > 0) cb->open_caps = cb->open_caps->next;
8297
8298 /* Set values to pass back */
8299
8300 *codeptr = code;
8301 *pptrptr = pptr;
8302 *firstcuptr = firstcu;
8303 *firstcuflagsptr = firstcuflags;
8304 *reqcuptr = reqcu;
8305 *reqcuflagsptr = reqcuflags;
8306 if (lengthptr != NULL)
8307 {
8308 if (OFLOW_MAX - *lengthptr < length)
8309 {
8310 *errorcodeptr = ERR20;
8311 return 0;
8312 }
8313 *lengthptr += length;
8314 }
8315 return okreturn;
8316 }
8317
8318 /* Another branch follows. In the pre-compile phase, we can move the code
8319 pointer back to where it was for the start of the first branch. (That is,
8320 pretend that each branch is the only one.)
8321
8322 In the real compile phase, insert an ALT node. Its length field points back
8323 to the previous branch while the bracket remains open. At the end the chain
8324 is reversed. It's done like this so that the start of the bracket has a
8325 zero offset until it is closed, making it possible to detect recursion. */
8326
8327 if (lengthptr != NULL)
8328 {
8329 code = *codeptr + 1 + LINK_SIZE + skipunits;
8330 length += 1 + LINK_SIZE;
8331 }
8332 else
8333 {
8334 *code = OP_ALT;
8335 PUT(code, 1, (int)(code - last_branch));
8336 bc.current_branch = last_branch = code;
8337 code += 1 + LINK_SIZE;
8338 }
8339
8340 /* Set the lookbehind length (if not in a lookbehind the value will be zero)
8341 and then advance past the vertical bar. */
8342
8343 lookbehindlength = META_DATA(*pptr);
8344 pptr++;
8345 }
8346/* Control never reaches here */
8347}
8348
8349
8350
8351/*************************************************
8352* Check for anchored pattern *
8353*************************************************/
8354
8355/* Try to find out if this is an anchored regular expression. Consider each
8356alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
8357all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
8358it's anchored. However, if this is a multiline pattern, then only OP_SOD will
8359be found, because ^ generates OP_CIRCM in that mode.
8360
8361We can also consider a regex to be anchored if OP_SOM starts all its branches.
8362This is the code for \G, which means "match at start of match position, taking
8363into account the match offset".
8364
8365A branch is also implicitly anchored if it starts with .* and DOTALL is set,
8366because that will try the rest of the pattern at all possible matching points,
8367so there is no point trying again.... er ....
8368
8369.... except when the .* appears inside capturing parentheses, and there is a
8370subsequent back reference to those parentheses. We haven't enough information
8371to catch that case precisely.
8372
8373At first, the best we could do was to detect when .* was in capturing brackets
8374and the highest back reference was greater than or equal to that level.
8375However, by keeping a bitmap of the first 31 back references, we can catch some
8376of the more common cases more precisely.
8377
8378... A second exception is when the .* appears inside an atomic group, because
8379this prevents the number of characters it matches from being adjusted.
8380
8381Arguments:
8382 code points to start of the compiled pattern
8383 bracket_map a bitmap of which brackets we are inside while testing; this
8384 handles up to substring 31; after that we just have to take
8385 the less precise approach
8386 cb points to the compile data block
8387 atomcount atomic group level
8388 inassert TRUE if in an assertion
8389
8390Returns: TRUE or FALSE
8391*/
8392
8393static BOOL
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07008394is_anchored(PCRE2_SPTR code, uint32_t bracket_map, compile_block *cb,
Elliott Hughes5b808042021-10-01 10:56:10 -07008395 int atomcount, BOOL inassert)
8396{
8397do {
8398 PCRE2_SPTR scode = first_significant_code(
8399 code + PRIV(OP_lengths)[*code], FALSE);
8400 int op = *scode;
8401
8402 /* Non-capturing brackets */
8403
8404 if (op == OP_BRA || op == OP_BRAPOS ||
8405 op == OP_SBRA || op == OP_SBRAPOS)
8406 {
8407 if (!is_anchored(scode, bracket_map, cb, atomcount, inassert))
8408 return FALSE;
8409 }
8410
8411 /* Capturing brackets */
8412
8413 else if (op == OP_CBRA || op == OP_CBRAPOS ||
8414 op == OP_SCBRA || op == OP_SCBRAPOS)
8415 {
8416 int n = GET2(scode, 1+LINK_SIZE);
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07008417 uint32_t new_map = bracket_map | ((n < 32)? (1u << n) : 1);
Elliott Hughes5b808042021-10-01 10:56:10 -07008418 if (!is_anchored(scode, new_map, cb, atomcount, inassert)) return FALSE;
8419 }
8420
8421 /* Positive forward assertion */
8422
8423 else if (op == OP_ASSERT || op == OP_ASSERT_NA)
8424 {
8425 if (!is_anchored(scode, bracket_map, cb, atomcount, TRUE)) return FALSE;
8426 }
8427
8428 /* Condition. If there is no second branch, it can't be anchored. */
8429
8430 else if (op == OP_COND || op == OP_SCOND)
8431 {
8432 if (scode[GET(scode,1)] != OP_ALT) return FALSE;
8433 if (!is_anchored(scode, bracket_map, cb, atomcount, inassert))
8434 return FALSE;
8435 }
8436
8437 /* Atomic groups */
8438
8439 else if (op == OP_ONCE)
8440 {
8441 if (!is_anchored(scode, bracket_map, cb, atomcount + 1, inassert))
8442 return FALSE;
8443 }
8444
8445 /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
8446 it isn't in brackets that are or may be referenced or inside an atomic
8447 group or an assertion. Also the pattern must not contain *PRUNE or *SKIP,
8448 because these break the feature. Consider, for example, /(?s).*?(*PRUNE)b/
8449 with the subject "aab", which matches "b", i.e. not at the start of a line.
8450 There is also an option that disables auto-anchoring. */
8451
8452 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
8453 op == OP_TYPEPOSSTAR))
8454 {
8455 if (scode[1] != OP_ALLANY || (bracket_map & cb->backref_map) != 0 ||
8456 atomcount > 0 || cb->had_pruneorskip || inassert ||
8457 (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
8458 return FALSE;
8459 }
8460
8461 /* Check for explicit anchoring */
8462
8463 else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
8464
8465 code += GET(code, 1);
8466 }
8467while (*code == OP_ALT); /* Loop for each alternative */
8468return TRUE;
8469}
8470
8471
8472
8473/*************************************************
8474* Check for starting with ^ or .* *
8475*************************************************/
8476
8477/* This is called to find out if every branch starts with ^ or .* so that
8478"first char" processing can be done to speed things up in multiline
8479matching and for non-DOTALL patterns that start with .* (which must start at
8480the beginning or after \n). As in the case of is_anchored() (see above), we
8481have to take account of back references to capturing brackets that contain .*
8482because in that case we can't make the assumption. Also, the appearance of .*
8483inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE
8484or *SKIP does not count, because once again the assumption no longer holds.
8485
8486Arguments:
8487 code points to start of the compiled pattern or a group
8488 bracket_map a bitmap of which brackets we are inside while testing; this
8489 handles up to substring 31; after that we just have to take
8490 the less precise approach
8491 cb points to the compile data
8492 atomcount atomic group level
8493 inassert TRUE if in an assertion
8494
8495Returns: TRUE or FALSE
8496*/
8497
8498static BOOL
8499is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
8500 int atomcount, BOOL inassert)
8501{
8502do {
8503 PCRE2_SPTR scode = first_significant_code(
8504 code + PRIV(OP_lengths)[*code], FALSE);
8505 int op = *scode;
8506
8507 /* If we are at the start of a conditional assertion group, *both* the
8508 conditional assertion *and* what follows the condition must satisfy the test
8509 for start of line. Other kinds of condition fail. Note that there may be an
8510 auto-callout at the start of a condition. */
8511
8512 if (op == OP_COND)
8513 {
8514 scode += 1 + LINK_SIZE;
8515
8516 if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
8517 else if (*scode == OP_CALLOUT_STR) scode += GET(scode, 1 + 2*LINK_SIZE);
8518
8519 switch (*scode)
8520 {
8521 case OP_CREF:
8522 case OP_DNCREF:
8523 case OP_RREF:
8524 case OP_DNRREF:
8525 case OP_FAIL:
8526 case OP_FALSE:
8527 case OP_TRUE:
8528 return FALSE;
8529
8530 default: /* Assertion */
8531 if (!is_startline(scode, bracket_map, cb, atomcount, TRUE)) return FALSE;
8532 do scode += GET(scode, 1); while (*scode == OP_ALT);
8533 scode += 1 + LINK_SIZE;
8534 break;
8535 }
8536 scode = first_significant_code(scode, FALSE);
8537 op = *scode;
8538 }
8539
8540 /* Non-capturing brackets */
8541
8542 if (op == OP_BRA || op == OP_BRAPOS ||
8543 op == OP_SBRA || op == OP_SBRAPOS)
8544 {
8545 if (!is_startline(scode, bracket_map, cb, atomcount, inassert))
8546 return FALSE;
8547 }
8548
8549 /* Capturing brackets */
8550
8551 else if (op == OP_CBRA || op == OP_CBRAPOS ||
8552 op == OP_SCBRA || op == OP_SCBRAPOS)
8553 {
8554 int n = GET2(scode, 1+LINK_SIZE);
8555 int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
8556 if (!is_startline(scode, new_map, cb, atomcount, inassert)) return FALSE;
8557 }
8558
8559 /* Positive forward assertions */
8560
8561 else if (op == OP_ASSERT || op == OP_ASSERT_NA)
8562 {
8563 if (!is_startline(scode, bracket_map, cb, atomcount, TRUE))
8564 return FALSE;
8565 }
8566
8567 /* Atomic brackets */
8568
8569 else if (op == OP_ONCE)
8570 {
8571 if (!is_startline(scode, bracket_map, cb, atomcount + 1, inassert))
8572 return FALSE;
8573 }
8574
8575 /* .* means "start at start or after \n" if it isn't in atomic brackets or
8576 brackets that may be referenced or an assertion, and as long as the pattern
8577 does not contain *PRUNE or *SKIP, because these break the feature. Consider,
8578 for example, /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab",
8579 i.e. not at the start of a line. There is also an option that disables this
8580 optimization. */
8581
8582 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
8583 {
8584 if (scode[1] != OP_ANY || (bracket_map & cb->backref_map) != 0 ||
8585 atomcount > 0 || cb->had_pruneorskip || inassert ||
8586 (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
8587 return FALSE;
8588 }
8589
8590 /* Check for explicit circumflex; anything else gives a FALSE result. Note
8591 in particular that this includes atomic brackets OP_ONCE because the number
8592 of characters matched by .* cannot be adjusted inside them. */
8593
8594 else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
8595
8596 /* Move on to the next alternative */
8597
8598 code += GET(code, 1);
8599 }
8600while (*code == OP_ALT); /* Loop for each alternative */
8601return TRUE;
8602}
8603
8604
8605
8606/*************************************************
8607* Scan compiled regex for recursion reference *
8608*************************************************/
8609
8610/* This function scans through a compiled pattern until it finds an instance of
8611OP_RECURSE.
8612
8613Arguments:
8614 code points to start of expression
8615 utf TRUE in UTF mode
8616
8617Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
8618*/
8619
8620static PCRE2_SPTR
8621find_recurse(PCRE2_SPTR code, BOOL utf)
8622{
8623for (;;)
8624 {
8625 PCRE2_UCHAR c = *code;
8626 if (c == OP_END) return NULL;
8627 if (c == OP_RECURSE) return code;
8628
8629 /* XCLASS is used for classes that cannot be represented just by a bit map.
8630 This includes negated single high-valued characters. CALLOUT_STR is used for
8631 callouts with string arguments. In both cases the length in the table is
8632 zero; the actual length is stored in the compiled code. */
8633
8634 if (c == OP_XCLASS) code += GET(code, 1);
8635 else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE);
8636
8637 /* Otherwise, we can get the item's length from the table, except that for
8638 repeated character types, we have to test for \p and \P, which have an extra
8639 two code units of parameters, and for MARK/PRUNE/SKIP/THEN with an argument,
8640 we must add in its length. */
8641
8642 else
8643 {
8644 switch(c)
8645 {
8646 case OP_TYPESTAR:
8647 case OP_TYPEMINSTAR:
8648 case OP_TYPEPLUS:
8649 case OP_TYPEMINPLUS:
8650 case OP_TYPEQUERY:
8651 case OP_TYPEMINQUERY:
8652 case OP_TYPEPOSSTAR:
8653 case OP_TYPEPOSPLUS:
8654 case OP_TYPEPOSQUERY:
8655 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
8656 break;
8657
8658 case OP_TYPEPOSUPTO:
8659 case OP_TYPEUPTO:
8660 case OP_TYPEMINUPTO:
8661 case OP_TYPEEXACT:
8662 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
8663 code += 2;
8664 break;
8665
8666 case OP_MARK:
8667 case OP_COMMIT_ARG:
8668 case OP_PRUNE_ARG:
8669 case OP_SKIP_ARG:
8670 case OP_THEN_ARG:
8671 code += code[1];
8672 break;
8673 }
8674
8675 /* Add in the fixed length from the table */
8676
8677 code += PRIV(OP_lengths)[c];
8678
8679 /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may
8680 be followed by a multi-unit character. The length in the table is a
8681 minimum, so we have to arrange to skip the extra units. */
8682
8683#ifdef MAYBE_UTF_MULTI
8684 if (utf) switch(c)
8685 {
8686 case OP_CHAR:
8687 case OP_CHARI:
8688 case OP_NOT:
8689 case OP_NOTI:
8690 case OP_EXACT:
8691 case OP_EXACTI:
8692 case OP_NOTEXACT:
8693 case OP_NOTEXACTI:
8694 case OP_UPTO:
8695 case OP_UPTOI:
8696 case OP_NOTUPTO:
8697 case OP_NOTUPTOI:
8698 case OP_MINUPTO:
8699 case OP_MINUPTOI:
8700 case OP_NOTMINUPTO:
8701 case OP_NOTMINUPTOI:
8702 case OP_POSUPTO:
8703 case OP_POSUPTOI:
8704 case OP_NOTPOSUPTO:
8705 case OP_NOTPOSUPTOI:
8706 case OP_STAR:
8707 case OP_STARI:
8708 case OP_NOTSTAR:
8709 case OP_NOTSTARI:
8710 case OP_MINSTAR:
8711 case OP_MINSTARI:
8712 case OP_NOTMINSTAR:
8713 case OP_NOTMINSTARI:
8714 case OP_POSSTAR:
8715 case OP_POSSTARI:
8716 case OP_NOTPOSSTAR:
8717 case OP_NOTPOSSTARI:
8718 case OP_PLUS:
8719 case OP_PLUSI:
8720 case OP_NOTPLUS:
8721 case OP_NOTPLUSI:
8722 case OP_MINPLUS:
8723 case OP_MINPLUSI:
8724 case OP_NOTMINPLUS:
8725 case OP_NOTMINPLUSI:
8726 case OP_POSPLUS:
8727 case OP_POSPLUSI:
8728 case OP_NOTPOSPLUS:
8729 case OP_NOTPOSPLUSI:
8730 case OP_QUERY:
8731 case OP_QUERYI:
8732 case OP_NOTQUERY:
8733 case OP_NOTQUERYI:
8734 case OP_MINQUERY:
8735 case OP_MINQUERYI:
8736 case OP_NOTMINQUERY:
8737 case OP_NOTMINQUERYI:
8738 case OP_POSQUERY:
8739 case OP_POSQUERYI:
8740 case OP_NOTPOSQUERY:
8741 case OP_NOTPOSQUERYI:
8742 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
8743 break;
8744 }
8745#else
8746 (void)(utf); /* Keep compiler happy by referencing function argument */
8747#endif /* MAYBE_UTF_MULTI */
8748 }
8749 }
8750}
8751
8752
8753
8754/*************************************************
8755* Check for asserted fixed first code unit *
8756*************************************************/
8757
8758/* During compilation, the "first code unit" settings from forward assertions
8759are discarded, because they can cause conflicts with actual literals that
8760follow. However, if we end up without a first code unit setting for an
8761unanchored pattern, it is worth scanning the regex to see if there is an
8762initial asserted first code unit. If all branches start with the same asserted
8763code unit, or with a non-conditional bracket all of whose alternatives start
8764with the same asserted code unit (recurse ad lib), then we return that code
8765unit, with the flags set to zero or REQ_CASELESS; otherwise return zero with
8766REQ_NONE in the flags.
8767
8768Arguments:
8769 code points to start of compiled pattern
8770 flags points to the first code unit flags
8771 inassert non-zero if in an assertion
8772
8773Returns: the fixed first code unit, or 0 with REQ_NONE in flags
8774*/
8775
8776static uint32_t
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07008777find_firstassertedcu(PCRE2_SPTR code, uint32_t *flags, uint32_t inassert)
Elliott Hughes5b808042021-10-01 10:56:10 -07008778{
8779uint32_t c = 0;
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07008780uint32_t cflags = REQ_NONE;
Elliott Hughes5b808042021-10-01 10:56:10 -07008781
8782*flags = REQ_NONE;
8783do {
8784 uint32_t d;
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07008785 uint32_t dflags;
Elliott Hughes5b808042021-10-01 10:56:10 -07008786 int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
8787 *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;
8788 PCRE2_SPTR scode = first_significant_code(code + 1+LINK_SIZE + xl, TRUE);
8789 PCRE2_UCHAR op = *scode;
8790
8791 switch(op)
8792 {
8793 default:
8794 return 0;
8795
8796 case OP_BRA:
8797 case OP_BRAPOS:
8798 case OP_CBRA:
8799 case OP_SCBRA:
8800 case OP_CBRAPOS:
8801 case OP_SCBRAPOS:
8802 case OP_ASSERT:
8803 case OP_ASSERT_NA:
8804 case OP_ONCE:
8805 case OP_SCRIPT_RUN:
8806 d = find_firstassertedcu(scode, &dflags, inassert +
8807 ((op == OP_ASSERT || op == OP_ASSERT_NA)?1:0));
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07008808 if (dflags >= REQ_NONE) return 0;
8809 if (cflags >= REQ_NONE) { c = d; cflags = dflags; }
Elliott Hughes5b808042021-10-01 10:56:10 -07008810 else if (c != d || cflags != dflags) return 0;
8811 break;
8812
8813 case OP_EXACT:
8814 scode += IMM2_SIZE;
8815 /* Fall through */
8816
8817 case OP_CHAR:
8818 case OP_PLUS:
8819 case OP_MINPLUS:
8820 case OP_POSPLUS:
8821 if (inassert == 0) return 0;
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07008822 if (cflags >= REQ_NONE) { c = scode[1]; cflags = 0; }
Elliott Hughes5b808042021-10-01 10:56:10 -07008823 else if (c != scode[1]) return 0;
8824 break;
8825
8826 case OP_EXACTI:
8827 scode += IMM2_SIZE;
8828 /* Fall through */
8829
8830 case OP_CHARI:
8831 case OP_PLUSI:
8832 case OP_MINPLUSI:
8833 case OP_POSPLUSI:
8834 if (inassert == 0) return 0;
8835
8836 /* If the character is more than one code unit long, we cannot set its
8837 first code unit when matching caselessly. Later scanning may pick up
8838 multiple code units. */
8839
8840#ifdef SUPPORT_UNICODE
8841#if PCRE2_CODE_UNIT_WIDTH == 8
8842 if (scode[1] >= 0x80) return 0;
8843#elif PCRE2_CODE_UNIT_WIDTH == 16
8844 if (scode[1] >= 0xd800 && scode[1] <= 0xdfff) return 0;
8845#endif
8846#endif
8847
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07008848 if (cflags >= REQ_NONE) { c = scode[1]; cflags = REQ_CASELESS; }
Elliott Hughes5b808042021-10-01 10:56:10 -07008849 else if (c != scode[1]) return 0;
8850 break;
8851 }
8852
8853 code += GET(code, 1);
8854 }
8855while (*code == OP_ALT);
8856
8857*flags = cflags;
8858return c;
8859}
8860
8861
8862
8863/*************************************************
8864* Add an entry to the name/number table *
8865*************************************************/
8866
8867/* This function is called between compiling passes to add an entry to the
8868name/number table, maintaining alphabetical order. Checking for permitted
8869and forbidden duplicates has already been done.
8870
8871Arguments:
8872 cb the compile data block
8873 name the name to add
8874 length the length of the name
8875 groupno the group number
8876 tablecount the count of names in the table so far
8877
8878Returns: nothing
8879*/
8880
8881static void
8882add_name_to_table(compile_block *cb, PCRE2_SPTR name, int length,
8883 unsigned int groupno, uint32_t tablecount)
8884{
8885uint32_t i;
8886PCRE2_UCHAR *slot = cb->name_table;
8887
8888for (i = 0; i < tablecount; i++)
8889 {
8890 int crc = memcmp(name, slot+IMM2_SIZE, CU2BYTES(length));
8891 if (crc == 0 && slot[IMM2_SIZE+length] != 0)
8892 crc = -1; /* Current name is a substring */
8893
8894 /* Make space in the table and break the loop for an earlier name. For a
8895 duplicate or later name, carry on. We do this for duplicates so that in the
8896 simple case (when ?(| is not used) they are in order of their numbers. In all
8897 cases they are in the order in which they appear in the pattern. */
8898
8899 if (crc < 0)
8900 {
8901 (void)memmove(slot + cb->name_entry_size, slot,
8902 CU2BYTES((tablecount - i) * cb->name_entry_size));
8903 break;
8904 }
8905
8906 /* Continue the loop for a later or duplicate name */
8907
8908 slot += cb->name_entry_size;
8909 }
8910
8911PUT2(slot, 0, groupno);
8912memcpy(slot + IMM2_SIZE, name, CU2BYTES(length));
8913
8914/* Add a terminating zero and fill the rest of the slot with zeroes so that
8915the memory is all initialized. Otherwise valgrind moans about uninitialized
8916memory when saving serialized compiled patterns. */
8917
8918memset(slot + IMM2_SIZE + length, 0,
8919 CU2BYTES(cb->name_entry_size - length - IMM2_SIZE));
8920}
8921
8922
8923
8924/*************************************************
8925* Skip in parsed pattern *
8926*************************************************/
8927
8928/* This function is called to skip parts of the parsed pattern when finding the
8929length of a lookbehind branch. It is called after (*ACCEPT) and (*FAIL) to find
8930the end of the branch, it is called to skip over an internal lookaround or
8931(DEFINE) group, and it is also called to skip to the end of a class, during
8932which it will never encounter nested groups (but there's no need to have
8933special code for that).
8934
8935When called to find the end of a branch or group, pptr must point to the first
8936meta code inside the branch, not the branch-starting code. In other cases it
8937can point to the item that causes the function to be called.
8938
8939Arguments:
8940 pptr current pointer to skip from
8941 skiptype PSKIP_CLASS when skipping to end of class
8942 PSKIP_ALT when META_ALT ends the skip
8943 PSKIP_KET when only META_KET ends the skip
8944
8945Returns: new value of pptr
8946 NULL if META_END is reached - should never occur
8947 or for an unknown meta value - likewise
8948*/
8949
8950static uint32_t *
8951parsed_skip(uint32_t *pptr, uint32_t skiptype)
8952{
8953uint32_t nestlevel = 0;
8954
8955for (;; pptr++)
8956 {
8957 uint32_t meta = META_CODE(*pptr);
8958
8959 switch(meta)
8960 {
8961 default: /* Just skip over most items */
8962 if (meta < META_END) continue; /* Literal */
8963 break;
8964
8965 /* This should never occur. */
8966
8967 case META_END:
8968 return NULL;
8969
8970 /* The data for these items is variable in length. */
8971
8972 case META_BACKREF: /* Offset is present only if group >= 10 */
8973 if (META_DATA(*pptr) >= 10) pptr += SIZEOFFSET;
8974 break;
8975
8976 case META_ESCAPE: /* A few escapes are followed by data items. */
8977 switch (META_DATA(*pptr))
8978 {
8979 case ESC_P:
8980 case ESC_p:
8981 pptr += 1;
8982 break;
8983
8984 case ESC_g:
8985 case ESC_k:
8986 pptr += 1 + SIZEOFFSET;
8987 break;
8988 }
8989 break;
8990
8991 case META_MARK: /* Add the length of the name. */
8992 case META_COMMIT_ARG:
8993 case META_PRUNE_ARG:
8994 case META_SKIP_ARG:
8995 case META_THEN_ARG:
8996 pptr += pptr[1];
8997 break;
8998
8999 /* These are the "active" items in this loop. */
9000
9001 case META_CLASS_END:
9002 if (skiptype == PSKIP_CLASS) return pptr;
9003 break;
9004
9005 case META_ATOMIC:
9006 case META_CAPTURE:
9007 case META_COND_ASSERT:
9008 case META_COND_DEFINE:
9009 case META_COND_NAME:
9010 case META_COND_NUMBER:
9011 case META_COND_RNAME:
9012 case META_COND_RNUMBER:
9013 case META_COND_VERSION:
9014 case META_LOOKAHEAD:
9015 case META_LOOKAHEADNOT:
9016 case META_LOOKAHEAD_NA:
9017 case META_LOOKBEHIND:
9018 case META_LOOKBEHINDNOT:
9019 case META_LOOKBEHIND_NA:
9020 case META_NOCAPTURE:
9021 case META_SCRIPT_RUN:
9022 nestlevel++;
9023 break;
9024
9025 case META_ALT:
9026 if (nestlevel == 0 && skiptype == PSKIP_ALT) return pptr;
9027 break;
9028
9029 case META_KET:
9030 if (nestlevel == 0) return pptr;
9031 nestlevel--;
9032 break;
9033 }
9034
9035 /* The extra data item length for each meta is in a table. */
9036
9037 meta = (meta >> 16) & 0x7fff;
9038 if (meta >= sizeof(meta_extra_lengths)) return NULL;
9039 pptr += meta_extra_lengths[meta];
9040 }
9041/* Control never reaches here */
9042return pptr;
9043}
9044
9045
9046
9047/*************************************************
9048* Find length of a parsed group *
9049*************************************************/
9050
9051/* This is called for nested groups within a branch of a lookbehind whose
9052length is being computed. If all the branches in the nested group have the same
9053length, that is OK. On entry, the pointer must be at the first element after
9054the group initializing code. On exit it points to OP_KET. Caching is used to
9055improve processing speed when the same capturing group occurs many times.
9056
9057Arguments:
9058 pptrptr pointer to pointer in the parsed pattern
9059 isinline FALSE if a reference or recursion; TRUE for inline group
9060 errcodeptr pointer to the errorcode
9061 lcptr pointer to the loop counter
9062 group number of captured group or -1 for a non-capturing group
9063 recurses chain of recurse_check to catch mutual recursion
9064 cb pointer to the compile data
9065
9066Returns: the group length or a negative number
9067*/
9068
9069static int
9070get_grouplength(uint32_t **pptrptr, BOOL isinline, int *errcodeptr, int *lcptr,
9071 int group, parsed_recurse_check *recurses, compile_block *cb)
9072{
9073int branchlength;
9074int grouplength = -1;
9075
9076/* The cache can be used only if there is no possibility of there being two
9077groups with the same number. We do not need to set the end pointer for a group
9078that is being processed as a back reference or recursion, but we must do so for
9079an inline group. */
9080
9081if (group > 0 && (cb->external_flags & PCRE2_DUPCAPUSED) == 0)
9082 {
9083 uint32_t groupinfo = cb->groupinfo[group];
9084 if ((groupinfo & GI_NOT_FIXED_LENGTH) != 0) return -1;
9085 if ((groupinfo & GI_SET_FIXED_LENGTH) != 0)
9086 {
9087 if (isinline) *pptrptr = parsed_skip(*pptrptr, PSKIP_KET);
9088 return groupinfo & GI_FIXED_LENGTH_MASK;
9089 }
9090 }
9091
9092/* Scan the group. In this case we find the end pointer of necessity. */
9093
9094for(;;)
9095 {
9096 branchlength = get_branchlength(pptrptr, errcodeptr, lcptr, recurses, cb);
9097 if (branchlength < 0) goto ISNOTFIXED;
9098 if (grouplength == -1) grouplength = branchlength;
9099 else if (grouplength != branchlength) goto ISNOTFIXED;
9100 if (**pptrptr == META_KET) break;
9101 *pptrptr += 1; /* Skip META_ALT */
9102 }
9103
9104if (group > 0)
9105 cb->groupinfo[group] |= (uint32_t)(GI_SET_FIXED_LENGTH | grouplength);
9106return grouplength;
9107
9108ISNOTFIXED:
9109if (group > 0) cb->groupinfo[group] |= GI_NOT_FIXED_LENGTH;
9110return -1;
9111}
9112
9113
9114
9115/*************************************************
9116* Find length of a parsed branch *
9117*************************************************/
9118
9119/* Return a fixed length for a branch in a lookbehind, giving an error if the
9120length is not fixed. On entry, *pptrptr points to the first element inside the
9121branch. On exit it is set to point to the ALT or KET.
9122
9123Arguments:
9124 pptrptr pointer to pointer in the parsed pattern
9125 errcodeptr pointer to error code
9126 lcptr pointer to loop counter
9127 recurses chain of recurse_check to catch mutual recursion
9128 cb pointer to compile block
9129
9130Returns: the length, or a negative value on error
9131*/
9132
9133static int
9134get_branchlength(uint32_t **pptrptr, int *errcodeptr, int *lcptr,
9135 parsed_recurse_check *recurses, compile_block *cb)
9136{
9137int branchlength = 0;
9138int grouplength;
9139uint32_t lastitemlength = 0;
9140uint32_t *pptr = *pptrptr;
9141PCRE2_SIZE offset;
9142parsed_recurse_check this_recurse;
9143
9144/* A large and/or complex regex can take too long to process. This can happen
9145more often when (?| groups are present in the pattern because their length
9146cannot be cached. */
9147
9148if ((*lcptr)++ > 2000)
9149 {
9150 *errcodeptr = ERR35; /* Lookbehind is too complicated */
9151 return -1;
9152 }
9153
9154/* Scan the branch, accumulating the length. */
9155
9156for (;; pptr++)
9157 {
9158 parsed_recurse_check *r;
9159 uint32_t *gptr, *gptrend;
9160 uint32_t escape;
9161 uint32_t group = 0;
9162 uint32_t itemlength = 0;
9163
9164 if (*pptr < META_END)
9165 {
9166 itemlength = 1;
9167 }
9168
9169 else switch (META_CODE(*pptr))
9170 {
9171 case META_KET:
9172 case META_ALT:
9173 goto EXIT;
9174
9175 /* (*ACCEPT) and (*FAIL) terminate the branch, but we must skip to the
9176 actual termination. */
9177
9178 case META_ACCEPT:
9179 case META_FAIL:
9180 pptr = parsed_skip(pptr, PSKIP_ALT);
9181 if (pptr == NULL) goto PARSED_SKIP_FAILED;
9182 goto EXIT;
9183
9184 case META_MARK:
9185 case META_COMMIT_ARG:
9186 case META_PRUNE_ARG:
9187 case META_SKIP_ARG:
9188 case META_THEN_ARG:
9189 pptr += pptr[1] + 1;
9190 break;
9191
9192 case META_CIRCUMFLEX:
9193 case META_COMMIT:
9194 case META_DOLLAR:
9195 case META_PRUNE:
9196 case META_SKIP:
9197 case META_THEN:
9198 break;
9199
9200 case META_OPTIONS:
9201 pptr += 1;
9202 break;
9203
9204 case META_BIGVALUE:
9205 itemlength = 1;
9206 pptr += 1;
9207 break;
9208
9209 case META_CLASS:
9210 case META_CLASS_NOT:
9211 itemlength = 1;
9212 pptr = parsed_skip(pptr, PSKIP_CLASS);
9213 if (pptr == NULL) goto PARSED_SKIP_FAILED;
9214 break;
9215
9216 case META_CLASS_EMPTY_NOT:
9217 case META_DOT:
9218 itemlength = 1;
9219 break;
9220
9221 case META_CALLOUT_NUMBER:
9222 pptr += 3;
9223 break;
9224
9225 case META_CALLOUT_STRING:
9226 pptr += 3 + SIZEOFFSET;
9227 break;
9228
9229 /* Only some escapes consume a character. Of those, \R and \X are never
9230 allowed because they might match more than character. \C is allowed only in
9231 32-bit and non-UTF 8/16-bit modes. */
9232
9233 case META_ESCAPE:
9234 escape = META_DATA(*pptr);
9235 if (escape == ESC_R || escape == ESC_X) return -1;
9236 if (escape > ESC_b && escape < ESC_Z)
9237 {
9238#if PCRE2_CODE_UNIT_WIDTH != 32
9239 if ((cb->external_options & PCRE2_UTF) != 0 && escape == ESC_C)
9240 {
9241 *errcodeptr = ERR36;
9242 return -1;
9243 }
9244#endif
9245 itemlength = 1;
9246 if (escape == ESC_p || escape == ESC_P) pptr++; /* Skip prop data */
9247 }
9248 break;
9249
9250 /* Lookaheads do not contribute to the length of this branch, but they may
9251 contain lookbehinds within them whose lengths need to be set. */
9252
9253 case META_LOOKAHEAD:
9254 case META_LOOKAHEADNOT:
9255 case META_LOOKAHEAD_NA:
9256 *errcodeptr = check_lookbehinds(pptr + 1, &pptr, recurses, cb, lcptr);
9257 if (*errcodeptr != 0) return -1;
9258
9259 /* Ignore any qualifiers that follow a lookahead assertion. */
9260
9261 switch (pptr[1])
9262 {
9263 case META_ASTERISK:
9264 case META_ASTERISK_PLUS:
9265 case META_ASTERISK_QUERY:
9266 case META_PLUS:
9267 case META_PLUS_PLUS:
9268 case META_PLUS_QUERY:
9269 case META_QUERY:
9270 case META_QUERY_PLUS:
9271 case META_QUERY_QUERY:
9272 pptr++;
9273 break;
9274
9275 case META_MINMAX:
9276 case META_MINMAX_PLUS:
9277 case META_MINMAX_QUERY:
9278 pptr += 3;
9279 break;
9280
9281 default:
9282 break;
9283 }
9284 break;
9285
9286 /* A nested lookbehind does not contribute any length to this lookbehind,
9287 but must itself be checked and have its lengths set. */
9288
9289 case META_LOOKBEHIND:
9290 case META_LOOKBEHINDNOT:
9291 case META_LOOKBEHIND_NA:
9292 if (!set_lookbehind_lengths(&pptr, errcodeptr, lcptr, recurses, cb))
9293 return -1;
9294 break;
9295
9296 /* Back references and recursions are handled by very similar code. At this
9297 stage, the names generated in the parsing pass are available, but the main
9298 name table has not yet been created. So for the named varieties, scan the
9299 list of names in order to get the number of the first one in the pattern,
9300 and whether or not this name is duplicated. */
9301
9302 case META_BACKREF_BYNAME:
9303 if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0)
9304 goto ISNOTFIXED;
9305 /* Fall through */
9306
9307 case META_RECURSE_BYNAME:
9308 {
9309 int i;
9310 PCRE2_SPTR name;
9311 BOOL is_dupname = FALSE;
9312 named_group *ng = cb->named_groups;
9313 uint32_t meta_code = META_CODE(*pptr);
9314 uint32_t length = *(++pptr);
9315
9316 GETPLUSOFFSET(offset, pptr);
9317 name = cb->start_pattern + offset;
9318 for (i = 0; i < cb->names_found; i++, ng++)
9319 {
9320 if (length == ng->length && PRIV(strncmp)(name, ng->name, length) == 0)
9321 {
9322 group = ng->number;
9323 is_dupname = ng->isdup;
9324 break;
9325 }
9326 }
9327
9328 if (group == 0)
9329 {
9330 *errcodeptr = ERR15; /* Non-existent subpattern */
9331 cb->erroroffset = offset;
9332 return -1;
9333 }
9334
9335 /* A numerical back reference can be fixed length if duplicate capturing
9336 groups are not being used. A non-duplicate named back reference can also
9337 be handled. */
9338
9339 if (meta_code == META_RECURSE_BYNAME ||
9340 (!is_dupname && (cb->external_flags & PCRE2_DUPCAPUSED) == 0))
9341 goto RECURSE_OR_BACKREF_LENGTH; /* Handle as a numbered version. */
9342 }
9343 goto ISNOTFIXED; /* Duplicate name or number */
9344
9345 /* The offset values for back references < 10 are in a separate vector
9346 because otherwise they would use more than two parsed pattern elements on
9347 64-bit systems. */
9348
9349 case META_BACKREF:
9350 if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0 ||
9351 (cb->external_flags & PCRE2_DUPCAPUSED) != 0)
9352 goto ISNOTFIXED;
9353 group = META_DATA(*pptr);
9354 if (group < 10)
9355 {
9356 offset = cb->small_ref_offset[group];
9357 goto RECURSE_OR_BACKREF_LENGTH;
9358 }
9359
9360 /* Fall through */
9361 /* For groups >= 10 - picking up group twice does no harm. */
9362
9363 /* A true recursion implies not fixed length, but a subroutine call may
9364 be OK. Back reference "recursions" are also failed. */
9365
9366 case META_RECURSE:
9367 group = META_DATA(*pptr);
9368 GETPLUSOFFSET(offset, pptr);
9369
9370 RECURSE_OR_BACKREF_LENGTH:
9371 if (group > cb->bracount)
9372 {
9373 cb->erroroffset = offset;
9374 *errcodeptr = ERR15; /* Non-existent subpattern */
9375 return -1;
9376 }
9377 if (group == 0) goto ISNOTFIXED; /* Local recursion */
9378 for (gptr = cb->parsed_pattern; *gptr != META_END; gptr++)
9379 {
9380 if (META_CODE(*gptr) == META_BIGVALUE) gptr++;
9381 else if (*gptr == (META_CAPTURE | group)) break;
9382 }
9383
9384 /* We must start the search for the end of the group at the first meta code
9385 inside the group. Otherwise it will be treated as an enclosed group. */
9386
9387 gptrend = parsed_skip(gptr + 1, PSKIP_KET);
9388 if (gptrend == NULL) goto PARSED_SKIP_FAILED;
9389 if (pptr > gptr && pptr < gptrend) goto ISNOTFIXED; /* Local recursion */
9390 for (r = recurses; r != NULL; r = r->prev) if (r->groupptr == gptr) break;
9391 if (r != NULL) goto ISNOTFIXED; /* Mutual recursion */
9392 this_recurse.prev = recurses;
9393 this_recurse.groupptr = gptr;
9394
9395 /* We do not need to know the position of the end of the group, that is,
9396 gptr is not used after the call to get_grouplength(). Setting the second
9397 argument FALSE stops it scanning for the end when the length can be found
9398 in the cache. */
9399
9400 gptr++;
9401 grouplength = get_grouplength(&gptr, FALSE, errcodeptr, lcptr, group,
9402 &this_recurse, cb);
9403 if (grouplength < 0)
9404 {
9405 if (*errcodeptr == 0) goto ISNOTFIXED;
9406 return -1; /* Error already set */
9407 }
9408 itemlength = grouplength;
9409 break;
9410
9411 /* A (DEFINE) group is never obeyed inline and so it does not contribute to
9412 the length of this branch. Skip from the following item to the next
9413 unpaired ket. */
9414
9415 case META_COND_DEFINE:
9416 pptr = parsed_skip(pptr + 1, PSKIP_KET);
9417 break;
9418
9419 /* Check other nested groups - advance past the initial data for each type
9420 and then seek a fixed length with get_grouplength(). */
9421
9422 case META_COND_NAME:
9423 case META_COND_NUMBER:
9424 case META_COND_RNAME:
9425 case META_COND_RNUMBER:
9426 pptr += 2 + SIZEOFFSET;
9427 goto CHECK_GROUP;
9428
9429 case META_COND_ASSERT:
9430 pptr += 1;
9431 goto CHECK_GROUP;
9432
9433 case META_COND_VERSION:
9434 pptr += 4;
9435 goto CHECK_GROUP;
9436
9437 case META_CAPTURE:
9438 group = META_DATA(*pptr);
9439 /* Fall through */
9440
9441 case META_ATOMIC:
9442 case META_NOCAPTURE:
9443 case META_SCRIPT_RUN:
9444 pptr++;
9445 CHECK_GROUP:
9446 grouplength = get_grouplength(&pptr, TRUE, errcodeptr, lcptr, group,
9447 recurses, cb);
9448 if (grouplength < 0) return -1;
9449 itemlength = grouplength;
9450 break;
9451
9452 /* Exact repetition is OK; variable repetition is not. A repetition of zero
9453 must subtract the length that has already been added. */
9454
9455 case META_MINMAX:
9456 case META_MINMAX_PLUS:
9457 case META_MINMAX_QUERY:
9458 if (pptr[1] == pptr[2])
9459 {
9460 switch(pptr[1])
9461 {
9462 case 0:
9463 branchlength -= lastitemlength;
9464 break;
9465
9466 case 1:
9467 itemlength = 0;
9468 break;
9469
9470 default: /* Check for integer overflow */
9471 if (lastitemlength != 0 && /* Should not occur, but just in case */
9472 INT_MAX/lastitemlength < pptr[1] - 1)
9473 {
9474 *errcodeptr = ERR87; /* Integer overflow; lookbehind too big */
9475 return -1;
9476 }
9477 itemlength = (pptr[1] - 1) * lastitemlength;
9478 break;
9479 }
9480 pptr += 2;
9481 break;
9482 }
9483 /* Fall through */
9484
9485 /* Any other item means this branch does not have a fixed length. */
9486
9487 default:
9488 ISNOTFIXED:
9489 *errcodeptr = ERR25; /* Not fixed length */
9490 return -1;
9491 }
9492
9493 /* Add the item length to the branchlength, checking for integer overflow and
9494 for the branch length exceeding the limit. */
9495
9496 if (INT_MAX - branchlength < (int)itemlength ||
9497 (branchlength += itemlength) > LOOKBEHIND_MAX)
9498 {
9499 *errcodeptr = ERR87;
9500 return -1;
9501 }
9502
9503 /* Save this item length for use if the next item is a quantifier. */
9504
9505 lastitemlength = itemlength;
9506 }
9507
9508EXIT:
9509*pptrptr = pptr;
9510return branchlength;
9511
9512PARSED_SKIP_FAILED:
9513*errcodeptr = ERR90;
9514return -1;
9515}
9516
9517
9518
9519/*************************************************
9520* Set lengths in a lookbehind *
9521*************************************************/
9522
9523/* This function is called for each lookbehind, to set the lengths in its
9524branches. An error occurs if any branch does not have a fixed length that is
9525less than the maximum (65535). On exit, the pointer must be left on the final
9526ket.
9527
9528The function also maintains the max_lookbehind value. Any lookbehind branch
9529that contains a nested lookbehind may actually look further back than the
9530length of the branch. The additional amount is passed back from
9531get_branchlength() as an "extra" value.
9532
9533Arguments:
9534 pptrptr pointer to pointer in the parsed pattern
9535 errcodeptr pointer to error code
9536 lcptr pointer to loop counter
9537 recurses chain of recurse_check to catch mutual recursion
9538 cb pointer to compile block
9539
9540Returns: TRUE if all is well
9541 FALSE otherwise, with error code and offset set
9542*/
9543
9544static BOOL
9545set_lookbehind_lengths(uint32_t **pptrptr, int *errcodeptr, int *lcptr,
9546 parsed_recurse_check *recurses, compile_block *cb)
9547{
9548PCRE2_SIZE offset;
9549int branchlength;
9550uint32_t *bptr = *pptrptr;
9551
9552READPLUSOFFSET(offset, bptr); /* Offset for error messages */
9553*pptrptr += SIZEOFFSET;
9554
9555do
9556 {
9557 *pptrptr += 1;
9558 branchlength = get_branchlength(pptrptr, errcodeptr, lcptr, recurses, cb);
9559 if (branchlength < 0)
9560 {
9561 /* The errorcode and offset may already be set from a nested lookbehind. */
9562 if (*errcodeptr == 0) *errcodeptr = ERR25;
9563 if (cb->erroroffset == PCRE2_UNSET) cb->erroroffset = offset;
9564 return FALSE;
9565 }
9566 if (branchlength > cb->max_lookbehind) cb->max_lookbehind = branchlength;
9567 *bptr |= branchlength; /* branchlength never more than 65535 */
9568 bptr = *pptrptr;
9569 }
9570while (*bptr == META_ALT);
9571
9572return TRUE;
9573}
9574
9575
9576
9577/*************************************************
9578* Check parsed pattern lookbehinds *
9579*************************************************/
9580
9581/* This function is called at the end of parsing a pattern if any lookbehinds
9582were encountered. It scans the parsed pattern for them, calling
9583set_lookbehind_lengths() for each one. At the start, the errorcode is zero and
9584 the error offset is marked unset. This enables the functions above not to
9585override settings from deeper nestings.
9586
9587This function is called recursively from get_branchlength() for lookaheads in
9588order to process any lookbehinds that they may contain. It stops when it hits a
9589non-nested closing parenthesis in this case, returning a pointer to it.
9590
9591Arguments
9592 pptr points to where to start (start of pattern or start of lookahead)
9593 retptr if not NULL, return the ket pointer here
9594 recurses chain of recurse_check to catch mutual recursion
9595 cb points to the compile block
9596 lcptr points to loop counter
9597
9598Returns: 0 on success, or an errorcode (cb->erroroffset will be set)
9599*/
9600
9601static int
9602check_lookbehinds(uint32_t *pptr, uint32_t **retptr,
9603 parsed_recurse_check *recurses, compile_block *cb, int *lcptr)
9604{
9605int errorcode = 0;
9606int nestlevel = 0;
9607
9608cb->erroroffset = PCRE2_UNSET;
9609
9610for (; *pptr != META_END; pptr++)
9611 {
9612 if (*pptr < META_END) continue; /* Literal */
9613
9614 switch (META_CODE(*pptr))
9615 {
9616 default:
9617 return ERR70; /* Unrecognized meta code */
9618
9619 case META_ESCAPE:
9620 if (*pptr - META_ESCAPE == ESC_P || *pptr - META_ESCAPE == ESC_p)
9621 pptr += 1;
9622 break;
9623
9624 case META_KET:
9625 if (--nestlevel < 0)
9626 {
9627 if (retptr != NULL) *retptr = pptr;
9628 return 0;
9629 }
9630 break;
9631
9632 case META_ATOMIC:
9633 case META_CAPTURE:
9634 case META_COND_ASSERT:
9635 case META_LOOKAHEAD:
9636 case META_LOOKAHEADNOT:
9637 case META_LOOKAHEAD_NA:
9638 case META_NOCAPTURE:
9639 case META_SCRIPT_RUN:
9640 nestlevel++;
9641 break;
9642
9643 case META_ACCEPT:
9644 case META_ALT:
9645 case META_ASTERISK:
9646 case META_ASTERISK_PLUS:
9647 case META_ASTERISK_QUERY:
9648 case META_BACKREF:
9649 case META_CIRCUMFLEX:
9650 case META_CLASS:
9651 case META_CLASS_EMPTY:
9652 case META_CLASS_EMPTY_NOT:
9653 case META_CLASS_END:
9654 case META_CLASS_NOT:
9655 case META_COMMIT:
9656 case META_DOLLAR:
9657 case META_DOT:
9658 case META_FAIL:
9659 case META_PLUS:
9660 case META_PLUS_PLUS:
9661 case META_PLUS_QUERY:
9662 case META_PRUNE:
9663 case META_QUERY:
9664 case META_QUERY_PLUS:
9665 case META_QUERY_QUERY:
9666 case META_RANGE_ESCAPED:
9667 case META_RANGE_LITERAL:
9668 case META_SKIP:
9669 case META_THEN:
9670 break;
9671
9672 case META_RECURSE:
9673 pptr += SIZEOFFSET;
9674 break;
9675
9676 case META_BACKREF_BYNAME:
9677 case META_RECURSE_BYNAME:
9678 pptr += 1 + SIZEOFFSET;
9679 break;
9680
9681 case META_COND_DEFINE:
9682 pptr += SIZEOFFSET;
9683 nestlevel++;
9684 break;
9685
9686 case META_COND_NAME:
9687 case META_COND_NUMBER:
9688 case META_COND_RNAME:
9689 case META_COND_RNUMBER:
9690 pptr += 1 + SIZEOFFSET;
9691 nestlevel++;
9692 break;
9693
9694 case META_COND_VERSION:
9695 pptr += 3;
9696 nestlevel++;
9697 break;
9698
9699 case META_CALLOUT_STRING:
9700 pptr += 3 + SIZEOFFSET;
9701 break;
9702
9703 case META_BIGVALUE:
9704 case META_OPTIONS:
9705 case META_POSIX:
9706 case META_POSIX_NEG:
9707 pptr += 1;
9708 break;
9709
9710 case META_MINMAX:
9711 case META_MINMAX_QUERY:
9712 case META_MINMAX_PLUS:
9713 pptr += 2;
9714 break;
9715
9716 case META_CALLOUT_NUMBER:
9717 pptr += 3;
9718 break;
9719
9720 case META_MARK:
9721 case META_COMMIT_ARG:
9722 case META_PRUNE_ARG:
9723 case META_SKIP_ARG:
9724 case META_THEN_ARG:
9725 pptr += 1 + pptr[1];
9726 break;
9727
9728 case META_LOOKBEHIND:
9729 case META_LOOKBEHINDNOT:
9730 case META_LOOKBEHIND_NA:
9731 if (!set_lookbehind_lengths(&pptr, &errorcode, lcptr, recurses, cb))
9732 return errorcode;
9733 break;
9734 }
9735 }
9736
9737return 0;
9738}
9739
9740
9741
9742/*************************************************
9743* External function to compile a pattern *
9744*************************************************/
9745
9746/* This function reads a regular expression in the form of a string and returns
9747a pointer to a block of store holding a compiled version of the expression.
9748
9749Arguments:
9750 pattern the regular expression
9751 patlen the length of the pattern, or PCRE2_ZERO_TERMINATED
9752 options option bits
9753 errorptr pointer to errorcode
9754 erroroffset pointer to error offset
9755 ccontext points to a compile context or is NULL
9756
9757Returns: pointer to compiled data block, or NULL on error,
9758 with errorcode and erroroffset set
9759*/
9760
9761PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
9762pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE patlen, uint32_t options,
9763 int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext)
9764{
9765BOOL utf; /* Set TRUE for UTF mode */
9766BOOL ucp; /* Set TRUE for UCP mode */
9767BOOL has_lookbehind = FALSE; /* Set TRUE if a lookbehind is found */
9768BOOL zero_terminated; /* Set TRUE for zero-terminated pattern */
9769pcre2_real_code *re = NULL; /* What we will return */
9770compile_block cb; /* "Static" compile-time data */
9771const uint8_t *tables; /* Char tables base pointer */
9772
9773PCRE2_UCHAR *code; /* Current pointer in compiled code */
9774PCRE2_SPTR codestart; /* Start of compiled code */
9775PCRE2_SPTR ptr; /* Current pointer in pattern */
9776uint32_t *pptr; /* Current pointer in parsed pattern */
9777
9778PCRE2_SIZE length = 1; /* Allow for final END opcode */
9779PCRE2_SIZE usedlength; /* Actual length used */
9780PCRE2_SIZE re_blocksize; /* Size of memory block */
9781PCRE2_SIZE big32count = 0; /* 32-bit literals >= 0x80000000 */
9782PCRE2_SIZE parsed_size_needed; /* Needed for parsed pattern */
9783
Elliott Hughes4e19c8e2022-04-15 15:11:02 -07009784uint32_t firstcuflags, reqcuflags; /* Type of first/req code unit */
Elliott Hughes5b808042021-10-01 10:56:10 -07009785uint32_t firstcu, reqcu; /* Value of first/req code unit */
9786uint32_t setflags = 0; /* NL and BSR set flags */
9787
9788uint32_t skipatstart; /* When checking (*UTF) etc */
9789uint32_t limit_heap = UINT32_MAX;
9790uint32_t limit_match = UINT32_MAX; /* Unset match limits */
9791uint32_t limit_depth = UINT32_MAX;
9792
9793int newline = 0; /* Unset; can be set by the pattern */
9794int bsr = 0; /* Unset; can be set by the pattern */
9795int errorcode = 0; /* Initialize to avoid compiler warn */
9796int regexrc; /* Return from compile */
9797
9798uint32_t i; /* Local loop counter */
9799
9800/* Comments at the head of this file explain about these variables. */
9801
9802uint32_t stack_groupinfo[GROUPINFO_DEFAULT_SIZE];
9803uint32_t stack_parsed_pattern[PARSED_PATTERN_DEFAULT_SIZE];
9804named_group named_groups[NAMED_GROUP_LIST_SIZE];
9805
9806/* The workspace is used in different ways in the different compiling phases.
9807It needs to be 16-bit aligned for the preliminary parsing scan. */
9808
9809uint32_t c16workspace[C16_WORK_SIZE];
9810PCRE2_UCHAR *cworkspace = (PCRE2_UCHAR *)c16workspace;
9811
9812
9813/* -------------- Check arguments and set up the pattern ----------------- */
9814
9815/* There must be error code and offset pointers. */
9816
9817if (errorptr == NULL || erroroffset == NULL) return NULL;
9818*errorptr = ERR0;
9819*erroroffset = 0;
9820
9821/* There must be a pattern! */
9822
9823if (pattern == NULL)
9824 {
9825 *errorptr = ERR16;
9826 return NULL;
9827 }
9828
9829/* A NULL compile context means "use a default context" */
9830
9831if (ccontext == NULL)
9832 ccontext = (pcre2_compile_context *)(&PRIV(default_compile_context));
9833
9834/* PCRE2_MATCH_INVALID_UTF implies UTF */
9835
9836if ((options & PCRE2_MATCH_INVALID_UTF) != 0) options |= PCRE2_UTF;
9837
9838/* Check that all undefined public option bits are zero. */
9839
9840if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0 ||
9841 (ccontext->extra_options & ~PUBLIC_COMPILE_EXTRA_OPTIONS) != 0)
9842 {
9843 *errorptr = ERR17;
9844 return NULL;
9845 }
9846
9847if ((options & PCRE2_LITERAL) != 0 &&
9848 ((options & ~PUBLIC_LITERAL_COMPILE_OPTIONS) != 0 ||
9849 (ccontext->extra_options & ~PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS) != 0))
9850 {
9851 *errorptr = ERR92;
9852 return NULL;
9853 }
9854
9855/* A zero-terminated pattern is indicated by the special length value
9856PCRE2_ZERO_TERMINATED. Check for an overlong pattern. */
9857
9858if ((zero_terminated = (patlen == PCRE2_ZERO_TERMINATED)))
9859 patlen = PRIV(strlen)(pattern);
9860
9861if (patlen > ccontext->max_pattern_length)
9862 {
9863 *errorptr = ERR88;
9864 return NULL;
9865 }
9866
9867/* From here on, all returns from this function should end up going via the
9868EXIT label. */
9869
9870
9871/* ------------ Initialize the "static" compile data -------------- */
9872
9873tables = (ccontext->tables != NULL)? ccontext->tables : PRIV(default_tables);
9874
9875cb.lcc = tables + lcc_offset; /* Individual */
9876cb.fcc = tables + fcc_offset; /* character */
9877cb.cbits = tables + cbits_offset; /* tables */
9878cb.ctypes = tables + ctypes_offset;
9879
9880cb.assert_depth = 0;
9881cb.bracount = 0;
9882cb.cx = ccontext;
9883cb.dupnames = FALSE;
9884cb.end_pattern = pattern + patlen;
9885cb.erroroffset = 0;
9886cb.external_flags = 0;
9887cb.external_options = options;
9888cb.groupinfo = stack_groupinfo;
9889cb.had_recurse = FALSE;
9890cb.lastcapture = 0;
9891cb.max_lookbehind = 0;
9892cb.name_entry_size = 0;
9893cb.name_table = NULL;
9894cb.named_groups = named_groups;
9895cb.named_group_list_size = NAMED_GROUP_LIST_SIZE;
9896cb.names_found = 0;
9897cb.open_caps = NULL;
9898cb.parens_depth = 0;
9899cb.parsed_pattern = stack_parsed_pattern;
9900cb.req_varyopt = 0;
9901cb.start_code = cworkspace;
9902cb.start_pattern = pattern;
9903cb.start_workspace = cworkspace;
9904cb.workspace_size = COMPILE_WORK_SIZE;
9905
9906/* Maximum back reference and backref bitmap. The bitmap records up to 31 back
9907references to help in deciding whether (.*) can be treated as anchored or not.
9908*/
9909
9910cb.top_backref = 0;
9911cb.backref_map = 0;
9912
9913/* Escape sequences \1 to \9 are always back references, but as they are only
9914two characters long, only two elements can be used in the parsed_pattern
9915vector. The first contains the reference, and we'd like to use the second to
9916record the offset in the pattern, so that forward references to non-existent
9917groups can be diagnosed later with an offset. However, on 64-bit systems,
9918PCRE2_SIZE won't fit. Instead, we have a vector of offsets for the first
9919occurrence of \1 to \9, indexed by the second parsed_pattern value. All other
9920references have enough space for the offset to be put into the parsed pattern.
9921*/
9922
9923for (i = 0; i < 10; i++) cb.small_ref_offset[i] = PCRE2_UNSET;
9924
9925
9926/* --------------- Start looking at the pattern --------------- */
9927
9928/* Unless PCRE2_LITERAL is set, check for global one-time option settings at
9929the start of the pattern, and remember the offset to the actual regex. With
9930valgrind support, make the terminator of a zero-terminated pattern
9931inaccessible. This catches bugs that would otherwise only show up for
9932non-zero-terminated patterns. */
9933
9934#ifdef SUPPORT_VALGRIND
9935if (zero_terminated) VALGRIND_MAKE_MEM_NOACCESS(pattern + patlen, CU2BYTES(1));
9936#endif
9937
9938ptr = pattern;
9939skipatstart = 0;
9940
9941if ((options & PCRE2_LITERAL) == 0)
9942 {
9943 while (patlen - skipatstart >= 2 &&
9944 ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
9945 ptr[skipatstart+1] == CHAR_ASTERISK)
9946 {
9947 for (i = 0; i < sizeof(pso_list)/sizeof(pso); i++)
9948 {
9949 uint32_t c, pp;
9950 pso *p = pso_list + i;
9951
9952 if (patlen - skipatstart - 2 >= p->length &&
9953 PRIV(strncmp_c8)(ptr + skipatstart + 2, (char *)(p->name),
9954 p->length) == 0)
9955 {
9956 skipatstart += p->length + 2;
9957 switch(p->type)
9958 {
9959 case PSO_OPT:
9960 cb.external_options |= p->value;
9961 break;
9962
9963 case PSO_FLG:
9964 setflags |= p->value;
9965 break;
9966
9967 case PSO_NL:
9968 newline = p->value;
9969 setflags |= PCRE2_NL_SET;
9970 break;
9971
9972 case PSO_BSR:
9973 bsr = p->value;
9974 setflags |= PCRE2_BSR_SET;
9975 break;
9976
9977 case PSO_LIMM:
9978 case PSO_LIMD:
9979 case PSO_LIMH:
9980 c = 0;
9981 pp = skipatstart;
9982 if (!IS_DIGIT(ptr[pp]))
9983 {
9984 errorcode = ERR60;
9985 ptr += pp;
9986 goto HAD_EARLY_ERROR;
9987 }
9988 while (IS_DIGIT(ptr[pp]))
9989 {
9990 if (c > UINT32_MAX / 10 - 1) break; /* Integer overflow */
9991 c = c*10 + (ptr[pp++] - CHAR_0);
9992 }
9993 if (ptr[pp++] != CHAR_RIGHT_PARENTHESIS)
9994 {
9995 errorcode = ERR60;
9996 ptr += pp;
9997 goto HAD_EARLY_ERROR;
9998 }
9999 if (p->type == PSO_LIMH) limit_heap = c;
10000 else if (p->type == PSO_LIMM) limit_match = c;
10001 else limit_depth = c;
10002 skipatstart += pp - skipatstart;
10003 break;
10004 }
10005 break; /* Out of the table scan loop */
10006 }
10007 }
10008 if (i >= sizeof(pso_list)/sizeof(pso)) break; /* Out of pso loop */
10009 }
10010 }
10011
10012/* End of pattern-start options; advance to start of real regex. */
10013
10014ptr += skipatstart;
10015
10016/* Can't support UTF or UCP if PCRE2 was built without Unicode support. */
10017
10018#ifndef SUPPORT_UNICODE
10019if ((cb.external_options & (PCRE2_UTF|PCRE2_UCP)) != 0)
10020 {
10021 errorcode = ERR32;
10022 goto HAD_EARLY_ERROR;
10023 }
10024#endif
10025
10026/* Check UTF. We have the original options in 'options', with that value as
10027modified by (*UTF) etc in cb->external_options. The extra option
10028PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not permitted in UTF-16 mode because the
10029surrogate code points cannot be represented in UTF-16. */
10030
10031utf = (cb.external_options & PCRE2_UTF) != 0;
10032if (utf)
10033 {
10034 if ((options & PCRE2_NEVER_UTF) != 0)
10035 {
10036 errorcode = ERR74;
10037 goto HAD_EARLY_ERROR;
10038 }
10039 if ((options & PCRE2_NO_UTF_CHECK) == 0 &&
10040 (errorcode = PRIV(valid_utf)(pattern, patlen, erroroffset)) != 0)
10041 goto HAD_ERROR; /* Offset was set by valid_utf() */
10042
10043#if PCRE2_CODE_UNIT_WIDTH == 16
10044 if ((ccontext->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) != 0)
10045 {
10046 errorcode = ERR91;
10047 goto HAD_EARLY_ERROR;
10048 }
10049#endif
10050 }
10051
10052/* Check UCP lockout. */
10053
10054ucp = (cb.external_options & PCRE2_UCP) != 0;
10055if (ucp && (cb.external_options & PCRE2_NEVER_UCP) != 0)
10056 {
10057 errorcode = ERR75;
10058 goto HAD_EARLY_ERROR;
10059 }
10060
10061/* Process the BSR setting. */
10062
10063if (bsr == 0) bsr = ccontext->bsr_convention;
10064
10065/* Process the newline setting. */
10066
10067if (newline == 0) newline = ccontext->newline_convention;
10068cb.nltype = NLTYPE_FIXED;
10069switch(newline)
10070 {
10071 case PCRE2_NEWLINE_CR:
10072 cb.nllen = 1;
10073 cb.nl[0] = CHAR_CR;
10074 break;
10075
10076 case PCRE2_NEWLINE_LF:
10077 cb.nllen = 1;
10078 cb.nl[0] = CHAR_NL;
10079 break;
10080
10081 case PCRE2_NEWLINE_NUL:
10082 cb.nllen = 1;
10083 cb.nl[0] = CHAR_NUL;
10084 break;
10085
10086 case PCRE2_NEWLINE_CRLF:
10087 cb.nllen = 2;
10088 cb.nl[0] = CHAR_CR;
10089 cb.nl[1] = CHAR_NL;
10090 break;
10091
10092 case PCRE2_NEWLINE_ANY:
10093 cb.nltype = NLTYPE_ANY;
10094 break;
10095
10096 case PCRE2_NEWLINE_ANYCRLF:
10097 cb.nltype = NLTYPE_ANYCRLF;
10098 break;
10099
10100 default:
10101 errorcode = ERR56;
10102 goto HAD_EARLY_ERROR;
10103 }
10104
10105/* Pre-scan the pattern to do two things: (1) Discover the named groups and
10106their numerical equivalents, so that this information is always available for
10107the remaining processing. (2) At the same time, parse the pattern and put a
10108processed version into the parsed_pattern vector. This has escapes interpreted
10109and comments removed (amongst other things).
10110
10111In all but one case, when PCRE2_AUTO_CALLOUT is not set, the number of unsigned
1011232-bit ints in the parsed pattern is bounded by the length of the pattern plus
10113one (for the terminator) plus four if PCRE2_EXTRA_WORD or PCRE2_EXTRA_LINE is
10114set. The exceptional case is when running in 32-bit, non-UTF mode, when literal
10115characters greater than META_END (0x80000000) have to be coded as two units. In
10116this case, therefore, we scan the pattern to check for such values. */
10117
10118#if PCRE2_CODE_UNIT_WIDTH == 32
10119if (!utf)
10120 {
10121 PCRE2_SPTR p;
10122 for (p = ptr; p < cb.end_pattern; p++) if (*p >= META_END) big32count++;
10123 }
10124#endif
10125
10126/* Ensure that the parsed pattern buffer is big enough. When PCRE2_AUTO_CALLOUT
10127is set we have to assume a numerical callout (4 elements) for each character
10128plus one at the end. This is overkill, but memory is plentiful these days. For
10129many smaller patterns the vector on the stack (which was set up above) can be
10130used. */
10131
10132parsed_size_needed = patlen - skipatstart + big32count;
10133
10134if ((ccontext->extra_options &
10135 (PCRE2_EXTRA_MATCH_WORD|PCRE2_EXTRA_MATCH_LINE)) != 0)
10136 parsed_size_needed += 4;
10137
10138if ((options & PCRE2_AUTO_CALLOUT) != 0)
10139 parsed_size_needed = (parsed_size_needed + 1) * 5;
10140
10141if (parsed_size_needed >= PARSED_PATTERN_DEFAULT_SIZE)
10142 {
10143 uint32_t *heap_parsed_pattern = ccontext->memctl.malloc(
10144 (parsed_size_needed + 1) * sizeof(uint32_t), ccontext->memctl.memory_data);
10145 if (heap_parsed_pattern == NULL)
10146 {
10147 *errorptr = ERR21;
10148 goto EXIT;
10149 }
10150 cb.parsed_pattern = heap_parsed_pattern;
10151 }
10152cb.parsed_pattern_end = cb.parsed_pattern + parsed_size_needed + 1;
10153
10154/* Do the parsing scan. */
10155
10156errorcode = parse_regex(ptr, cb.external_options, &has_lookbehind, &cb);
10157if (errorcode != 0) goto HAD_CB_ERROR;
10158
10159/* Workspace is needed to remember information about numbered groups: whether a
10160group can match an empty string and what its fixed length is. This is done to
10161avoid the possibility of recursive references causing very long compile times
10162when checking these features. Unnumbered groups do not have this exposure since
10163they cannot be referenced. We use an indexed vector for this purpose. If there
10164are sufficiently few groups, the default vector on the stack, as set up above,
10165can be used. Otherwise we have to get/free a special vector. The vector must be
10166initialized to zero. */
10167
10168if (cb.bracount >= GROUPINFO_DEFAULT_SIZE)
10169 {
10170 cb.groupinfo = ccontext->memctl.malloc(
10171 (cb.bracount + 1)*sizeof(uint32_t), ccontext->memctl.memory_data);
10172 if (cb.groupinfo == NULL)
10173 {
10174 errorcode = ERR21;
10175 cb.erroroffset = 0;
10176 goto HAD_CB_ERROR;
10177 }
10178 }
10179memset(cb.groupinfo, 0, (cb.bracount + 1) * sizeof(uint32_t));
10180
10181/* If there were any lookbehinds, scan the parsed pattern to figure out their
10182lengths. */
10183
10184if (has_lookbehind)
10185 {
10186 int loopcount = 0;
10187 errorcode = check_lookbehinds(cb.parsed_pattern, NULL, NULL, &cb, &loopcount);
10188 if (errorcode != 0) goto HAD_CB_ERROR;
10189 }
10190
10191/* For debugging, there is a function that shows the parsed data vector. */
10192
10193#ifdef DEBUG_SHOW_PARSED
10194fprintf(stderr, "+++ Pre-scan complete:\n");
10195show_parsed(&cb);
10196#endif
10197
10198/* For debugging capturing information this code can be enabled. */
10199
10200#ifdef DEBUG_SHOW_CAPTURES
10201 {
10202 named_group *ng = cb.named_groups;
10203 fprintf(stderr, "+++Captures: %d\n", cb.bracount);
10204 for (i = 0; i < cb.names_found; i++, ng++)
10205 {
10206 fprintf(stderr, "+++%3d %.*s\n", ng->number, ng->length, ng->name);
10207 }
10208 }
10209#endif
10210
10211/* Pretend to compile the pattern while actually just accumulating the amount
10212of memory required in the 'length' variable. This behaviour is triggered by
10213passing a non-NULL final argument to compile_regex(). We pass a block of
10214workspace (cworkspace) for it to compile parts of the pattern into; the
10215compiled code is discarded when it is no longer needed, so hopefully this
10216workspace will never overflow, though there is a test for its doing so.
10217
10218On error, errorcode will be set non-zero, so we don't need to look at the
10219result of the function. The initial options have been put into the cb block,
10220but we still have to pass a separate options variable (the first argument)
10221because the options may change as the pattern is processed. */
10222
10223cb.erroroffset = patlen; /* For any subsequent errors that do not set it */
10224pptr = cb.parsed_pattern;
10225code = cworkspace;
10226*code = OP_BRA;
10227
10228(void)compile_regex(cb.external_options, &code, &pptr, &errorcode, 0, &firstcu,
10229 &firstcuflags, &reqcu, &reqcuflags, NULL, &cb, &length);
10230
10231if (errorcode != 0) goto HAD_CB_ERROR; /* Offset is in cb.erroroffset */
10232
10233/* This should be caught in compile_regex(), but just in case... */
10234
10235if (length > MAX_PATTERN_SIZE)
10236 {
10237 errorcode = ERR20;
10238 goto HAD_CB_ERROR;
10239 }
10240
10241/* Compute the size of, and then get and initialize, the data block for storing
10242the compiled pattern and names table. Integer overflow should no longer be
10243possible because nowadays we limit the maximum value of cb.names_found and
10244cb.name_entry_size. */
10245
10246re_blocksize = sizeof(pcre2_real_code) +
10247 CU2BYTES(length +
10248 (PCRE2_SIZE)cb.names_found * (PCRE2_SIZE)cb.name_entry_size);
10249re = (pcre2_real_code *)
10250 ccontext->memctl.malloc(re_blocksize, ccontext->memctl.memory_data);
10251if (re == NULL)
10252 {
10253 errorcode = ERR21;
10254 goto HAD_CB_ERROR;
10255 }
10256
10257/* The compiler may put padding at the end of the pcre2_real_code structure in
10258order to round it up to a multiple of 4 or 8 bytes. This means that when a
10259compiled pattern is copied (for example, when serialized) undefined bytes are
10260read, and this annoys debuggers such as valgrind. To avoid this, we explicitly
10261write to the last 8 bytes of the structure before setting the fields. */
10262
10263memset((char *)re + sizeof(pcre2_real_code) - 8, 0, 8);
10264re->memctl = ccontext->memctl;
10265re->tables = tables;
10266re->executable_jit = NULL;
10267memset(re->start_bitmap, 0, 32 * sizeof(uint8_t));
10268re->blocksize = re_blocksize;
10269re->magic_number = MAGIC_NUMBER;
10270re->compile_options = options;
10271re->overall_options = cb.external_options;
10272re->extra_options = ccontext->extra_options;
10273re->flags = PCRE2_CODE_UNIT_WIDTH/8 | cb.external_flags | setflags;
10274re->limit_heap = limit_heap;
10275re->limit_match = limit_match;
10276re->limit_depth = limit_depth;
10277re->first_codeunit = 0;
10278re->last_codeunit = 0;
10279re->bsr_convention = bsr;
10280re->newline_convention = newline;
10281re->max_lookbehind = 0;
10282re->minlength = 0;
10283re->top_bracket = 0;
10284re->top_backref = 0;
10285re->name_entry_size = cb.name_entry_size;
10286re->name_count = cb.names_found;
10287
10288/* The basic block is immediately followed by the name table, and the compiled
10289code follows after that. */
10290
10291codestart = (PCRE2_SPTR)((uint8_t *)re + sizeof(pcre2_real_code)) +
10292 re->name_entry_size * re->name_count;
10293
10294/* Update the compile data block for the actual compile. The starting points of
10295the name/number translation table and of the code are passed around in the
10296compile data block. The start/end pattern and initial options are already set
10297from the pre-compile phase, as is the name_entry_size field. */
10298
10299cb.parens_depth = 0;
10300cb.assert_depth = 0;
10301cb.lastcapture = 0;
10302cb.name_table = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code));
10303cb.start_code = codestart;
10304cb.req_varyopt = 0;
10305cb.had_accept = FALSE;
10306cb.had_pruneorskip = FALSE;
10307cb.open_caps = NULL;
10308
10309/* If any named groups were found, create the name/number table from the list
10310created in the pre-pass. */
10311
10312if (cb.names_found > 0)
10313 {
10314 named_group *ng = cb.named_groups;
10315 for (i = 0; i < cb.names_found; i++, ng++)
10316 add_name_to_table(&cb, ng->name, ng->length, ng->number, i);
10317 }
10318
10319/* Set up a starting, non-extracting bracket, then compile the expression. On
10320error, errorcode will be set non-zero, so we don't need to look at the result
10321of the function here. */
10322
10323pptr = cb.parsed_pattern;
10324code = (PCRE2_UCHAR *)codestart;
10325*code = OP_BRA;
10326regexrc = compile_regex(re->overall_options, &code, &pptr, &errorcode, 0,
10327 &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL, &cb, NULL);
10328if (regexrc < 0) re->flags |= PCRE2_MATCH_EMPTY;
10329re->top_bracket = cb.bracount;
10330re->top_backref = cb.top_backref;
10331re->max_lookbehind = cb.max_lookbehind;
10332
10333if (cb.had_accept)
10334 {
10335 reqcu = 0; /* Must disable after (*ACCEPT) */
10336 reqcuflags = REQ_NONE;
10337 re->flags |= PCRE2_HASACCEPT; /* Disables minimum length */
10338 }
10339
10340/* Fill in the final opcode and check for disastrous overflow. If no overflow,
10341but the estimated length exceeds the really used length, adjust the value of
10342re->blocksize, and if valgrind support is configured, mark the extra allocated
10343memory as unaddressable, so that any out-of-bound reads can be detected. */
10344
10345*code++ = OP_END;
10346usedlength = code - codestart;
10347if (usedlength > length) errorcode = ERR23; else
10348 {
10349 re->blocksize -= CU2BYTES(length - usedlength);
10350#ifdef SUPPORT_VALGRIND
10351 VALGRIND_MAKE_MEM_NOACCESS(code, CU2BYTES(length - usedlength));
10352#endif
10353 }
10354
10355/* Scan the pattern for recursion/subroutine calls and convert the group
10356numbers into offsets. Maintain a small cache so that repeated groups containing
10357recursions are efficiently handled. */
10358
10359#define RSCAN_CACHE_SIZE 8
10360
10361if (errorcode == 0 && cb.had_recurse)
10362 {
10363 PCRE2_UCHAR *rcode;
10364 PCRE2_SPTR rgroup;
10365 unsigned int ccount = 0;
10366 int start = RSCAN_CACHE_SIZE;
10367 recurse_cache rc[RSCAN_CACHE_SIZE];
10368
10369 for (rcode = (PCRE2_UCHAR *)find_recurse(codestart, utf);
10370 rcode != NULL;
10371 rcode = (PCRE2_UCHAR *)find_recurse(rcode + 1 + LINK_SIZE, utf))
10372 {
10373 int p, groupnumber;
10374
10375 groupnumber = (int)GET(rcode, 1);
10376 if (groupnumber == 0) rgroup = codestart; else
10377 {
10378 PCRE2_SPTR search_from = codestart;
10379 rgroup = NULL;
10380 for (i = 0, p = start; i < ccount; i++, p = (p + 1) & 7)
10381 {
10382 if (groupnumber == rc[p].groupnumber)
10383 {
10384 rgroup = rc[p].group;
10385 break;
10386 }
10387
10388 /* Group n+1 must always start to the right of group n, so we can save
10389 search time below when the new group number is greater than any of the
10390 previously found groups. */
10391
10392 if (groupnumber > rc[p].groupnumber) search_from = rc[p].group;
10393 }
10394
10395 if (rgroup == NULL)
10396 {
10397 rgroup = PRIV(find_bracket)(search_from, utf, groupnumber);
10398 if (rgroup == NULL)
10399 {
10400 errorcode = ERR53;
10401 break;
10402 }
10403 if (--start < 0) start = RSCAN_CACHE_SIZE - 1;
10404 rc[start].groupnumber = groupnumber;
10405 rc[start].group = rgroup;
10406 if (ccount < RSCAN_CACHE_SIZE) ccount++;
10407 }
10408 }
10409
10410 PUT(rcode, 1, rgroup - codestart);
10411 }
10412 }
10413
10414/* In rare debugging situations we sometimes need to look at the compiled code
10415at this stage. */
10416
10417#ifdef DEBUG_CALL_PRINTINT
10418pcre2_printint(re, stderr, TRUE);
10419fprintf(stderr, "Length=%lu Used=%lu\n", length, usedlength);
10420#endif
10421
10422/* Unless disabled, check whether any single character iterators can be
10423auto-possessified. The function overwrites the appropriate opcode values, so
10424the type of the pointer must be cast. NOTE: the intermediate variable "temp" is
10425used in this code because at least one compiler gives a warning about loss of
10426"const" attribute if the cast (PCRE2_UCHAR *)codestart is used directly in the
10427function call. */
10428
10429if (errorcode == 0 && (re->overall_options & PCRE2_NO_AUTO_POSSESS) == 0)
10430 {
10431 PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart;
10432 if (PRIV(auto_possessify)(temp, &cb) != 0) errorcode = ERR80;
10433 }
10434
10435/* Failed to compile, or error while post-processing. */
10436
10437if (errorcode != 0) goto HAD_CB_ERROR;
10438
10439/* Successful compile. If the anchored option was not passed, set it if
10440we can determine that the pattern is anchored by virtue of ^ characters or \A
10441or anything else, such as starting with non-atomic .* when DOTALL is set and
10442there are no occurrences of *PRUNE or *SKIP (though there is an option to
10443disable this case). */
10444
10445if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
10446 is_anchored(codestart, 0, &cb, 0, FALSE))
10447 re->overall_options |= PCRE2_ANCHORED;
10448
10449/* Set up the first code unit or startline flag, the required code unit, and
10450then study the pattern. This code need not be obeyed if PCRE2_NO_START_OPTIMIZE
10451is set, as the data it would create will not be used. Note that a first code
10452unit (but not the startline flag) is useful for anchored patterns because it
10453can still give a quick "no match" and also avoid searching for a last code
10454unit. */
10455
10456if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
10457 {
10458 int minminlength = 0; /* For minimal minlength from first/required CU */
10459
10460 /* If we do not have a first code unit, see if there is one that is asserted
10461 (these are not saved during the compile because they can cause conflicts with
10462 actual literals that follow). */
10463
Elliott Hughes4e19c8e2022-04-15 15:11:02 -070010464 if (firstcuflags >= REQ_NONE)
Elliott Hughes5b808042021-10-01 10:56:10 -070010465 firstcu = find_firstassertedcu(codestart, &firstcuflags, 0);
10466
10467 /* Save the data for a first code unit. The existence of one means the
10468 minimum length must be at least 1. */
10469
Elliott Hughes4e19c8e2022-04-15 15:11:02 -070010470 if (firstcuflags < REQ_NONE)
Elliott Hughes5b808042021-10-01 10:56:10 -070010471 {
10472 re->first_codeunit = firstcu;
10473 re->flags |= PCRE2_FIRSTSET;
10474 minminlength++;
10475
10476 /* Handle caseless first code units. */
10477
10478 if ((firstcuflags & REQ_CASELESS) != 0)
10479 {
10480 if (firstcu < 128 || (!utf && !ucp && firstcu < 255))
10481 {
10482 if (cb.fcc[firstcu] != firstcu) re->flags |= PCRE2_FIRSTCASELESS;
10483 }
10484
10485 /* The first code unit is >= 128 in UTF or UCP mode, or >= 255 otherwise.
10486 In 8-bit UTF mode, codepoints in the range 128-255 are introductory code
10487 points and cannot have another case, but if UCP is set they may do. */
10488
10489#ifdef SUPPORT_UNICODE
10490#if PCRE2_CODE_UNIT_WIDTH == 8
10491 else if (ucp && !utf && UCD_OTHERCASE(firstcu) != firstcu)
10492 re->flags |= PCRE2_FIRSTCASELESS;
10493#else
10494 else if ((utf || ucp) && firstcu <= MAX_UTF_CODE_POINT &&
10495 UCD_OTHERCASE(firstcu) != firstcu)
10496 re->flags |= PCRE2_FIRSTCASELESS;
10497#endif
10498#endif /* SUPPORT_UNICODE */
10499 }
10500 }
10501
10502 /* When there is no first code unit, for non-anchored patterns, see if we can
10503 set the PCRE2_STARTLINE flag. This is helpful for multiline matches when all
10504 branches start with ^ and also when all branches start with non-atomic .* for
10505 non-DOTALL matches when *PRUNE and *SKIP are not present. (There is an option
10506 that disables this case.) */
10507
10508 else if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
10509 is_startline(codestart, 0, &cb, 0, FALSE))
10510 re->flags |= PCRE2_STARTLINE;
10511
10512 /* Handle the "required code unit", if one is set. In the UTF case we can
10513 increment the minimum minimum length only if we are sure this really is a
10514 different character and not a non-starting code unit of the first character,
10515 because the minimum length count is in characters, not code units. */
10516
Elliott Hughes4e19c8e2022-04-15 15:11:02 -070010517 if (reqcuflags < REQ_NONE)
Elliott Hughes5b808042021-10-01 10:56:10 -070010518 {
10519#if PCRE2_CODE_UNIT_WIDTH == 16
10520 if ((re->overall_options & PCRE2_UTF) == 0 || /* Not UTF */
Elliott Hughes4e19c8e2022-04-15 15:11:02 -070010521 firstcuflags >= REQ_NONE || /* First not set */
Elliott Hughes5b808042021-10-01 10:56:10 -070010522 (firstcu & 0xf800) != 0xd800 || /* First not surrogate */
10523 (reqcu & 0xfc00) != 0xdc00) /* Req not low surrogate */
10524#elif PCRE2_CODE_UNIT_WIDTH == 8
10525 if ((re->overall_options & PCRE2_UTF) == 0 || /* Not UTF */
Elliott Hughes4e19c8e2022-04-15 15:11:02 -070010526 firstcuflags >= REQ_NONE || /* First not set */
Elliott Hughes5b808042021-10-01 10:56:10 -070010527 (firstcu & 0x80) == 0 || /* First is ASCII */
10528 (reqcu & 0x80) == 0) /* Req is ASCII */
10529#endif
10530 {
10531 minminlength++;
10532 }
10533
10534 /* In the case of an anchored pattern, set up the value only if it follows
10535 a variable length item in the pattern. */
10536
10537 if ((re->overall_options & PCRE2_ANCHORED) == 0 ||
10538 (reqcuflags & REQ_VARY) != 0)
10539 {
10540 re->last_codeunit = reqcu;
10541 re->flags |= PCRE2_LASTSET;
10542
10543 /* Handle caseless required code units as for first code units (above). */
10544
10545 if ((reqcuflags & REQ_CASELESS) != 0)
10546 {
10547 if (reqcu < 128 || (!utf && !ucp && reqcu < 255))
10548 {
10549 if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS;
10550 }
10551#ifdef SUPPORT_UNICODE
10552#if PCRE2_CODE_UNIT_WIDTH == 8
10553 else if (ucp && !utf && UCD_OTHERCASE(reqcu) != reqcu)
10554 re->flags |= PCRE2_LASTCASELESS;
10555#else
10556 else if ((utf || ucp) && reqcu <= MAX_UTF_CODE_POINT &&
10557 UCD_OTHERCASE(reqcu) != reqcu)
10558 re->flags |= PCRE2_LASTCASELESS;
10559#endif
10560#endif /* SUPPORT_UNICODE */
10561 }
10562 }
10563 }
10564
10565 /* Study the compiled pattern to set up information such as a bitmap of
10566 starting code units and a minimum matching length. */
10567
10568 if (PRIV(study)(re) != 0)
10569 {
10570 errorcode = ERR31;
10571 goto HAD_CB_ERROR;
10572 }
10573
10574 /* If study() set a bitmap of starting code units, it implies a minimum
10575 length of at least one. */
10576
10577 if ((re->flags & PCRE2_FIRSTMAPSET) != 0 && minminlength == 0)
10578 minminlength = 1;
10579
10580 /* If the minimum length set (or not set) by study() is less than the minimum
10581 implied by required code units, override it. */
10582
10583 if (re->minlength < minminlength) re->minlength = minminlength;
10584 } /* End of start-of-match optimizations. */
10585
10586/* Control ends up here in all cases. When running under valgrind, make a
10587pattern's terminating zero defined again. If memory was obtained for the parsed
10588version of the pattern, free it before returning. Also free the list of named
10589groups if a larger one had to be obtained, and likewise the group information
10590vector. */
10591
10592EXIT:
10593#ifdef SUPPORT_VALGRIND
10594if (zero_terminated) VALGRIND_MAKE_MEM_DEFINED(pattern + patlen, CU2BYTES(1));
10595#endif
10596if (cb.parsed_pattern != stack_parsed_pattern)
10597 ccontext->memctl.free(cb.parsed_pattern, ccontext->memctl.memory_data);
10598if (cb.named_group_list_size > NAMED_GROUP_LIST_SIZE)
10599 ccontext->memctl.free((void *)cb.named_groups, ccontext->memctl.memory_data);
10600if (cb.groupinfo != stack_groupinfo)
10601 ccontext->memctl.free((void *)cb.groupinfo, ccontext->memctl.memory_data);
10602return re; /* Will be NULL after an error */
10603
10604/* Errors discovered in parse_regex() set the offset value in the compile
10605block. Errors discovered before it is called must compute it from the ptr
10606value. After parse_regex() is called, the offset in the compile block is set to
10607the end of the pattern, but certain errors in compile_regex() may reset it if
10608an offset is available in the parsed pattern. */
10609
10610HAD_CB_ERROR:
10611ptr = pattern + cb.erroroffset;
10612
10613HAD_EARLY_ERROR:
10614*erroroffset = ptr - pattern;
10615
10616HAD_ERROR:
10617*errorptr = errorcode;
10618pcre2_code_free(re);
10619re = NULL;
10620goto EXIT;
10621}
10622
10623/* End of pcre2_compile.c */