Blame - src/pcre2_compile.c - platform/external/pcre

blob: de259c9c40573d352b9769b16a1a09c813ab6ee6 [file] [log] [blame]

Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	1	/*************************************************
				2	* Perl-Compatible Regular Expressions *
				3	*************************************************/
				4
				5	/* PCRE is a library of functions to support regular expressions whose syntax
				6	and semantics are as close as possible to those of the Perl 5 language.
				7
				8	Written by Philip Hazel
				9	Original API code Copyright (c) 1997-2012 University of Cambridge
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	10	New API code Copyright (c) 2016-2022 University of Cambridge
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	11
				12	-----------------------------------------------------------------------------
				13	Redistribution and use in source and binary forms, with or without
				14	modification, are permitted provided that the following conditions are met:
				15
				16	* Redistributions of source code must retain the above copyright notice,
				17	this list of conditions and the following disclaimer.
				18
				19	* Redistributions in binary form must reproduce the above copyright
				20	notice, this list of conditions and the following disclaimer in the
				21	documentation and/or other materials provided with the distribution.
				22
				23	* Neither the name of the University of Cambridge nor the names of its
				24	contributors may be used to endorse or promote products derived from
				25	this software without specific prior written permission.
				26
				27	THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
				28	AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
				29	IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
				30	ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
				31	LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
				32	CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
				33	SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
				34	INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
				35	CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
				36	ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
				37	POSSIBILITY OF SUCH DAMAGE.
				38	-----------------------------------------------------------------------------
				39	*/
				40
				41
				42	#ifdef HAVE_CONFIG_H
				43	#include "config.h"
				44	#endif
				45
				46	#define NLBLOCK cb /* Block containing newline information */
				47	#define PSSTART start_pattern /* Field containing processed string start */
				48	#define PSEND end_pattern /* Field containing processed string end */
				49
				50	#include "pcre2_internal.h"
				51
				52	/* In rare error cases debugging might require calling pcre2_printint(). */
				53
				54	#if 0
				55	#ifdef EBCDIC
				56	#define PRINTABLE(c) ((c) >= 64 && (c) < 255)
				57	#else
				58	#define PRINTABLE(c) ((c) >= 32 && (c) < 127)
				59	#endif
				60	#include "pcre2_printint.c"
				61	#define DEBUG_CALL_PRINTINT
				62	#endif
				63
				64	/* Other debugging code can be enabled by these defines. */
				65
				66	/* #define DEBUG_SHOW_CAPTURES */
				67	/* #define DEBUG_SHOW_PARSED */
				68
				69	/* There are a few things that vary with different code unit sizes. Handle them
				70	by defining macros in order to minimize #if usage. */
				71
				72	#if PCRE2_CODE_UNIT_WIDTH == 8
				73	#define STRING_UTFn_RIGHTPAR STRING_UTF8_RIGHTPAR, 5
				74	#define XDIGIT(c) xdigitab[c]
				75
				76	#else /* Either 16-bit or 32-bit */
				77	#define XDIGIT(c) (MAX_255(c)? xdigitab[c] : 0xff)
				78
				79	#if PCRE2_CODE_UNIT_WIDTH == 16
				80	#define STRING_UTFn_RIGHTPAR STRING_UTF16_RIGHTPAR, 6
				81
				82	#else /* 32-bit */
				83	#define STRING_UTFn_RIGHTPAR STRING_UTF32_RIGHTPAR, 6
				84	#endif
				85	#endif
				86
				87	/* Macros to store and retrieve a PCRE2_SIZE value in the parsed pattern, which
				88	consists of uint32_t elements. Assume that if uint32_t can't hold it, two of
				89	them will be able to (i.e. assume a 64-bit world). */
				90
				91	#if PCRE2_SIZE_MAX <= UINT32_MAX
				92	#define PUTOFFSET(s,p) *p++ = s
				93	#define GETOFFSET(s,p) s = *p++
				94	#define GETPLUSOFFSET(s,p) s = *(++p)
				95	#define READPLUSOFFSET(s,p) s = p[1]
				96	#define SKIPOFFSET(p) p++
				97	#define SIZEOFFSET 1
				98	#else
				99	#define PUTOFFSET(s,p) \
				100	{ *p++ = (uint32_t)(s >> 32); *p++ = (uint32_t)(s & 0xffffffff); }
				101	#define GETOFFSET(s,p) \
				102	{ s = ((PCRE2_SIZE)p[0] << 32) | (PCRE2_SIZE)p[1]; p += 2; }
				103	#define GETPLUSOFFSET(s,p) \
				104	{ s = ((PCRE2_SIZE)p[1] << 32) | (PCRE2_SIZE)p[2]; p += 2; }
				105	#define READPLUSOFFSET(s,p) \
				106	{ s = ((PCRE2_SIZE)p[1] << 32) | (PCRE2_SIZE)p[2]; }
				107	#define SKIPOFFSET(p) p += 2
				108	#define SIZEOFFSET 2
				109	#endif
				110
				111	/* Macros for manipulating elements of the parsed pattern vector. */
				112
				113	#define META_CODE(x) (x & 0xffff0000u)
				114	#define META_DATA(x) (x & 0x0000ffffu)
				115	#define META_DIFF(x,y) ((x-y)>>16)
				116
				117	/* Function definitions to allow mutual recursion */
				118
				119	#ifdef SUPPORT_UNICODE
				120	static unsigned int
				121	add_list_to_class_internal(uint8_t *, PCRE2_UCHAR **, uint32_t,
				122	compile_block *, const uint32_t *, unsigned int);
				123	#endif
				124
				125	static int
				126	compile_regex(uint32_t, PCRE2_UCHAR **, uint32_t **, int *, uint32_t,
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	127	uint32_t *, uint32_t *, uint32_t *, uint32_t *, branch_chain *,
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	128	compile_block *, PCRE2_SIZE *);
				129
				130	static int
				131	get_branchlength(uint32_t **, int *, int *, parsed_recurse_check *,
				132	compile_block *);
				133
				134	static BOOL
				135	set_lookbehind_lengths(uint32_t **, int *, int *, parsed_recurse_check *,
				136	compile_block *);
				137
				138	static int
				139	check_lookbehinds(uint32_t *, uint32_t **, parsed_recurse_check *,
				140	compile_block *, int *);
				141
				142
				143	/*************************************************
				144	* Code parameters and static tables *
				145	*************************************************/
				146
				147	#define MAX_GROUP_NUMBER 65535u
				148	#define MAX_REPEAT_COUNT 65535u
				149	#define REPEAT_UNLIMITED (MAX_REPEAT_COUNT+1)
				150
				151	/* COMPILE_WORK_SIZE specifies the size of stack workspace, which is used in
				152	different ways in the different pattern scans. The parsing and group-
				153	identifying pre-scan uses it to handle nesting, and needs it to be 16-bit
				154	aligned for this. Having defined the size in code units, we set up
				155	C16_WORK_SIZE as the number of elements in the 16-bit vector.
				156
				157	During the first compiling phase, when determining how much memory is required,
				158	the regex is partly compiled into this space, but the compiled parts are
				159	discarded as soon as they can be, so that hopefully there will never be an
				160	overrun. The code does, however, check for an overrun, which can occur for
				161	pathological patterns. The size of the workspace depends on LINK_SIZE because
				162	the length of compiled items varies with this.
				163
				164	In the real compile phase, this workspace is not currently used. */
				165
				166	#define COMPILE_WORK_SIZE (3000*LINK_SIZE) /* Size in code units */
				167
				168	#define C16_WORK_SIZE \
				169	((COMPILE_WORK_SIZE * sizeof(PCRE2_UCHAR))/sizeof(uint16_t))
				170
				171	/* A uint32_t vector is used for caching information about the size of
				172	capturing groups, to improve performance. A default is created on the stack of
				173	this size. */
				174
				175	#define GROUPINFO_DEFAULT_SIZE 256
				176
				177	/* The overrun tests check for a slightly smaller size so that they detect the
				178	overrun before it actually does run off the end of the data block. */
				179
				180	#define WORK_SIZE_SAFETY_MARGIN (100)
				181
				182	/* This value determines the size of the initial vector that is used for
				183	remembering named groups during the pre-compile. It is allocated on the stack,
				184	but if it is too small, it is expanded, in a similar way to the workspace. The
				185	value is the number of slots in the list. */
				186
				187	#define NAMED_GROUP_LIST_SIZE 20
				188
				189	/* The pre-compiling pass over the pattern creates a parsed pattern in a vector
				190	of uint32_t. For short patterns this lives on the stack, with this size. Heap
				191	memory is used for longer patterns. */
				192
				193	#define PARSED_PATTERN_DEFAULT_SIZE 1024
				194
				195	/* Maximum length value to check against when making sure that the variable
				196	that holds the compiled pattern length does not overflow. We make it a bit less
				197	than INT_MAX to allow for adding in group terminating code units, so that we
				198	don't have to check them every time. */
				199
				200	#define OFLOW_MAX (INT_MAX - 20)
				201
				202	/* Code values for parsed patterns, which are stored in a vector of 32-bit
				203	unsigned ints. Values less than META_END are literal data values. The coding
				204	for identifying the item is in the top 16-bits, leaving 16 bits for the
				205	additional data that some of them need. The META_CODE, META_DATA, and META_DIFF
				206	macros are used to manipulate parsed pattern elements.
				207
				208	NOTE: When these definitions are changed, the table of extra lengths for each
				209	code (meta_extra_lengths, just below) must be updated to remain in step. */
				210
				211	#define META_END 0x80000000u /* End of pattern */
				212
				213	#define META_ALT 0x80010000u /* alternation */
				214	#define META_ATOMIC 0x80020000u /* atomic group */
				215	#define META_BACKREF 0x80030000u /* Back ref */
				216	#define META_BACKREF_BYNAME 0x80040000u /* \k'name' */
				217	#define META_BIGVALUE 0x80050000u /* Next is a literal > META_END */
				218	#define META_CALLOUT_NUMBER 0x80060000u /* (?C with numerical argument */
				219	#define META_CALLOUT_STRING 0x80070000u /* (?C with string argument */
				220	#define META_CAPTURE 0x80080000u /* Capturing parenthesis */
				221	#define META_CIRCUMFLEX 0x80090000u /* ^ metacharacter */
				222	#define META_CLASS 0x800a0000u /* start non-empty class */
				223	#define META_CLASS_EMPTY 0x800b0000u /* empty class */
				224	#define META_CLASS_EMPTY_NOT 0x800c0000u /* negative empty class */
				225	#define META_CLASS_END 0x800d0000u /* end of non-empty class */
				226	#define META_CLASS_NOT 0x800e0000u /* start non-empty negative class */
				227	#define META_COND_ASSERT 0x800f0000u /* (?(?assertion)... */
				228	#define META_COND_DEFINE 0x80100000u /* (?(DEFINE)... */
				229	#define META_COND_NAME 0x80110000u /* (?(<name>)... */
				230	#define META_COND_NUMBER 0x80120000u /* (?(digits)... */
				231	#define META_COND_RNAME 0x80130000u /* (?(R&name)... */
				232	#define META_COND_RNUMBER 0x80140000u /* (?(Rdigits)... */
				233	#define META_COND_VERSION 0x80150000u /* (?(VERSION<op>x.y)... */
				234	#define META_DOLLAR 0x80160000u /* $ metacharacter */
				235	#define META_DOT 0x80170000u /* . metacharacter */
				236	#define META_ESCAPE 0x80180000u /* \d and friends */
				237	#define META_KET 0x80190000u /* closing parenthesis */
				238	#define META_NOCAPTURE 0x801a0000u /* no capture parens */
				239	#define META_OPTIONS 0x801b0000u /* (?i) and friends */
				240	#define META_POSIX 0x801c0000u /* POSIX class item */
				241	#define META_POSIX_NEG 0x801d0000u /* negative POSIX class item */
				242	#define META_RANGE_ESCAPED 0x801e0000u /* range with at least one escape */
				243	#define META_RANGE_LITERAL 0x801f0000u /* range defined literally */
				244	#define META_RECURSE 0x80200000u /* Recursion */
				245	#define META_RECURSE_BYNAME 0x80210000u /* (?&name) */
				246	#define META_SCRIPT_RUN 0x80220000u /* (*script_run:...) */
				247
				248	/* These must be kept together to make it easy to check that an assertion
				249	is present where expected in a conditional group. */
				250
				251	#define META_LOOKAHEAD 0x80230000u /* (?= */
				252	#define META_LOOKAHEADNOT 0x80240000u /* (?! */
				253	#define META_LOOKBEHIND 0x80250000u /* (?<= */
				254	#define META_LOOKBEHINDNOT 0x80260000u /* (?<! */
				255
				256	/* These cannot be conditions */
				257
				258	#define META_LOOKAHEAD_NA 0x80270000u /* (*napla: */
				259	#define META_LOOKBEHIND_NA 0x80280000u /* (*naplb: */
				260
				261	/* These must be kept in this order, with consecutive values, and the _ARG
				262	versions of COMMIT, PRUNE, SKIP, and THEN immediately after their non-argument
				263	versions. */
				264
				265	#define META_MARK 0x80290000u /* (*MARK) */
				266	#define META_ACCEPT 0x802a0000u /* (*ACCEPT) */
				267	#define META_FAIL 0x802b0000u /* (*FAIL) */
				268	#define META_COMMIT 0x802c0000u /* These */
				269	#define META_COMMIT_ARG 0x802d0000u /* pairs */
				270	#define META_PRUNE 0x802e0000u /* must */
				271	#define META_PRUNE_ARG 0x802f0000u /* be */
				272	#define META_SKIP 0x80300000u /* kept */
				273	#define META_SKIP_ARG 0x80310000u /* in */
				274	#define META_THEN 0x80320000u /* this */
				275	#define META_THEN_ARG 0x80330000u /* order */
				276
				277	/* These must be kept in groups of adjacent 3 values, and all together. */
				278
				279	#define META_ASTERISK 0x80340000u /* * */
				280	#define META_ASTERISK_PLUS 0x80350000u /* *+ */
				281	#define META_ASTERISK_QUERY 0x80360000u /* *? */
				282	#define META_PLUS 0x80370000u /* + */
				283	#define META_PLUS_PLUS 0x80380000u /* ++ */
				284	#define META_PLUS_QUERY 0x80390000u /* +? */
				285	#define META_QUERY 0x803a0000u /* ? */
				286	#define META_QUERY_PLUS 0x803b0000u /* ?+ */
				287	#define META_QUERY_QUERY 0x803c0000u /* ?? */
				288	#define META_MINMAX 0x803d0000u /* {n,m} repeat */
				289	#define META_MINMAX_PLUS 0x803e0000u /* {n,m}+ repeat */
				290	#define META_MINMAX_QUERY 0x803f0000u /* {n,m}? repeat */
				291
				292	#define META_FIRST_QUANTIFIER META_ASTERISK
				293	#define META_LAST_QUANTIFIER META_MINMAX_QUERY
				294
				295	/* This is a special "meta code" that is used only to distinguish (*asr: from
				296	(*sr: in the table of alphabetic assertions. It is never stored in the parsed
				297	pattern because (*asr: is turned into (*sr:(*atomic: at that stage. There is
				298	therefore no need for it to have a length entry, so use a high value. */
				299
				300	#define META_ATOMIC_SCRIPT_RUN 0x8fff0000u
				301
				302	/* Table of extra lengths for each of the meta codes. Must be kept in step with
				303	the definitions above. For some items these values are a basic length to which
				304	a variable amount has to be added. */
				305
				306	static unsigned char meta_extra_lengths[] = {
				307	0, /* META_END */
				308	0, /* META_ALT */
				309	0, /* META_ATOMIC */
				310	0, /* META_BACKREF - more if group is >= 10 */
				311	1+SIZEOFFSET, /* META_BACKREF_BYNAME */
				312	1, /* META_BIGVALUE */
				313	3, /* META_CALLOUT_NUMBER */
				314	3+SIZEOFFSET, /* META_CALLOUT_STRING */
				315	0, /* META_CAPTURE */
				316	0, /* META_CIRCUMFLEX */
				317	0, /* META_CLASS */
				318	0, /* META_CLASS_EMPTY */
				319	0, /* META_CLASS_EMPTY_NOT */
				320	0, /* META_CLASS_END */
				321	0, /* META_CLASS_NOT */
				322	0, /* META_COND_ASSERT */
				323	SIZEOFFSET, /* META_COND_DEFINE */
				324	1+SIZEOFFSET, /* META_COND_NAME */
				325	1+SIZEOFFSET, /* META_COND_NUMBER */
				326	1+SIZEOFFSET, /* META_COND_RNAME */
				327	1+SIZEOFFSET, /* META_COND_RNUMBER */
				328	3, /* META_COND_VERSION */
				329	0, /* META_DOLLAR */
				330	0, /* META_DOT */
				331	0, /* META_ESCAPE - more for ESC_P, ESC_p, ESC_g, ESC_k */
				332	0, /* META_KET */
				333	0, /* META_NOCAPTURE */
				334	1, /* META_OPTIONS */
				335	1, /* META_POSIX */
				336	1, /* META_POSIX_NEG */
				337	0, /* META_RANGE_ESCAPED */
				338	0, /* META_RANGE_LITERAL */
				339	SIZEOFFSET, /* META_RECURSE */
				340	1+SIZEOFFSET, /* META_RECURSE_BYNAME */
				341	0, /* META_SCRIPT_RUN */
				342	0, /* META_LOOKAHEAD */
				343	0, /* META_LOOKAHEADNOT */
				344	SIZEOFFSET, /* META_LOOKBEHIND */
				345	SIZEOFFSET, /* META_LOOKBEHINDNOT */
				346	0, /* META_LOOKAHEAD_NA */
				347	SIZEOFFSET, /* META_LOOKBEHIND_NA */
				348	1, /* META_MARK - plus the string length */
				349	0, /* META_ACCEPT */
				350	0, /* META_FAIL */
				351	0, /* META_COMMIT */
				352	1, /* META_COMMIT_ARG - plus the string length */
				353	0, /* META_PRUNE */
				354	1, /* META_PRUNE_ARG - plus the string length */
				355	0, /* META_SKIP */
				356	1, /* META_SKIP_ARG - plus the string length */
				357	0, /* META_THEN */
				358	1, /* META_THEN_ARG - plus the string length */
				359	0, /* META_ASTERISK */
				360	0, /* META_ASTERISK_PLUS */
				361	0, /* META_ASTERISK_QUERY */
				362	0, /* META_PLUS */
				363	0, /* META_PLUS_PLUS */
				364	0, /* META_PLUS_QUERY */
				365	0, /* META_QUERY */
				366	0, /* META_QUERY_PLUS */
				367	0, /* META_QUERY_QUERY */
				368	2, /* META_MINMAX */
				369	2, /* META_MINMAX_PLUS */
				370	2 /* META_MINMAX_QUERY */
				371	};
				372
				373	/* Types for skipping parts of a parsed pattern. */
				374
				375	enum { PSKIP_ALT, PSKIP_CLASS, PSKIP_KET };
				376
				377	/* Macro for setting individual bits in class bitmaps. It took some
				378	experimenting to figure out how to stop gcc 5.3.0 from warning with
				379	-Wconversion. This version gets a warning:
				380
				381	#define SETBIT(a,b) a[(b)/8] |= (uint8_t)(1u << ((b)&7))
				382
				383	Let's hope the apparently less efficient version isn't actually so bad if the
				384	compiler is clever with identical subexpressions. */
				385
				386	#define SETBIT(a,b) a[(b)/8] = (uint8_t)(a[(b)/8] | (1u << ((b)&7)))
				387
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	388	/* Values and flags for the unsigned xxcuflags variables that accompany xxcu
				389	variables, which are concerned with first and required code units. A value
				390	greater than or equal to REQ_NONE means "no code unit set"; otherwise the
				391	matching xxcu variable is set, and the low valued bits are relevant. */
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	392
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	393	#define REQ_UNSET 0xffffffffu /* Not yet found anything */
				394	#define REQ_NONE 0xfffffffeu /* Found not fixed character */
				395	#define REQ_CASELESS 0x00000001u /* Code unit in xxcu is caseless */
				396	#define REQ_VARY 0x00000002u /* Code unit is followed by non-literal */
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	397
				398	/* These flags are used in the groupinfo vector. */
				399
				400	#define GI_SET_FIXED_LENGTH 0x80000000u
				401	#define GI_NOT_FIXED_LENGTH 0x40000000u
				402	#define GI_FIXED_LENGTH_MASK 0x0000ffffu
				403
				404	/* This simple test for a decimal digit works for both ASCII/Unicode and EBCDIC
				405	and is fast (a good compiler can turn it into a subtraction and unsigned
				406	comparison). */
				407
				408	#define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
				409
				410	/* Table to identify hex digits. The tables in chartables are dependent on the
				411	locale, and may mark arbitrary characters as digits. We want to recognize only
				412	0-9, a-f, and A-F as hex digits, which is why we have a private table here. It
				413	costs 256 bytes, but it is a lot faster than doing character value tests (at
				414	least in some simple cases I timed), and in some applications one wants PCRE2
				415	to compile efficiently as well as match efficiently. The value in the table is
				416	the binary hex digit value, or 0xff for non-hex digits. */
				417
				418	/* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
				419	UTF-8 mode. */
				420
				421	#ifndef EBCDIC
				422	static const uint8_t xdigitab[] =
				423	{
				424	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 0- 7 */
				425	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 8- 15 */
				426	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 16- 23 */
				427	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 24- 31 */
				428	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* - ' */
				429	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* ( - / */
				430	0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /* 0 - 7 */
				431	0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff, /* 8 - ? */
				432	0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* @ - G */
				433	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* H - O */
				434	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* P - W */
				435	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* X - _ */
				436	0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* ` - g */
				437	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* h - o */
				438	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* p - w */
				439	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* x -127 */
				440	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 128-135 */
				441	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 136-143 */
				442	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144-151 */
				443	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 152-159 */
				444	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160-167 */
				445	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 168-175 */
				446	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 176-183 */
				447	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191 */
				448	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 192-199 */
				449	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 200-207 */
				450	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 208-215 */
				451	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 216-223 */
				452	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 224-231 */
				453	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 232-239 */
				454	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 240-247 */
				455	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff};/* 248-255 */
				456
				457	#else
				458
				459	/* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
				460
				461	static const uint8_t xdigitab[] =
				462	{
				463	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 0- 7 0 */
				464	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 8- 15 */
				465	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 16- 23 10 */
				466	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 24- 31 */
				467	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 32- 39 20 */
				468	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 40- 47 */
				469	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 48- 55 30 */
				470	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 56- 63 */
				471	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* - 71 40 */
				472	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 72- | */
				473	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* & - 87 50 */
				474	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 88- 95 */
				475	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* - -103 60 */
				476	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 104- ? */
				477	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 112-119 70 */
				478	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 120- " */
				479	0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* 128- g 80 */
				480	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* h -143 */
				481	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144- p 90 */
				482	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* q -159 */
				483	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160- x A0 */
				484	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* y -175 */
				485	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* ^ -183 B0 */
				486	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191 */
				487	0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* { - G C0 */
				488	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* H -207 */
				489	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* } - P D0 */
				490	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* Q -223 */
				491	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* \ - X E0 */
				492	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* Y -239 */
				493	0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /* 0 - 7 F0 */
				494	0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff};/* 8 -255 */
				495	#endif /* EBCDIC */
				496
				497
				498	/* Table for handling alphanumeric escaped characters. Positive returns are
				499	simple data values; negative values are for special things like \d and so on.
				500	Zero means further processing is needed (for things like \x), or the escape is
				501	invalid. */
				502
				503	/* This is the "normal" table for ASCII systems or for EBCDIC systems running
				504	in UTF-8 mode. It runs from '0' to 'z'. */
				505
				506	#ifndef EBCDIC
				507	#define ESCAPES_FIRST CHAR_0
				508	#define ESCAPES_LAST CHAR_z
				509	#define UPPER_CASE(c) (c-32)
				510
				511	static const short int escapes[] = {
				512	0, 0,
				513	0, 0,
				514	0, 0,
				515	0, 0,
				516	0, 0,
				517	CHAR_COLON, CHAR_SEMICOLON,
				518	CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN,
				519	CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK,
				520	CHAR_COMMERCIAL_AT, -ESC_A,
				521	-ESC_B, -ESC_C,
				522	-ESC_D, -ESC_E,
				523	0, -ESC_G,
				524	-ESC_H, 0,
				525	0, -ESC_K,
				526	0, 0,
				527	-ESC_N, 0,
				528	-ESC_P, -ESC_Q,
				529	-ESC_R, -ESC_S,
				530	0, 0,
				531	-ESC_V, -ESC_W,
				532	-ESC_X, 0,
				533	-ESC_Z, CHAR_LEFT_SQUARE_BRACKET,
				534	CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
				535	CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE,
				536	CHAR_GRAVE_ACCENT, CHAR_BEL,
				537	-ESC_b, 0,
				538	-ESC_d, CHAR_ESC,
				539	CHAR_FF, 0,
				540	-ESC_h, 0,
				541	0, -ESC_k,
				542	0, 0,
				543	CHAR_LF, 0,
				544	-ESC_p, 0,
				545	CHAR_CR, -ESC_s,
				546	CHAR_HT, 0,
				547	-ESC_v, -ESC_w,
				548	0, 0,
				549	-ESC_z
				550	};
				551
				552	#else
				553
				554	/* This is the "abnormal" table for EBCDIC systems without UTF-8 support.
				555	It runs from 'a' to '9'. For some minimal testing of EBCDIC features, the code
				556	is sometimes compiled on an ASCII system. In this case, we must not use CHAR_a
				557	because it is defined as 'a', which of course picks up the ASCII value. */
				558
				559	#if 'a' == 0x81 /* Check for a real EBCDIC environment */
				560	#define ESCAPES_FIRST CHAR_a
				561	#define ESCAPES_LAST CHAR_9
				562	#define UPPER_CASE(c) (c+64)
				563	#else /* Testing in an ASCII environment */
				564	#define ESCAPES_FIRST ((unsigned char)'\x81') /* EBCDIC 'a' */
				565	#define ESCAPES_LAST ((unsigned char)'\xf9') /* EBCDIC '9' */
				566	#define UPPER_CASE(c) (c-32)
				567	#endif
				568
				569	static const short int escapes[] = {
				570	/* 80 */ CHAR_BEL, -ESC_b, 0, -ESC_d, CHAR_ESC, CHAR_FF, 0,
				571	/* 88 */ -ESC_h, 0, 0, '{', 0, 0, 0, 0,
				572	/* 90 */ 0, 0, -ESC_k, 0, 0, CHAR_LF, 0, -ESC_p,
				573	/* 98 */ 0, CHAR_CR, 0, '}', 0, 0, 0, 0,
				574	/* A0 */ 0, '~', -ESC_s, CHAR_HT, 0, -ESC_v, -ESC_w, 0,
				575	/* A8 */ 0, -ESC_z, 0, 0, 0, '[', 0, 0,
				576	/* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
				577	/* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
				578	/* C0 */ '{', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G,
				579	/* C8 */ -ESC_H, 0, 0, 0, 0, 0, 0, 0,
				580	/* D0 */ '}', 0, -ESC_K, 0, 0, -ESC_N, 0, -ESC_P,
				581	/* D8 */ -ESC_Q, -ESC_R, 0, 0, 0, 0, 0, 0,
				582	/* E0 */ '\\', 0, -ESC_S, 0, 0, -ESC_V, -ESC_W, -ESC_X,
				583	/* E8 */ 0, -ESC_Z, 0, 0, 0, 0, 0, 0,
				584	/* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
				585	/* F8 */ 0, 0
				586	};
				587
				588	/* We also need a table of characters that may follow \c in an EBCDIC
				589	environment for characters 0-31. */
				590
				591	static unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_";
				592
				593	#endif /* EBCDIC */
				594
				595
				596	/* Table of special "verbs" like (*PRUNE). This is a short table, so it is
				597	searched linearly. Put all the names into a single string, in order to reduce
				598	the number of relocations when a shared library is dynamically linked. The
				599	string is built from string macros so that it works in UTF-8 mode on EBCDIC
				600	platforms. */
				601
				602	typedef struct verbitem {
				603	unsigned int len; /* Length of verb name */
				604	uint32_t meta; /* Base META_ code */
				605	int has_arg; /* Argument requirement */
				606	} verbitem;
				607
				608	static const char verbnames[] =
				609	"\0" /* Empty name is a shorthand for MARK */
				610	STRING_MARK0
				611	STRING_ACCEPT0
				612	STRING_F0
				613	STRING_FAIL0
				614	STRING_COMMIT0
				615	STRING_PRUNE0
				616	STRING_SKIP0
				617	STRING_THEN;
				618
				619	static const verbitem verbs[] = {
				620	{ 0, META_MARK, +1 }, /* > 0 => must have an argument */
				621	{ 4, META_MARK, +1 },
				622	{ 6, META_ACCEPT, -1 }, /* < 0 => Optional argument, convert to pre-MARK */
				623	{ 1, META_FAIL, -1 },
				624	{ 4, META_FAIL, -1 },
				625	{ 6, META_COMMIT, 0 },
				626	{ 5, META_PRUNE, 0 }, /* Optional argument; bump META code if found */
				627	{ 4, META_SKIP, 0 },
				628	{ 4, META_THEN, 0 }
				629	};
				630
				631	static const int verbcount = sizeof(verbs)/sizeof(verbitem);
				632
				633	/* Verb opcodes, indexed by their META code offset from META_MARK. */
				634
				635	static const uint32_t verbops[] = {
				636	OP_MARK, OP_ACCEPT, OP_FAIL, OP_COMMIT, OP_COMMIT_ARG, OP_PRUNE,
				637	OP_PRUNE_ARG, OP_SKIP, OP_SKIP_ARG, OP_THEN, OP_THEN_ARG };
				638
				639	/* Table of "alpha assertions" like (*pla:...), similar to the (*VERB) table. */
				640
				641	typedef struct alasitem {
				642	unsigned int len; /* Length of name */
				643	uint32_t meta; /* Base META_ code */
				644	} alasitem;
				645
				646	static const char alasnames[] =
				647	STRING_pla0
				648	STRING_plb0
				649	STRING_napla0
				650	STRING_naplb0
				651	STRING_nla0
				652	STRING_nlb0
				653	STRING_positive_lookahead0
				654	STRING_positive_lookbehind0
				655	STRING_non_atomic_positive_lookahead0
				656	STRING_non_atomic_positive_lookbehind0
				657	STRING_negative_lookahead0
				658	STRING_negative_lookbehind0
				659	STRING_atomic0
				660	STRING_sr0
				661	STRING_asr0
				662	STRING_script_run0
				663	STRING_atomic_script_run;
				664
				665	static const alasitem alasmeta[] = {
				666	{ 3, META_LOOKAHEAD },
				667	{ 3, META_LOOKBEHIND },
				668	{ 5, META_LOOKAHEAD_NA },
				669	{ 5, META_LOOKBEHIND_NA },
				670	{ 3, META_LOOKAHEADNOT },
				671	{ 3, META_LOOKBEHINDNOT },
				672	{ 18, META_LOOKAHEAD },
				673	{ 19, META_LOOKBEHIND },
				674	{ 29, META_LOOKAHEAD_NA },
				675	{ 30, META_LOOKBEHIND_NA },
				676	{ 18, META_LOOKAHEADNOT },
				677	{ 19, META_LOOKBEHINDNOT },
				678	{ 6, META_ATOMIC },
				679	{ 2, META_SCRIPT_RUN }, /* sr = script run */
				680	{ 3, META_ATOMIC_SCRIPT_RUN }, /* asr = atomic script run */
				681	{ 10, META_SCRIPT_RUN }, /* script run */
				682	{ 17, META_ATOMIC_SCRIPT_RUN } /* atomic script run */
				683	};
				684
				685	static const int alascount = sizeof(alasmeta)/sizeof(alasitem);
				686
				687	/* Offsets from OP_STAR for case-independent and negative repeat opcodes. */
				688
				689	static uint32_t chartypeoffset[] = {
				690	OP_STAR - OP_STAR, OP_STARI - OP_STAR,
				691	OP_NOTSTAR - OP_STAR, OP_NOTSTARI - OP_STAR };
				692
				693	/* Tables of names of POSIX character classes and their lengths. The names are
				694	now all in a single string, to reduce the number of relocations when a shared
				695	library is dynamically loaded. The list of lengths is terminated by a zero
				696	length entry. The first three must be alpha, lower, upper, as this is assumed
				697	for handling case independence. The indices for graph, print, and punct are
				698	needed, so identify them. */
				699
				700	static const char posix_names[] =
				701	STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
				702	STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
				703	STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
				704	STRING_word0 STRING_xdigit;
				705
				706	static const uint8_t posix_name_lengths[] = {
				707	5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
				708
				709	#define PC_GRAPH 8
				710	#define PC_PRINT 9
				711	#define PC_PUNCT 10
				712
				713	/* Table of class bit maps for each POSIX class. Each class is formed from a
				714	base map, with an optional addition or removal of another map. Then, for some
				715	classes, there is some additional tweaking: for [:blank:] the vertical space
				716	characters are removed, and for [:alpha:] and [:alnum:] the underscore
				717	character is removed. The triples in the table consist of the base map offset,
				718	second map offset or -1 if no second map, and a non-negative value for map
				719	addition or a negative value for map subtraction (if there are two maps). The
				720	absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
				721	remove vertical space characters, 2 => remove underscore. */
				722
				723	static const int posix_class_maps[] = {
				724	cbit_word, cbit_digit, -2, /* alpha */
				725	cbit_lower, -1, 0, /* lower */
				726	cbit_upper, -1, 0, /* upper */
				727	cbit_word, -1, 2, /* alnum - word without underscore */
				728	cbit_print, cbit_cntrl, 0, /* ascii */
				729	cbit_space, -1, 1, /* blank - a GNU extension */
				730	cbit_cntrl, -1, 0, /* cntrl */
				731	cbit_digit, -1, 0, /* digit */
				732	cbit_graph, -1, 0, /* graph */
				733	cbit_print, -1, 0, /* print */
				734	cbit_punct, -1, 0, /* punct */
				735	cbit_space, -1, 0, /* space */
				736	cbit_word, -1, 0, /* word - a Perl extension */
				737	cbit_xdigit,-1, 0 /* xdigit */
				738	};
				739
				740	#ifdef SUPPORT_UNICODE
				741
				742	/* The POSIX class Unicode property substitutes that are used in UCP mode must
				743	be in the order of the POSIX class names, defined above. */
				744
				745	static int posix_substitutes[] = {
				746	PT_GC, ucp_L, /* alpha */
				747	PT_PC, ucp_Ll, /* lower */
				748	PT_PC, ucp_Lu, /* upper */
				749	PT_ALNUM, 0, /* alnum */
				750	-1, 0, /* ascii, treat as non-UCP */
				751	-1, 1, /* blank, treat as \h */
				752	PT_PC, ucp_Cc, /* cntrl */
				753	PT_PC, ucp_Nd, /* digit */
				754	PT_PXGRAPH, 0, /* graph */
				755	PT_PXPRINT, 0, /* print */
				756	PT_PXPUNCT, 0, /* punct */
				757	PT_PXSPACE, 0, /* space / / Xps is POSIX space, but from 8.34 */
				758	PT_WORD, 0, /* word / / Perl and POSIX space are the same */
				759	-1, 0 /* xdigit, treat as non-UCP */
				760	};
				761	#define POSIX_SUBSIZE (sizeof(posix_substitutes) / (2*sizeof(uint32_t)))
				762	#endif /* SUPPORT_UNICODE */
				763
				764	/* Masks for checking option settings. When PCRE2_LITERAL is set, only a subset
				765	are allowed. */
				766
				767	#define PUBLIC_LITERAL_COMPILE_OPTIONS \
				768	(PCRE2_ANCHORED\|PCRE2_AUTO_CALLOUT\|PCRE2_CASELESS\|PCRE2_ENDANCHORED\| \
				769	PCRE2_FIRSTLINE\|PCRE2_LITERAL\|PCRE2_MATCH_INVALID_UTF\| \
				770	PCRE2_NO_START_OPTIMIZE\|PCRE2_NO_UTF_CHECK\|PCRE2_USE_OFFSET_LIMIT\|PCRE2_UTF)
				771
				772	#define PUBLIC_COMPILE_OPTIONS \
				773	(PUBLIC_LITERAL_COMPILE_OPTIONS\| \
				774	PCRE2_ALLOW_EMPTY_CLASS\|PCRE2_ALT_BSUX\|PCRE2_ALT_CIRCUMFLEX\| \
				775	PCRE2_ALT_VERBNAMES\|PCRE2_DOLLAR_ENDONLY\|PCRE2_DOTALL\|PCRE2_DUPNAMES\| \
				776	PCRE2_EXTENDED\|PCRE2_EXTENDED_MORE\|PCRE2_MATCH_UNSET_BACKREF\| \
				777	PCRE2_MULTILINE\|PCRE2_NEVER_BACKSLASH_C\|PCRE2_NEVER_UCP\| \
				778	PCRE2_NEVER_UTF\|PCRE2_NO_AUTO_CAPTURE\|PCRE2_NO_AUTO_POSSESS\| \
				779	PCRE2_NO_DOTSTAR_ANCHOR\|PCRE2_UCP\|PCRE2_UNGREEDY)
				780
				781	#define PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS \
				782	(PCRE2_EXTRA_MATCH_LINE\|PCRE2_EXTRA_MATCH_WORD)
				783
				784	#define PUBLIC_COMPILE_EXTRA_OPTIONS \
				785	(PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS\| \
				786	PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES\|PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL\| \
				787	PCRE2_EXTRA_ESCAPED_CR_IS_LF\|PCRE2_EXTRA_ALT_BSUX\| \
				788	PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK)
				789
				790	/* Compile time error code numbers. They are given names so that they can more
				791	easily be tracked. When a new number is added, the tables called eint1 and
				792	eint2 in pcre2posix.c may need to be updated, and a new error text must be
				793	added to compile_error_texts in pcre2_error.c. Also, the error codes in
				794	pcre2.h.in must be updated - their values are exactly 100 greater than these
				795	values. */
				796
				797	enum { ERR0 = COMPILE_ERROR_BASE,
				798	ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9, ERR10,
				799	ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19, ERR20,
				800	ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29, ERR30,
				801	ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39, ERR40,
				802	ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, ERR50,
				803	ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60,
				804	ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70,
				805	ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80,
				806	ERR81, ERR82, ERR83, ERR84, ERR85, ERR86, ERR87, ERR88, ERR89, ERR90,
				807	ERR91, ERR92, ERR93, ERR94, ERR95, ERR96, ERR97, ERR98, ERR99 };
				808
/* This is a table of start-of-pattern options such as (*UTF) and settings such
as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward
compatibility, (*UTFn) is supported in the relevant libraries, but (*UTF) is
generic and always supported. */
				813
				814	enum { PSO_OPT, /* Value is an option bit */
				815	PSO_FLG, /* Value is a flag bit */
				816	PSO_NL, /* Value is a newline type */
				817	PSO_BSR, /* Value is a \R type */
				818	PSO_LIMH, /* Read integer value for heap limit */
				819	PSO_LIMM, /* Read integer value for match limit */
				820	PSO_LIMD }; /* Read integer value for depth limit */
				821
				822	typedef struct pso {
				823	const uint8_t *name;
				824	uint16_t length;
				825	uint16_t type;
				826	uint32_t value;
				827	} pso;
				828
				829	/* NB: STRING_UTFn_RIGHTPAR contains the length as well */
				830
				831	static pso pso_list[] = {
				832	{ (uint8_t *)STRING_UTFn_RIGHTPAR, PSO_OPT, PCRE2_UTF },
				833	{ (uint8_t *)STRING_UTF_RIGHTPAR, 4, PSO_OPT, PCRE2_UTF },
				834	{ (uint8_t *)STRING_UCP_RIGHTPAR, 4, PSO_OPT, PCRE2_UCP },
				835	{ (uint8_t *)STRING_NOTEMPTY_RIGHTPAR, 9, PSO_FLG, PCRE2_NOTEMPTY_SET },
				836	{ (uint8_t *)STRING_NOTEMPTY_ATSTART_RIGHTPAR, 17, PSO_FLG, PCRE2_NE_ATST_SET },
				837	{ (uint8_t *)STRING_NO_AUTO_POSSESS_RIGHTPAR, 16, PSO_OPT, PCRE2_NO_AUTO_POSSESS },
				838	{ (uint8_t *)STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR, 18, PSO_OPT, PCRE2_NO_DOTSTAR_ANCHOR },
				839	{ (uint8_t *)STRING_NO_JIT_RIGHTPAR, 7, PSO_FLG, PCRE2_NOJIT },
				840	{ (uint8_t *)STRING_NO_START_OPT_RIGHTPAR, 13, PSO_OPT, PCRE2_NO_START_OPTIMIZE },
				841	{ (uint8_t *)STRING_LIMIT_HEAP_EQ, 11, PSO_LIMH, 0 },
				842	{ (uint8_t *)STRING_LIMIT_MATCH_EQ, 12, PSO_LIMM, 0 },
				843	{ (uint8_t *)STRING_LIMIT_DEPTH_EQ, 12, PSO_LIMD, 0 },
				844	{ (uint8_t *)STRING_LIMIT_RECURSION_EQ, 16, PSO_LIMD, 0 },
				845	{ (uint8_t *)STRING_CR_RIGHTPAR, 3, PSO_NL, PCRE2_NEWLINE_CR },
				846	{ (uint8_t *)STRING_LF_RIGHTPAR, 3, PSO_NL, PCRE2_NEWLINE_LF },
				847	{ (uint8_t *)STRING_CRLF_RIGHTPAR, 5, PSO_NL, PCRE2_NEWLINE_CRLF },
				848	{ (uint8_t *)STRING_ANY_RIGHTPAR, 4, PSO_NL, PCRE2_NEWLINE_ANY },
				849	{ (uint8_t *)STRING_NUL_RIGHTPAR, 4, PSO_NL, PCRE2_NEWLINE_NUL },
				850	{ (uint8_t *)STRING_ANYCRLF_RIGHTPAR, 8, PSO_NL, PCRE2_NEWLINE_ANYCRLF },
				851	{ (uint8_t *)STRING_BSR_ANYCRLF_RIGHTPAR, 12, PSO_BSR, PCRE2_BSR_ANYCRLF },
				852	{ (uint8_t *)STRING_BSR_UNICODE_RIGHTPAR, 12, PSO_BSR, PCRE2_BSR_UNICODE }
				853	};
				854
				855	/* This table is used when converting repeating opcodes into possessified
				856	versions as a result of an explicit possessive quantifier such as ++. A zero
				857	value means there is no possessified version - in those cases the item in
				858	question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
				859	because all relevant opcodes are less than that. */
				860
				861	static const uint8_t opcode_possessify[] = {
				862	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 15 */
				863	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16 - 31 */
				864
				865	0, /* NOTI */
				866	OP_POSSTAR, 0, /* STAR, MINSTAR */
				867	OP_POSPLUS, 0, /* PLUS, MINPLUS */
				868	OP_POSQUERY, 0, /* QUERY, MINQUERY */
				869	OP_POSUPTO, 0, /* UPTO, MINUPTO */
				870	0, /* EXACT */
				871	0, 0, 0, 0, /* POS{STAR,PLUS,QUERY,UPTO} */
				872
				873	OP_POSSTARI, 0, /* STARI, MINSTARI */
				874	OP_POSPLUSI, 0, /* PLUSI, MINPLUSI */
				875	OP_POSQUERYI, 0, /* QUERYI, MINQUERYI */
				876	OP_POSUPTOI, 0, /* UPTOI, MINUPTOI */
				877	0, /* EXACTI */
				878	0, 0, 0, 0, /* POS{STARI,PLUSI,QUERYI,UPTOI} */
				879
				880	OP_NOTPOSSTAR, 0, /* NOTSTAR, NOTMINSTAR */
				881	OP_NOTPOSPLUS, 0, /* NOTPLUS, NOTMINPLUS */
				882	OP_NOTPOSQUERY, 0, /* NOTQUERY, NOTMINQUERY */
				883	OP_NOTPOSUPTO, 0, /* NOTUPTO, NOTMINUPTO */
				884	0, /* NOTEXACT */
				885	0, 0, 0, 0, /* NOTPOS{STAR,PLUS,QUERY,UPTO} */
				886
				887	OP_NOTPOSSTARI, 0, /* NOTSTARI, NOTMINSTARI */
				888	OP_NOTPOSPLUSI, 0, /* NOTPLUSI, NOTMINPLUSI */
				889	OP_NOTPOSQUERYI, 0, /* NOTQUERYI, NOTMINQUERYI */
				890	OP_NOTPOSUPTOI, 0, /* NOTUPTOI, NOTMINUPTOI */
				891	0, /* NOTEXACTI */
				892	0, 0, 0, 0, /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */
				893
				894	OP_TYPEPOSSTAR, 0, /* TYPESTAR, TYPEMINSTAR */
				895	OP_TYPEPOSPLUS, 0, /* TYPEPLUS, TYPEMINPLUS */
				896	OP_TYPEPOSQUERY, 0, /* TYPEQUERY, TYPEMINQUERY */
				897	OP_TYPEPOSUPTO, 0, /* TYPEUPTO, TYPEMINUPTO */
				898	0, /* TYPEEXACT */
				899	0, 0, 0, 0, /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */
				900
				901	OP_CRPOSSTAR, 0, /* CRSTAR, CRMINSTAR */
				902	OP_CRPOSPLUS, 0, /* CRPLUS, CRMINPLUS */
				903	OP_CRPOSQUERY, 0, /* CRQUERY, CRMINQUERY */
				904	OP_CRPOSRANGE, 0, /* CRRANGE, CRMINRANGE */
				905	0, 0, 0, 0, /* CRPOS{STAR,PLUS,QUERY,RANGE} */
				906
				907	0, 0, 0, /* CLASS, NCLASS, XCLASS */
				908	0, 0, /* REF, REFI */
				909	0, 0, /* DNREF, DNREFI */
				910	0, 0 /* RECURSE, CALLOUT */
				911	};
				912
				913
				914	#ifdef DEBUG_SHOW_PARSED
				915	/*************************************************
				916	* Show the parsed pattern for debugging *
				917	*************************************************/
				918
				919	/* For debugging the pre-scan, this code, which outputs the parsed data vector,
				920	can be enabled. */
				921
				922	static void show_parsed(compile_block *cb)
				923	{
				924	uint32_t *pptr = cb->parsed_pattern;
				925
				926	for (;;)
				927	{
				928	int max, min;
				929	PCRE2_SIZE offset;
				930	uint32_t i;
				931	uint32_t length;
				932	uint32_t meta_arg = META_DATA(*pptr);
				933
				934	fprintf(stderr, "+++ %02d %.8x ", (int)(pptr - cb->parsed_pattern), *pptr);
				935
				936	if (*pptr < META_END)
				937	{
				938	if (pptr > 32 && pptr < 128) fprintf(stderr, "%c", *pptr);
				939	pptr++;
				940	}
				941
				942	else switch (META_CODE(*pptr++))
				943	{
				944	default:
				945	fprintf(stderr, "** OOPS - unknown META value - giving up **\n");
				946	return;
				947
				948	case META_END:
				949	fprintf(stderr, "META_END\n");
				950	return;
				951
				952	case META_CAPTURE:
				953	fprintf(stderr, "META_CAPTURE %d", meta_arg);
				954	break;
				955
				956	case META_RECURSE:
				957	GETOFFSET(offset, pptr);
				958	fprintf(stderr, "META_RECURSE %d %zd", meta_arg, offset);
				959	break;
				960
				961	case META_BACKREF:
				962	if (meta_arg < 10)
				963	offset = cb->small_ref_offset[meta_arg];
				964	else
				965	GETOFFSET(offset, pptr);
				966	fprintf(stderr, "META_BACKREF %d %zd", meta_arg, offset);
				967	break;
				968
				969	case META_ESCAPE:
				970	if (meta_arg == ESC_P \|\| meta_arg == ESC_p)
				971	{
				972	uint32_t ptype = *pptr >> 16;
				973	uint32_t pvalue = *pptr++ & 0xffff;
				974	fprintf(stderr, "META \\%c %d %d", (meta_arg == ESC_P)? 'P':'p',
				975	ptype, pvalue);
				976	}
				977	else
				978	{
				979	uint32_t cc;
				980	/* There's just one escape we might have here that isn't negated in the
				981	escapes table. */
				982	if (meta_arg == ESC_g) cc = CHAR_g;
				983	else for (cc = ESCAPES_FIRST; cc <= ESCAPES_LAST; cc++)
				984	{
				985	if (meta_arg == (uint32_t)(-escapes[cc - ESCAPES_FIRST])) break;
				986	}
				987	if (cc > ESCAPES_LAST) cc = CHAR_QUESTION_MARK;
				988	fprintf(stderr, "META \\%c", cc);
				989	}
				990	break;
				991
				992	case META_MINMAX:
				993	min = *pptr++;
				994	max = *pptr++;
				995	if (max != REPEAT_UNLIMITED)
				996	fprintf(stderr, "META {%d,%d}", min, max);
				997	else
				998	fprintf(stderr, "META {%d,}", min);
				999	break;
				1000
				1001	case META_MINMAX_QUERY:
				1002	min = *pptr++;
				1003	max = *pptr++;
				1004	if (max != REPEAT_UNLIMITED)
				1005	fprintf(stderr, "META {%d,%d}?", min, max);
				1006	else
				1007	fprintf(stderr, "META {%d,}?", min);
				1008	break;
				1009
				1010	case META_MINMAX_PLUS:
				1011	min = *pptr++;
				1012	max = *pptr++;
				1013	if (max != REPEAT_UNLIMITED)
				1014	fprintf(stderr, "META {%d,%d}+", min, max);
				1015	else
				1016	fprintf(stderr, "META {%d,}+", min);
				1017	break;
				1018
				1019	case META_BIGVALUE: fprintf(stderr, "META_BIGVALUE %.8x", *pptr++); break;
				1020	case META_CIRCUMFLEX: fprintf(stderr, "META_CIRCUMFLEX"); break;
				1021	case META_COND_ASSERT: fprintf(stderr, "META_COND_ASSERT"); break;
				1022	case META_DOLLAR: fprintf(stderr, "META_DOLLAR"); break;
				1023	case META_DOT: fprintf(stderr, "META_DOT"); break;
				1024	case META_ASTERISK: fprintf(stderr, "META *"); break;
				1025	case META_ASTERISK_QUERY: fprintf(stderr, "META *?"); break;
				1026	case META_ASTERISK_PLUS: fprintf(stderr, "META *+"); break;
				1027	case META_PLUS: fprintf(stderr, "META +"); break;
				1028	case META_PLUS_QUERY: fprintf(stderr, "META +?"); break;
				1029	case META_PLUS_PLUS: fprintf(stderr, "META ++"); break;
				1030	case META_QUERY: fprintf(stderr, "META ?"); break;
				1031	case META_QUERY_QUERY: fprintf(stderr, "META ??"); break;
				1032	case META_QUERY_PLUS: fprintf(stderr, "META ?+"); break;
				1033
				1034	case META_ATOMIC: fprintf(stderr, "META (?>"); break;
				1035	case META_NOCAPTURE: fprintf(stderr, "META (?:"); break;
				1036	case META_LOOKAHEAD: fprintf(stderr, "META (?="); break;
				1037	case META_LOOKAHEADNOT: fprintf(stderr, "META (?!"); break;
				1038	case META_LOOKAHEAD_NA: fprintf(stderr, "META (*napla:"); break;
				1039	case META_SCRIPT_RUN: fprintf(stderr, "META (*sr:"); break;
				1040	case META_KET: fprintf(stderr, "META )"); break;
				1041	case META_ALT: fprintf(stderr, "META \| %d", meta_arg); break;
				1042
				1043	case META_CLASS: fprintf(stderr, "META ["); break;
				1044	case META_CLASS_NOT: fprintf(stderr, "META [^"); break;
				1045	case META_CLASS_END: fprintf(stderr, "META ]"); break;
				1046	case META_CLASS_EMPTY: fprintf(stderr, "META []"); break;
				1047	case META_CLASS_EMPTY_NOT: fprintf(stderr, "META [^]"); break;
				1048
				1049	case META_RANGE_LITERAL: fprintf(stderr, "META - (literal)"); break;
				1050	case META_RANGE_ESCAPED: fprintf(stderr, "META - (escaped)"); break;
				1051
				1052	case META_POSIX: fprintf(stderr, "META_POSIX %d", *pptr++); break;
				1053	case META_POSIX_NEG: fprintf(stderr, "META_POSIX_NEG %d", *pptr++); break;
				1054
				1055	case META_ACCEPT: fprintf(stderr, "META (*ACCEPT)"); break;
				1056	case META_FAIL: fprintf(stderr, "META (*FAIL)"); break;
				1057	case META_COMMIT: fprintf(stderr, "META (*COMMIT)"); break;
				1058	case META_PRUNE: fprintf(stderr, "META (*PRUNE)"); break;
				1059	case META_SKIP: fprintf(stderr, "META (*SKIP)"); break;
				1060	case META_THEN: fprintf(stderr, "META (*THEN)"); break;
				1061
				1062	case META_OPTIONS: fprintf(stderr, "META_OPTIONS 0x%02x", *pptr++); break;
				1063
				1064	case META_LOOKBEHIND:
				1065	fprintf(stderr, "META (?<= %d offset=", meta_arg);
				1066	GETOFFSET(offset, pptr);
				1067	fprintf(stderr, "%zd", offset);
				1068	break;
				1069
				1070	case META_LOOKBEHIND_NA:
				1071	fprintf(stderr, "META (*naplb: %d offset=", meta_arg);
				1072	GETOFFSET(offset, pptr);
				1073	fprintf(stderr, "%zd", offset);
				1074	break;
				1075
				1076	case META_LOOKBEHINDNOT:
				1077	fprintf(stderr, "META (?<! %d offset=", meta_arg);
				1078	GETOFFSET(offset, pptr);
				1079	fprintf(stderr, "%zd", offset);
				1080	break;
				1081
				1082	case META_CALLOUT_NUMBER:
				1083	fprintf(stderr, "META (?C%d) next=%d/%d", pptr[2], pptr[0],
				1084	pptr[1]);
				1085	pptr += 3;
				1086	break;
				1087
				1088	case META_CALLOUT_STRING:
				1089	{
				1090	uint32_t patoffset = pptr++; / Offset of next pattern item */
				1091	uint32_t patlength = pptr++; / Length of next pattern item */
				1092	fprintf(stderr, "META (?Cstring) length=%d offset=", *pptr++);
				1093	GETOFFSET(offset, pptr);
				1094	fprintf(stderr, "%zd next=%d/%d", offset, patoffset, patlength);
				1095	}
				1096	break;
				1097
				1098	case META_RECURSE_BYNAME:
				1099	fprintf(stderr, "META (?(&name) length=%d offset=", *pptr++);
				1100	GETOFFSET(offset, pptr);
				1101	fprintf(stderr, "%zd", offset);
				1102	break;
				1103
				1104	case META_BACKREF_BYNAME:
				1105	fprintf(stderr, "META_BACKREF_BYNAME length=%d offset=", *pptr++);
				1106	GETOFFSET(offset, pptr);
				1107	fprintf(stderr, "%zd", offset);
				1108	break;
				1109
				1110	case META_COND_NUMBER:
				1111	fprintf(stderr, "META_COND_NUMBER %d offset=", pptr[SIZEOFFSET]);
				1112	GETOFFSET(offset, pptr);
				1113	fprintf(stderr, "%zd", offset);
				1114	pptr++;
				1115	break;
				1116
				1117	case META_COND_DEFINE:
				1118	fprintf(stderr, "META (?(DEFINE) offset=");
				1119	GETOFFSET(offset, pptr);
				1120	fprintf(stderr, "%zd", offset);
				1121	break;
				1122
				1123	case META_COND_VERSION:
				1124	fprintf(stderr, "META (?(VERSION%s", (*pptr++ == 0)? "=" : ">=");
				1125	fprintf(stderr, "%d.", *pptr++);
				1126	fprintf(stderr, "%d)", *pptr++);
				1127	break;
				1128
				1129	case META_COND_NAME:
				1130	fprintf(stderr, "META (?(<name>) length=%d offset=", *pptr++);
				1131	GETOFFSET(offset, pptr);
				1132	fprintf(stderr, "%zd", offset);
				1133	break;
				1134
				1135	case META_COND_RNAME:
				1136	fprintf(stderr, "META (?(R&name) length=%d offset=", *pptr++);
				1137	GETOFFSET(offset, pptr);
				1138	fprintf(stderr, "%zd", offset);
				1139	break;
				1140
				1141	/* This is kept as a name, because it might be. */
				1142
				1143	case META_COND_RNUMBER:
				1144	fprintf(stderr, "META (?(Rnumber) length=%d offset=", *pptr++);
				1145	GETOFFSET(offset, pptr);
				1146	fprintf(stderr, "%zd", offset);
				1147	break;
				1148
				1149	case META_MARK:
				1150	fprintf(stderr, "META (*MARK:");
				1151	goto SHOWARG;
				1152
				1153	case META_COMMIT_ARG:
				1154	fprintf(stderr, "META (*COMMIT:");
				1155	goto SHOWARG;
				1156
				1157	case META_PRUNE_ARG:
				1158	fprintf(stderr, "META (*PRUNE:");
				1159	goto SHOWARG;
				1160
				1161	case META_SKIP_ARG:
				1162	fprintf(stderr, "META (*SKIP:");
				1163	goto SHOWARG;
				1164
				1165	case META_THEN_ARG:
				1166	fprintf(stderr, "META (*THEN:");
				1167	SHOWARG:
				1168	length = *pptr++;
				1169	for (i = 0; i < length; i++)
				1170	{
				1171	uint32_t cc = *pptr++;
				1172	if (cc > 32 && cc < 128) fprintf(stderr, "%c", cc);
				1173	else fprintf(stderr, "\\x{%x}", cc);
				1174	}
				1175	fprintf(stderr, ") length=%u", length);
				1176	break;
				1177	}
				1178	fprintf(stderr, "\n");
				1179	}
				1180	return;
				1181	}
				1182	#endif /* DEBUG_SHOW_PARSED */
				1183
				1184
				1185
				1186	/*************************************************
				1187	* Copy compiled code *
				1188	*************************************************/
				1189
				1190	/* Compiled JIT code cannot be copied, so the new compiled block has no
				1191	associated JIT data. */
				1192
				1193	PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
				1194	pcre2_code_copy(const pcre2_code *code)
				1195	{
				1196	PCRE2_SIZE* ref_count;
				1197	pcre2_code *newcode;
				1198
				1199	if (code == NULL) return NULL;
				1200	newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
				1201	if (newcode == NULL) return NULL;
				1202	memcpy(newcode, code, code->blocksize);
				1203	newcode->executable_jit = NULL;
				1204
				1205	/* If the code is one that has been deserialized, increment the reference count
				1206	in the decoded tables. */
				1207
				1208	if ((code->flags & PCRE2_DEREF_TABLES) != 0)
				1209	{
				1210	ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);
				1211	(*ref_count)++;
				1212	}
				1213
				1214	return newcode;
				1215	}
				1216
				1217
				1218
				1219	/*************************************************
				1220	* Copy compiled code and character tables *
				1221	*************************************************/
				1222
				1223	/* Compiled JIT code cannot be copied, so the new compiled block has no
				1224	associated JIT data. This version of code_copy also makes a separate copy of
				1225	the character tables. */
				1226
				1227	PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
				1228	pcre2_code_copy_with_tables(const pcre2_code *code)
				1229	{
				1230	PCRE2_SIZE* ref_count;
				1231	pcre2_code *newcode;
				1232	uint8_t *newtables;
				1233
				1234	if (code == NULL) return NULL;
				1235	newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
				1236	if (newcode == NULL) return NULL;
				1237	memcpy(newcode, code, code->blocksize);
				1238	newcode->executable_jit = NULL;
				1239
				1240	newtables = code->memctl.malloc(TABLES_LENGTH + sizeof(PCRE2_SIZE),
				1241	code->memctl.memory_data);
				1242	if (newtables == NULL)
				1243	{
				1244	code->memctl.free((void *)newcode, code->memctl.memory_data);
				1245	return NULL;
				1246	}
				1247	memcpy(newtables, code->tables, TABLES_LENGTH);
				1248	ref_count = (PCRE2_SIZE *)(newtables + TABLES_LENGTH);
				1249	*ref_count = 1;
				1250
				1251	newcode->tables = newtables;
				1252	newcode->flags \|= PCRE2_DEREF_TABLES;
				1253	return newcode;
				1254	}
				1255
				1256
				1257
				1258	/*************************************************
				1259	* Free compiled code *
				1260	*************************************************/
				1261
				1262	PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
				1263	pcre2_code_free(pcre2_code *code)
				1264	{
				1265	PCRE2_SIZE* ref_count;
				1266
				1267	if (code != NULL)
				1268	{
				1269	if (code->executable_jit != NULL)
				1270	PRIV(jit_free)(code->executable_jit, &code->memctl);
				1271
				1272	if ((code->flags & PCRE2_DEREF_TABLES) != 0)
				1273	{
				1274	/* Decoded tables belong to the codes after deserialization, and they must
				1275	be freed when there are no more references to them. The *ref_count should
				1276	always be > 0. */
				1277
				1278	ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);
				1279	if (*ref_count > 0)
				1280	{
				1281	(*ref_count)--;
				1282	if (*ref_count == 0)
				1283	code->memctl.free((void *)code->tables, code->memctl.memory_data);
				1284	}
				1285	}
				1286
				1287	code->memctl.free(code, code->memctl.memory_data);
				1288	}
				1289	}
				1290
				1291
				1292
				1293	/*************************************************
				1294	* Read a number, possibly signed *
				1295	*************************************************/
				1296
				1297	/* This function is used to read numbers in the pattern. The initial pointer
				1298	must be the sign or first digit of the number. When relative values (introduced
				1299	by + or -) are allowed, they are relative group numbers, and the result must be
				1300	greater than zero.
				1301
				1302	Arguments:
				1303	ptrptr points to the character pointer variable
				1304	ptrend points to the end of the input string
				1305	allow_sign if < 0, sign not allowed; if >= 0, sign is relative to this
				1306	max_value the largest number allowed
				1307	max_error the error to give for an over-large number
				1308	intptr where to put the result
				1309	errcodeptr where to put an error code
				1310
				1311	Returns: TRUE - a number was read
				1312	FALSE - errorcode == 0 => no number was found
				1313	errorcode != 0 => an error occurred
				1314	*/
				1315
				1316	static BOOL
				1317	read_number(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, int32_t allow_sign,
				1318	uint32_t max_value, uint32_t max_error, int intptr, int errorcodeptr)
				1319	{
				1320	int sign = 0;
				1321	uint32_t n = 0;
				1322	PCRE2_SPTR ptr = *ptrptr;
				1323	BOOL yield = FALSE;
				1324
				1325	*errorcodeptr = 0;
				1326
				1327	if (allow_sign >= 0 && ptr < ptrend)
				1328	{
				1329	if (*ptr == CHAR_PLUS)
				1330	{
				1331	sign = +1;
				1332	max_value -= allow_sign;
				1333	ptr++;
				1334	}
				1335	else if (*ptr == CHAR_MINUS)
				1336	{
				1337	sign = -1;
				1338	ptr++;
				1339	}
				1340	}
				1341
				1342	if (ptr >= ptrend \|\| !IS_DIGIT(*ptr)) return FALSE;
				1343	while (ptr < ptrend && IS_DIGIT(*ptr))
				1344	{
				1345	n = n * 10 + *ptr++ - CHAR_0;
				1346	if (n > max_value)
				1347	{
				1348	*errorcodeptr = max_error;
				1349	goto EXIT;
				1350	}
				1351	}
				1352
				1353	if (allow_sign >= 0 && sign != 0)
				1354	{
				1355	if (n == 0)
				1356	{
				1357	errorcodeptr = ERR26; / +0 and -0 are not allowed */
				1358	goto EXIT;
				1359	}
				1360
				1361	if (sign > 0) n += allow_sign;
				1362	else if ((int)n > allow_sign)
				1363	{
				1364	errorcodeptr = ERR15; / Non-existent subpattern */
				1365	goto EXIT;
				1366	}
				1367	else n = allow_sign + 1 - n;
				1368	}
				1369
				1370	yield = TRUE;
				1371
				1372	EXIT:
				1373	*intptr = n;
				1374	*ptrptr = ptr;
				1375	return yield;
				1376	}
				1377
				1378
				1379
				1380	/*************************************************
				1381	* Read repeat counts *
				1382	*************************************************/
				1383
				1384	/* Read an item of the form {n,m} and return the values if non-NULL pointers
				1385	are supplied. Repeat counts must be less than 65536 (MAX_REPEAT_COUNT); a
				1386	larger value is used for "unlimited". We have to use signed arguments for
				1387	read_number() because it is capable of returning a signed value.
				1388
				1389	Arguments:
				1390	ptrptr points to pointer to character after'{'
				1391	ptrend pointer to end of input
				1392	minp if not NULL, pointer to int for min
				1393	maxp if not NULL, pointer to int for max (-1 if no max)
				1394	returned as -1 if no max
				1395	errorcodeptr points to error code variable
				1396
				1397	Returns: FALSE if not a repeat quantifier, errorcode set zero
				1398	FALSE on error, with errorcode set non-zero
				1399	TRUE on success, with pointer updated to point after '}'
				1400	*/
				1401
				1402	static BOOL
				1403	read_repeat_counts(PCRE2_SPTR ptrptr, PCRE2_SPTR ptrend, uint32_t minp,
				1404	uint32_t maxp, int errorcodeptr)
				1405	{
				1406	PCRE2_SPTR p;
				1407	BOOL yield = FALSE;
				1408	BOOL had_comma = FALSE;
				1409	int32_t min = 0;
				1410	int32_t max = REPEAT_UNLIMITED; /* This value is larger than MAX_REPEAT_COUNT */
				1411
				1412	/* Check the syntax */
				1413
				1414	*errorcodeptr = 0;
				1415	for (p = *ptrptr;; p++)
				1416	{
				1417	uint32_t c;
				1418	if (p >= ptrend) return FALSE;
				1419	c = *p;
				1420	if (IS_DIGIT(c)) continue;
				1421	if (c == CHAR_RIGHT_CURLY_BRACKET) break;
				1422	if (c == CHAR_COMMA)
				1423	{
				1424	if (had_comma) return FALSE;
				1425	had_comma = TRUE;
				1426	}
				1427	else return FALSE;
				1428	}
				1429
				1430	/* The only error from read_number() is for a number that is too big. */
				1431
				1432	p = *ptrptr;
				1433	if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &min, errorcodeptr))
				1434	goto EXIT;
				1435
				1436	if (*p == CHAR_RIGHT_CURLY_BRACKET)
				1437	{
				1438	p++;
				1439	max = min;
				1440	}
				1441	else
				1442	{
				1443	if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
				1444	{
				1445	if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &max,
				1446	errorcodeptr))
				1447	goto EXIT;
				1448	if (max < min)
				1449	{
				1450	*errorcodeptr = ERR4;
				1451	goto EXIT;
				1452	}
				1453	}
				1454	p++;
				1455	}
				1456
				1457	yield = TRUE;
				1458	if (minp != NULL) *minp = (uint32_t)min;
				1459	if (maxp != NULL) *maxp = (uint32_t)max;
				1460
				1461	/* Update the pattern pointer */
				1462
				1463	EXIT:
				1464	*ptrptr = p;
				1465	return yield;
				1466	}
				1467
				1468
				1469
				1470	/*************************************************
				1471	* Handle escapes *
				1472	*************************************************/
				1473
				1474	/* This function is called when a \ has been encountered. It either returns a
				1475	positive value for a simple escape such as \d, or 0 for a data character, which
				1476	is placed in chptr. A backreference to group n is returned as negative n. On
				1477	entry, ptr is pointing at the character after \. On exit, it points after the
				1478	final code unit of the escape sequence.
				1479
				1480	This function is also called from pcre2_substitute() to handle escape sequences
				1481	in replacement strings. In this case, the cb argument is NULL, and in the case
				1482	of escapes that have further processing, only sequences that define a data
				1483	character are recognised. The isclass argument is not relevant; the options
				1484	argument is the final value of the compiled pattern's options.
				1485
				1486	Arguments:
				1487	ptrptr points to the input position pointer
				1488	ptrend points to the end of the input
				1489	chptr points to a returned data character
				1490	errorcodeptr points to the errorcode variable (containing zero)
options the current options bits
extra_options the current extra options bits
				1492	isclass TRUE if inside a character class
				1493	cb compile data block or NULL when called from pcre2_substitute()
				1494
				1495	Returns: zero => a data character
				1496	positive => a special escape sequence
				1497	negative => a numerical back reference
				1498	on error, errorcodeptr is set non-zero
				1499	*/
				1500
				1501	int
				1502	PRIV(check_escape)(PCRE2_SPTR ptrptr, PCRE2_SPTR ptrend, uint32_t chptr,
				1503	int *errorcodeptr, uint32_t options, uint32_t extra_options, BOOL isclass,
				1504	compile_block *cb)
				1505	{
				1506	BOOL utf = (options & PCRE2_UTF) != 0;
				1507	PCRE2_SPTR ptr = *ptrptr;
				1508	uint32_t c, cc;
				1509	int escape = 0;
				1510	int i;
				1511
				1512	/* If backslash is at the end of the string, it's an error. */
				1513
				1514	if (ptr >= ptrend)
				1515	{
				1516	*errorcodeptr = ERR1;
				1517	return 0;
				1518	}
				1519
				1520	GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
				1521	errorcodeptr = 0; / Be optimistic */
				1522
				1523	/* Non-alphanumerics are literals, so we just leave the value in c. An initial
				1524	value test saves a memory lookup for code points outside the alphanumeric
				1525	range. */
				1526
				1527	if (c < ESCAPES_FIRST \|\| c > ESCAPES_LAST) {} /* Definitely literal */
				1528
				1529	/* Otherwise, do a table lookup. Non-zero values need little processing here. A
				1530	positive value is a literal value for something like \n. A negative value is
				1531	the negation of one of the ESC_ macros that is passed back for handling by the
				1532	calling function. Some extra checking is needed for \N because only \N{U+dddd}
				1533	is supported. If the value is zero, further processing is handled below. */
				1534
				1535	else if ((i = escapes[c - ESCAPES_FIRST]) != 0)
				1536	{
				1537	if (i > 0)
				1538	{
				1539	c = (uint32_t)i;
				1540	if (c == CHAR_CR && (extra_options & PCRE2_EXTRA_ESCAPED_CR_IS_LF) != 0)
				1541	c = CHAR_LF;
				1542	}
				1543	else /* Negative table entry */
				1544	{
				1545	escape = -i; /* Else return a special escape */
				1546	if (cb != NULL && (escape == ESC_P \|\| escape == ESC_p \|\| escape == ESC_X))
				1547	cb->external_flags \|= PCRE2_HASBKPORX; /* Note \P, \p, or \X */
				1548
				1549	/* Perl supports \N{name} for character names and \N{U+dddd} for numerical
				1550	Unicode code points, as well as plain \N for "not newline". PCRE does not
				1551	support \N{name}. However, it does support quantification such as \N{2,3},
				1552	so if \N{ is not followed by U+dddd we check for a quantifier. */
				1553
				1554	if (escape == ESC_N && ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
				1555	{
				1556	PCRE2_SPTR p = ptr + 1;
				1557
				1558	/* \N{U+ can be handled by the \x{ code. However, this construction is
				1559	not valid in EBCDIC environments because it specifies a Unicode
				1560	character, not a codepoint in the local code. For example \N{U+0041}
				1561	must be "A" in all environments. Also, in Perl, \N{U+ forces Unicode
				1562	casing semantics for the entire pattern, so allow it only in UTF (i.e.
				1563	Unicode) mode. */
				1564
				1565	if (ptrend - p > 1 && *p == CHAR_U && p[1] == CHAR_PLUS)
				1566	{
				1567	#ifdef EBCDIC
				1568	*errorcodeptr = ERR93;
				1569	#else
				1570	if (utf)
				1571	{
				1572	ptr = p + 1;
				1573	escape = 0; /* Not a fancy escape after all */
				1574	goto COME_FROM_NU;
				1575	}
				1576	else *errorcodeptr = ERR93;
				1577	#endif
				1578	}
				1579
				1580	/* Give an error if what follows is not a quantifier, but don't override
				1581	an error set by the quantifier reader (e.g. number overflow). */
				1582
				1583	else
				1584	{
				1585	if (!read_repeat_counts(&p, ptrend, NULL, NULL, errorcodeptr) &&
				1586	*errorcodeptr == 0)
				1587	*errorcodeptr = ERR37;
				1588	}
				1589	}
				1590	}
				1591	}
				1592
				1593	/* Escapes that need further processing, including those that are unknown, have
				1594	a zero entry in the lookup table. When called from pcre2_substitute(), only \c,
				1595	\o, and \x are recognized (\u and \U can never appear as they are used for case
				1596	forcing). */
				1597
				1598	else
				1599	{
				1600	int s;
				1601	PCRE2_SPTR oldptr;
				1602	BOOL overflow;
				1603	BOOL alt_bsux =
				1604	((options & PCRE2_ALT_BSUX) \| (extra_options & PCRE2_EXTRA_ALT_BSUX)) != 0;
				1605
				1606	/* Filter calls from pcre2_substitute(). */
				1607
				1608	if (cb == NULL)
				1609	{
				1610	if (c != CHAR_c && c != CHAR_o && c != CHAR_x)
				1611	{
				1612	*errorcodeptr = ERR3;
				1613	return 0;
				1614	}
				1615	alt_bsux = FALSE; /* Do not modify \x handling */
				1616	}
				1617
				1618	switch (c)
				1619	{
				1620	/* A number of Perl escapes are not handled by PCRE. We give an explicit
				1621	error. */
				1622
				1623	case CHAR_F:
				1624	case CHAR_l:
				1625	case CHAR_L:
				1626	*errorcodeptr = ERR37;
				1627	break;
				1628
				1629	/* \u is unrecognized when neither PCRE2_ALT_BSUX nor PCRE2_EXTRA_ALT_BSUX
				1630	is set. Otherwise, \u must be followed by exactly four hex digits or, if
				1631	PCRE2_EXTRA_ALT_BSUX is set, by any number of hex digits in braces.
				1632	Otherwise it is a lowercase u letter. This gives some compatibility with
				1633	ECMAScript (aka JavaScript). */
				1634
				1635	case CHAR_u:
				1636	if (!alt_bsux) *errorcodeptr = ERR37; else
				1637	{
				1638	uint32_t xc;
				1639
				1640	if (ptr >= ptrend) break;
				1641	if (*ptr == CHAR_LEFT_CURLY_BRACKET &&
				1642	(extra_options & PCRE2_EXTRA_ALT_BSUX) != 0)
				1643	{
				1644	PCRE2_SPTR hptr = ptr + 1;
				1645	cc = 0;
				1646
				1647	while (hptr < ptrend && (xc = XDIGIT(*hptr)) != 0xff)
				1648	{
				1649	if ((cc & 0xf0000000) != 0) /* Test for 32-bit overflow */
				1650	{
				1651	*errorcodeptr = ERR77;
				1652	ptr = hptr; /* Show where */
				1653	break; /* hptr != } will cause another break below /
				1654	}
				1655	cc = (cc << 4) \| xc;
				1656	hptr++;
				1657	}
				1658
				1659	if (hptr == ptr + 1 \|\| /* No hex digits */
				1660	hptr >= ptrend \|\| /* Hit end of input */
				1661	hptr != CHAR_RIGHT_CURLY_BRACKET) / No } terminator */
				1662	break; /* Hex escape not recognized */
				1663
				1664	c = cc; /* Accept the code point */
				1665	ptr = hptr + 1;
				1666	}
				1667
				1668	else /* Must be exactly 4 hex digits */
				1669	{
				1670	if (ptrend - ptr < 4) break; /* Less than 4 chars */
				1671	if ((cc = XDIGIT(ptr[0])) == 0xff) break; /* Not a hex digit */
				1672	if ((xc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */
				1673	cc = (cc << 4) \| xc;
				1674	if ((xc = XDIGIT(ptr[2])) == 0xff) break; /* Not a hex digit */
				1675	cc = (cc << 4) \| xc;
				1676	if ((xc = XDIGIT(ptr[3])) == 0xff) break; /* Not a hex digit */
				1677	c = (cc << 4) \| xc;
				1678	ptr += 4;
				1679	}
				1680
				1681	if (utf)
				1682	{
				1683	if (c > 0x10ffffU) *errorcodeptr = ERR77;
				1684	else
				1685	if (c >= 0xd800 && c <= 0xdfff &&
				1686	(extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
				1687	*errorcodeptr = ERR73;
				1688	}
				1689	else if (c > MAX_NON_UTF_CHAR) *errorcodeptr = ERR77;
				1690	}
				1691	break;
				1692
				1693	/* \U is unrecognized unless PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set,
				1694	in which case it is an upper case letter. */
				1695
				1696	case CHAR_U:
				1697	if (!alt_bsux) *errorcodeptr = ERR37;
				1698	break;
				1699
				1700	/* In a character class, \g is just a literal "g". Outside a character
				1701	class, \g must be followed by one of a number of specific things:
				1702
				1703	(1) A number, either plain or braced. If positive, it is an absolute
				1704	backreference. If negative, it is a relative backreference. This is a Perl
				1705	5.10 feature.
				1706
				1707	(2) Perl 5.10 also supports \g{name} as a reference to a named group. This
				1708	is part of Perl's movement towards a unified syntax for back references. As
				1709	this is synonymous with \k{name}, we fudge it up by pretending it really
				1710	was \k{name}.
				1711
				1712	(3) For Oniguruma compatibility we also support \g followed by a name or a
				1713	number either in angle brackets or in single quotes. However, these are
				1714	(possibly recursive) subroutine calls, _not_ backreferences. We return
				1715	the ESC_g code.
				1716
				1717	Summary: Return a negative number for a numerical back reference, ESC_k for
				1718	a named back reference, and ESC_g for a named or numbered subroutine call.
				1719	*/
				1720
				1721	case CHAR_g:
				1722	if (isclass) break;
				1723
				1724	if (ptr >= ptrend)
				1725	{
				1726	*errorcodeptr = ERR57;
				1727	break;
				1728	}
				1729
				1730	if (ptr == CHAR_LESS_THAN_SIGN \|\| ptr == CHAR_APOSTROPHE)
				1731	{
				1732	escape = ESC_g;
				1733	break;
				1734	}
				1735
				1736	/* If there is a brace delimiter, try to read a numerical reference. If
				1737	there isn't one, assume we have a name and treat it as \k. */
				1738
				1739	if (*ptr == CHAR_LEFT_CURLY_BRACKET)
				1740	{
				1741	PCRE2_SPTR p = ptr + 1;
				1742	if (!read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &s,
				1743	errorcodeptr))
				1744	{
				1745	if (errorcodeptr == 0) escape = ESC_k; / No number found */
				1746	break;
				1747	}
				1748	if (p >= ptrend \|\| *p != CHAR_RIGHT_CURLY_BRACKET)
				1749	{
				1750	*errorcodeptr = ERR57;
				1751	break;
				1752	}
				1753	ptr = p + 1;
				1754	}
				1755
				1756	/* Read an undelimited number */
				1757
				1758	else
				1759	{
				1760	if (!read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &s,
				1761	errorcodeptr))
				1762	{
				1763	if (errorcodeptr == 0) errorcodeptr = ERR57; /* No number found */
				1764	break;
				1765	}
				1766	}
				1767
				1768	if (s <= 0)
				1769	{
				1770	*errorcodeptr = ERR15;
				1771	break;
				1772	}
				1773
				1774	escape = -s;
				1775	break;
				1776
				1777	/* The handling of escape sequences consisting of a string of digits
				1778	starting with one that is not zero is not straightforward. Perl has changed
				1779	over the years. Nowadays \g{} for backreferences and \o{} for octal are
				1780	recommended to avoid the ambiguities in the old syntax.
				1781
				1782	Outside a character class, the digits are read as a decimal number. If the
				1783	number is less than 10, or if there are that many previous extracting left
				1784	brackets, it is a back reference. Otherwise, up to three octal digits are
				1785	read to form an escaped character code. Thus \123 is likely to be octal 123
				1786	(cf \0123, which is octal 012 followed by the literal 3).
				1787
				1788	Inside a character class, \ followed by a digit is always either a literal
				1789	8 or 9 or an octal number. */
				1790
				1791	case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
				1792	case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
				1793
				1794	if (!isclass)
				1795	{
				1796	oldptr = ptr;
				1797	ptr--; /* Back to the digit */
				1798
				1799	/* As we know we are at a digit, the only possible error from
				1800	read_number() is a number that is too large to be a group number. In this
				1801	case we fall through handle this as not a group reference. If we have
				1802	read a small enough number, check for a back reference.
				1803
				1804	\1 to \9 are always back references. \8x and \9x are too; \1x to \7x
				1805	are octal escapes if there are not that many previous captures. */
				1806
				1807	if (read_number(&ptr, ptrend, -1, INT_MAX/10 - 1, 0, &s, errorcodeptr) &&
				1808	(s < 10 \|\| oldptr[-1] >= CHAR_8 \|\| s <= (int)cb->bracount))
				1809	{
				1810	if (s > (int)MAX_GROUP_NUMBER) *errorcodeptr = ERR61;
				1811	else escape = -s; /* Indicates a back reference */
				1812	break;
				1813	}
				1814
				1815	ptr = oldptr; /* Put the pointer back and fall through */
				1816	}
				1817
				1818	/* Handle a digit following \ when the number is not a back reference, or
				1819	we are within a character class. If the first digit is 8 or 9, Perl used to
				1820	generate a binary zero and then treat the digit as a following literal. At
				1821	least by Perl 5.18 this changed so as not to insert the binary zero. */
				1822
				1823	if (c >= CHAR_8) break;
				1824
				1825	/* Fall through */
				1826
				1827	/* \0 always starts an octal number, but we may drop through to here with a
				1828	larger first octal digit. The original code used just to take the least
				1829	significant 8 bits of octal numbers (I think this is what early Perls used
				1830	to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
				1831	but no more than 3 octal digits. */
				1832
				1833	case CHAR_0:
				1834	c -= CHAR_0;
				1835	while(i++ < 2 && ptr < ptrend && ptr >= CHAR_0 && ptr <= CHAR_7)
				1836	c = c * 8 + *ptr++ - CHAR_0;
				1837	#if PCRE2_CODE_UNIT_WIDTH == 8
				1838	if (!utf && c > 0xff) *errorcodeptr = ERR51;
				1839	#endif
				1840	break;
				1841
				1842	/* \o is a relatively new Perl feature, supporting a more general way of
				1843	specifying character codes in octal. The only supported form is \o{ddd}. */
				1844
				1845	case CHAR_o:
				1846	if (ptr >= ptrend \|\| *ptr++ != CHAR_LEFT_CURLY_BRACKET)
				1847	{
				1848	ptr--;
				1849	*errorcodeptr = ERR55;
				1850	}
				1851	else if (ptr >= ptrend \|\| *ptr == CHAR_RIGHT_CURLY_BRACKET)
				1852	*errorcodeptr = ERR78;
				1853	else
				1854	{
				1855	c = 0;
				1856	overflow = FALSE;
				1857	while (ptr < ptrend && ptr >= CHAR_0 && ptr <= CHAR_7)
				1858	{
				1859	cc = *ptr++;
				1860	if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
				1861	#if PCRE2_CODE_UNIT_WIDTH == 32
				1862	if (c >= 0x20000000l) { overflow = TRUE; break; }
				1863	#endif
				1864	c = (c << 3) + (cc - CHAR_0);
				1865	#if PCRE2_CODE_UNIT_WIDTH == 8
				1866	if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
				1867	#elif PCRE2_CODE_UNIT_WIDTH == 16
				1868	if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
				1869	#elif PCRE2_CODE_UNIT_WIDTH == 32
				1870	if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
				1871	#endif
				1872	}
				1873	if (overflow)
				1874	{
				1875	while (ptr < ptrend && ptr >= CHAR_0 && ptr <= CHAR_7) ptr++;
				1876	*errorcodeptr = ERR34;
				1877	}
				1878	else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
				1879	{
				1880	if (utf && c >= 0xd800 && c <= 0xdfff &&
				1881	(extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
				1882	{
				1883	ptr--;
				1884	*errorcodeptr = ERR73;
				1885	}
				1886	}
				1887	else
				1888	{
				1889	ptr--;
				1890	*errorcodeptr = ERR64;
				1891	}
				1892	}
				1893	break;
				1894
				1895	/* When PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set, \x must be followed
				1896	by two hexadecimal digits. Otherwise it is a lowercase x letter. */
				1897
				1898	case CHAR_x:
				1899	if (alt_bsux)
				1900	{
				1901	uint32_t xc;
				1902	if (ptrend - ptr < 2) break; /* Less than 2 characters */
				1903	if ((cc = XDIGIT(ptr[0])) == 0xff) break; /* Not a hex digit */
				1904	if ((xc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */
				1905	c = (cc << 4) \| xc;
				1906	ptr += 2;
				1907	}
				1908
				1909	/* Handle \x in Perl's style. \x{ddd} is a character code which can be
				1910	greater than 0xff in UTF-8 or non-8bit mode, but only if the ddd are hex
				1911	digits. If not, { used to be treated as a data character. However, Perl
				1912	seems to read hex digits up to the first non-such, and ignore the rest, so
				1913	that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
				1914	now gives an error. */
				1915
				1916	else
				1917	{
				1918	if (ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
				1919	{
				1920	#ifndef EBCDIC
				1921	COME_FROM_NU:
				1922	#endif
				1923	if (++ptr >= ptrend \|\| *ptr == CHAR_RIGHT_CURLY_BRACKET)
				1924	{
				1925	*errorcodeptr = ERR78;
				1926	break;
				1927	}
				1928	c = 0;
				1929	overflow = FALSE;
				1930
				1931	while (ptr < ptrend && (cc = XDIGIT(*ptr)) != 0xff)
				1932	{
				1933	ptr++;
				1934	if (c == 0 && cc == 0) continue; /* Leading zeroes */
				1935	#if PCRE2_CODE_UNIT_WIDTH == 32
				1936	if (c >= 0x10000000l) { overflow = TRUE; break; }
				1937	#endif
				1938	c = (c << 4) \| cc;
				1939	if ((utf && c > 0x10ffffU) \|\| (!utf && c > MAX_NON_UTF_CHAR))
				1940	{
				1941	overflow = TRUE;
				1942	break;
				1943	}
				1944	}
				1945
				1946	if (overflow)
				1947	{
				1948	while (ptr < ptrend && XDIGIT(*ptr) != 0xff) ptr++;
				1949	*errorcodeptr = ERR34;
				1950	}
				1951	else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
				1952	{
				1953	if (utf && c >= 0xd800 && c <= 0xdfff &&
				1954	(extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
				1955	{
				1956	ptr--;
				1957	*errorcodeptr = ERR73;
				1958	}
				1959	}
				1960
				1961	/* If the sequence of hex digits does not end with '}', give an error.
				1962	We used just to recognize this construct and fall through to the normal
				1963	\x handling, but nowadays Perl gives an error, which seems much more
				1964	sensible, so we do too. */
				1965
				1966	else
				1967	{
				1968	ptr--;
				1969	*errorcodeptr = ERR67;
				1970	}
				1971	} /* End of \x{} processing */
				1972
				1973	/* Read a up to two hex digits after \x */
				1974
				1975	else
				1976	{
				1977	c = 0;
				1978	if (ptr >= ptrend \|\| (cc = XDIGIT(ptr)) == 0xff) break; / Not a hex digit */
				1979	ptr++;
				1980	c = cc;
				1981	if (ptr >= ptrend \|\| (cc = XDIGIT(ptr)) == 0xff) break; / Not a hex digit */
				1982	ptr++;
				1983	c = (c << 4) \| cc;
				1984	} /* End of \xdd handling */
				1985	} /* End of Perl-style \x handling */
				1986	break;
				1987
				1988	/* The handling of \c is different in ASCII and EBCDIC environments. In an
				1989	ASCII (or Unicode) environment, an error is given if the character
				1990	following \c is not a printable ASCII character. Otherwise, the following
				1991	character is upper-cased if it is a letter, and after that the 0x40 bit is
				1992	flipped. The result is the value of the escape.
				1993
				1994	In an EBCDIC environment the handling of \c is compatible with the
				1995	specification in the perlebcdic document. The following character must be
				1996	a letter or one of small number of special characters. These provide a
				1997	means of defining the character values 0-31.
				1998
				1999	For testing the EBCDIC handling of \c in an ASCII environment, recognize
				2000	the EBCDIC value of 'c' explicitly. */
				2001
				2002	#if defined EBCDIC && 'a' != 0x81
				2003	case 0x83:
				2004	#else
				2005	case CHAR_c:
				2006	#endif
				2007	if (ptr >= ptrend)
				2008	{
				2009	*errorcodeptr = ERR2;
				2010	break;
				2011	}
				2012	c = *ptr;
				2013	if (c >= CHAR_a && c <= CHAR_z) c = UPPER_CASE(c);
				2014
				2015	/* Handle \c in an ASCII/Unicode environment. */
				2016
				2017	#ifndef EBCDIC /* ASCII/UTF-8 coding */
				2018	if (c < 32 \|\| c > 126) /* Excludes all non-printable ASCII */
				2019	{
				2020	*errorcodeptr = ERR68;
				2021	break;
				2022	}
				2023	c ^= 0x40;
				2024
				2025	/* Handle \c in an EBCDIC environment. The special case \c? is converted to
				2026	255 (0xff) or 95 (0x5f) if other characters suggest we are using the
				2027	POSIX-BC encoding. (This is the way Perl indicates that it handles \c?.)
				2028	The other valid sequences correspond to a list of specific characters. */
				2029
				2030	#else
				2031	if (c == CHAR_QUESTION_MARK)
				2032	c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff;
				2033	else
				2034	{
				2035	for (i = 0; i < 32; i++)
				2036	{
				2037	if (c == ebcdic_escape_c[i]) break;
				2038	}
				2039	if (i < 32) c = i; else *errorcodeptr = ERR68;
				2040	}
				2041	#endif /* EBCDIC */
				2042
				2043	ptr++;
				2044	break;
				2045
				2046	/* Any other alphanumeric following \ is an error. Perl gives an error only
				2047	if in warning mode, but PCRE doesn't have a warning mode. */
				2048
				2049	default:
				2050	*errorcodeptr = ERR3;
				2051	ptrptr = ptr - 1; / Point to the character at fault */
				2052	return 0;
				2053	}
				2054	}
				2055
				2056	/* Set the pointer to the next character before returning. */
				2057
				2058	*ptrptr = ptr;
				2059	*chptr = c;
				2060	return escape;
				2061	}
				2062
				2063
				2064
				2065	#ifdef SUPPORT_UNICODE
				2066	/*************************************************
				2067	* Handle \P and \p *
				2068	*************************************************/
				2069
				2070	/* This function is called after \P or \p has been encountered, provided that
				2071	PCRE2 is compiled with support for UTF and Unicode properties. On entry, the
				2072	contents of ptrptr are pointing after the P or p. On exit, it is left pointing
				2073	after the final code unit of the escape sequence.
				2074
				2075	Arguments:
				2076	ptrptr the pattern position pointer
				2077	negptr a boolean that is set TRUE for negation else FALSE
				2078	ptypeptr an unsigned int that is set to the type value
				2079	pdataptr an unsigned int that is set to the detailed property value
				2080	errorcodeptr the error code variable
				2081	cb the compile data
				2082
				2083	Returns: TRUE if the type value was found, or FALSE for an invalid type
				2084	*/
				2085
				2086	static BOOL
				2087	get_ucp(PCRE2_SPTR ptrptr, BOOL negptr, uint16_t *ptypeptr,
				2088	uint16_t pdataptr, int errorcodeptr, compile_block *cb)
				2089	{
				2090	PCRE2_UCHAR c;
				2091	PCRE2_SIZE i, bot, top;
				2092	PCRE2_SPTR ptr = *ptrptr;
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	2093	PCRE2_UCHAR name[50];
				2094	PCRE2_UCHAR *vptr = NULL;
				2095	uint16_t ptscript = PT_NOTSCRIPT;
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	2096
				2097	if (ptr >= cb->end_pattern) goto ERROR_RETURN;
				2098	c = *ptr++;
				2099	*negptr = FALSE;
				2100
				2101	/* \P or \p can be followed by a name in {}, optionally preceded by ^ for
				2102	negation. */
				2103
				2104	if (c == CHAR_LEFT_CURLY_BRACKET)
				2105	{
				2106	if (ptr >= cb->end_pattern) goto ERROR_RETURN;
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	2107
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	2108	if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
				2109	{
				2110	*negptr = TRUE;
				2111	ptr++;
				2112	}
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	2113
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	2114	for (i = 0; i < (int)(sizeof(name) / sizeof(PCRE2_UCHAR)) - 1; i++)
				2115	{
				2116	if (ptr >= cb->end_pattern) goto ERROR_RETURN;
				2117	c = *ptr++;
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	2118	while (c == '_' \|\| c == '-' \|\| isspace(c))
				2119	{
				2120	if (ptr >= cb->end_pattern) goto ERROR_RETURN;
				2121	c = *ptr++;
				2122	}
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	2123	if (c == CHAR_NUL) goto ERROR_RETURN;
				2124	if (c == CHAR_RIGHT_CURLY_BRACKET) break;
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	2125	name[i] = tolower(c);
				2126	if ((c == ':' \|\| c == '=') && vptr == NULL) vptr = name + i;
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	2127	}
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	2128
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	2129	if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
				2130	name[i] = 0;
				2131	}
				2132
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	2133	/* If { doesn't follow \p or \P there is just one following character, which
				2134	must be an ASCII letter. */
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	2135
				2136	else if (MAX_255(c) && (cb->ctypes[c] & ctype_letter) != 0)
				2137	{
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	2138	name[0] = tolower(c);
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	2139	name[1] = 0;
				2140	}
				2141	else goto ERROR_RETURN;
				2142
				2143	*ptrptr = ptr;
				2144
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	2145	/* If the property contains ':' or '=' we have class name and value separately
				2146	specified. The following are supported:
				2147
				2148	. Bidi_Class (synonym bc), for which the property names are "bidi<name>".
				2149	. Script (synonym sc) for which the property name is the script name
				2150	. Script_Extensions (synonym scx), ditto
				2151
				2152	As this is a small number, we currently just check the names directly. If this
				2153	grows, a sorted table and a switch will be neater.
				2154
				2155	For both the script properties, set a PT_xxx value so that (1) they can be
				2156	distinguished and (2) invalid script names that happen to be the name of
				2157	another property can be diagnosed. */
				2158
				2159	if (vptr != NULL)
				2160	{
				2161	int offset = 0;
				2162	PCRE2_UCHAR sname[8];
				2163
				2164	vptr = 0; / Terminate property name */
				2165	if (PRIV(strcmp_c8)(name, STRING_bidiclass) == 0 \|\|
				2166	PRIV(strcmp_c8)(name, STRING_bc) == 0)
				2167	{
				2168	offset = 4;
				2169	sname[0] = CHAR_b;
				2170	sname[1] = CHAR_i; /* There is no strcpy_c8 function */
				2171	sname[2] = CHAR_d;
				2172	sname[3] = CHAR_i;
				2173	}
				2174
				2175	else if (PRIV(strcmp_c8)(name, STRING_script) == 0 \|\|
				2176	PRIV(strcmp_c8)(name, STRING_sc) == 0)
				2177	ptscript = PT_SC;
				2178
				2179	else if (PRIV(strcmp_c8)(name, STRING_scriptextensions) == 0 \|\|
				2180	PRIV(strcmp_c8)(name, STRING_scx) == 0)
				2181	ptscript = PT_SCX;
				2182
				2183	else
				2184	{
				2185	*errorcodeptr = ERR47;
				2186	return FALSE;
				2187	}
				2188
				2189	/* Adjust the string in name[] as needed */
				2190
				2191	memmove(name + offset, vptr + 1, (name + i - vptr)*sizeof(PCRE2_UCHAR));
				2192	if (offset != 0) memmove(name, sname, offset*sizeof(PCRE2_UCHAR));
				2193	}
				2194
				2195	/* Search for a recognized property using binary chop. */
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	2196
				2197	bot = 0;
				2198	top = PRIV(utt_size);
				2199
				2200	while (bot < top)
				2201	{
				2202	int r;
				2203	i = (bot + top) >> 1;
				2204	r = PRIV(strcmp_c8)(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	2205
				2206	/* When a matching property is found, some extra checking is needed when the
				2207	\p{xx:yy} syntax is used and xx is either sc or scx. */
				2208
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	2209	if (r == 0)
				2210	{
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	2211	*pdataptr = PRIV(utt)[i].value;
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	2212	if (vptr == NULL \|\| ptscript == PT_NOTSCRIPT)
				2213	{
				2214	*ptypeptr = PRIV(utt)[i].type;
				2215	return TRUE;
				2216	}
				2217
				2218	switch (PRIV(utt)[i].type)
				2219	{
				2220	case PT_SC:
				2221	*ptypeptr = PT_SC;
				2222	return TRUE;
				2223
				2224	case PT_SCX:
				2225	*ptypeptr = ptscript;
				2226	return TRUE;
				2227	}
				2228
				2229	break; /* Non-script found */
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	2230	}
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	2231
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	2232	if (r > 0) bot = i + 1; else top = i;
				2233	}
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	2234
				2235	errorcodeptr = ERR47; / Unrecognized property */
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	2236	return FALSE;
				2237
				2238	ERROR_RETURN: /* Malformed \P or \p */
				2239	*errorcodeptr = ERR46;
				2240	*ptrptr = ptr;
				2241	return FALSE;
				2242	}
				2243	#endif
				2244
				2245
				2246
				2247	/*************************************************
				2248	* Check for POSIX class syntax *
				2249	*************************************************/
				2250
				2251	/* This function is called when the sequence "[:" or "[." or "[=" is
				2252	encountered in a character class. It checks whether this is followed by a
				2253	sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
				2254	reach an unescaped ']' without the special preceding character, return FALSE.
				2255
				2256	Originally, this function only recognized a sequence of letters between the
				2257	terminators, but it seems that Perl recognizes any sequence of characters,
				2258	though of course unknown POSIX names are subsequently rejected. Perl gives an
				2259	"Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
				2260	didn't consider this to be a POSIX class. Likewise for [:1234:].
				2261
				2262	The problem in trying to be exactly like Perl is in the handling of escapes. We
				2263	have to be sure that [abc[:x\]pqr] is not treated as containing a POSIX
				2264	class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
				2265	below handles the special cases \\ and \], but does not try to do any other
				2266	escape processing. This makes it different from Perl for cases such as
				2267	[:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does
				2268	not recognize "l\ower". This is a lesser evil than not diagnosing bad classes
				2269	when Perl does, I think.
				2270
				2271	A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
				2272	It seems that the appearance of a nested POSIX class supersedes an apparent
				2273	external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
				2274	a digit. This is handled by returning FALSE if the start of a new group with
				2275	the same terminator is encountered, since the next closing sequence must close
				2276	the nested group, not the outer one.
				2277
				2278	In Perl, unescaped square brackets may also appear as part of class names. For
				2279	example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
				2280	[:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
				2281	seem right at all. PCRE does not allow closing square brackets in POSIX class
				2282	names.
				2283
				2284	Arguments:
				2285	ptr pointer to the character after the initial [ (colon, dot, equals)
				2286	ptrend pointer to the end of the pattern
				2287	endptr where to return a pointer to the terminating ':', '.', or '='
				2288
				2289	Returns: TRUE or FALSE
				2290	*/
				2291
				2292	static BOOL
				2293	check_posix_syntax(PCRE2_SPTR ptr, PCRE2_SPTR ptrend, PCRE2_SPTR *endptr)
				2294	{
				2295	PCRE2_UCHAR terminator; /* Don't combine these lines; the Solaris cc */
				2296	terminator = ptr++; / compiler warns about "non-constant" initializer. */
				2297
				2298	for (; ptrend - ptr >= 2; ptr++)
				2299	{
				2300	if (*ptr == CHAR_BACKSLASH &&
				2301	(ptr[1] == CHAR_RIGHT_SQUARE_BRACKET \|\| ptr[1] == CHAR_BACKSLASH))
				2302	ptr++;
				2303
				2304	else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) \|\|
				2305	*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
				2306
				2307	else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
				2308	{
				2309	*endptr = ptr;
				2310	return TRUE;
				2311	}
				2312	}
				2313
				2314	return FALSE;
				2315	}
				2316
				2317
				2318
				2319	/*************************************************
				2320	* Check POSIX class name *
				2321	*************************************************/
				2322
				2323	/* This function is called to check the name given in a POSIX-style class entry
				2324	such as [:alnum:].
				2325
				2326	Arguments:
				2327	ptr points to the first letter
				2328	len the length of the name
				2329
				2330	Returns: a value representing the name, or -1 if unknown
				2331	*/
				2332
				2333	static int
				2334	check_posix_name(PCRE2_SPTR ptr, int len)
				2335	{
				2336	const char *pn = posix_names;
				2337	int yield = 0;
				2338	while (posix_name_lengths[yield] != 0)
				2339	{
				2340	if (len == posix_name_lengths[yield] &&
				2341	PRIV(strncmp_c8)(ptr, pn, (unsigned int)len) == 0) return yield;
				2342	pn += posix_name_lengths[yield] + 1;
				2343	yield++;
				2344	}
				2345	return -1;
				2346	}
				2347
				2348
				2349
				2350	/*************************************************
				2351	* Read a subpattern or VERB name *
				2352	*************************************************/
				2353
				2354	/* This function is called from parse_regex() below whenever it needs to read
				2355	the name of a subpattern or a (VERB) or an (alpha_assertion). The initial
				2356	pointer must be to the character before the name. If that character is '*' we
				2357	are reading a verb or alpha assertion name. The pointer is updated to point
				2358	after the name, for a VERB or alpha assertion name, or after the name's
				2359	terminator for a subpattern name. Returning both the offset and the name
				2360	pointer is redundant information, but some callers use one and some the other,
				2361	so it is simplest just to return both.
				2362
				2363	Arguments:
				2364	ptrptr points to the character pointer variable
				2365	ptrend points to the end of the input string
				2366	utf true if the input is UTF-encoded
				2367	terminator the terminator of a subpattern name must be this
				2368	offsetptr where to put the offset from the start of the pattern
				2369	nameptr where to put a pointer to the name in the input
				2370	namelenptr where to put the length of the name
				2371	errcodeptr where to put an error code
				2372	cb pointer to the compile data block
				2373
				2374	Returns: TRUE if a name was read
				2375	FALSE otherwise, with error code set
				2376	*/
				2377
				2378	static BOOL
				2379	read_name(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, BOOL utf, uint32_t terminator,
				2380	PCRE2_SIZE offsetptr, PCRE2_SPTR nameptr, uint32_t *namelenptr,
				2381	int errorcodeptr, compile_block cb)
				2382	{
				2383	PCRE2_SPTR ptr = *ptrptr;
				2384	BOOL is_group = (*ptr != CHAR_ASTERISK);
				2385
				2386	if (++ptr >= ptrend) /* No characters in name */
				2387	{
				2388	errorcodeptr = is_group? ERR62: / Subpattern name expected */
				2389	ERR60; /* Verb not recognized or malformed */
				2390	goto FAILED;
				2391	}
				2392
				2393	*nameptr = ptr;
				2394	*offsetptr = (PCRE2_SIZE)(ptr - cb->start_pattern);
				2395
				2396	/* In UTF mode, a group name may contain letters and decimal digits as defined
				2397	by Unicode properties, and underscores, but must not start with a digit. */
				2398
				2399	#ifdef SUPPORT_UNICODE
				2400	if (utf && is_group)
				2401	{
				2402	uint32_t c, type;
				2403
				2404	GETCHAR(c, ptr);
				2405	type = UCD_CHARTYPE(c);
				2406
				2407	if (type == ucp_Nd)
				2408	{
				2409	*errorcodeptr = ERR44;
				2410	goto FAILED;
				2411	}
				2412
				2413	for(;;)
				2414	{
				2415	if (type != ucp_Nd && PRIV(ucp_gentype)[type] != ucp_L &&
				2416	c != CHAR_UNDERSCORE) break;
				2417	ptr++;
				2418	FORWARDCHARTEST(ptr, ptrend);
				2419	if (ptr >= ptrend) break;
				2420	GETCHAR(c, ptr);
				2421	type = UCD_CHARTYPE(c);
				2422	}
				2423	}
				2424	else
				2425	#else
				2426	(void)utf; /* Avoid compiler warning */
				2427	#endif /* SUPPORT_UNICODE */
				2428
				2429	/* Handle non-group names and group names in non-UTF modes. A group name must
				2430	not start with a digit. If either of the others start with a digit it just
				2431	won't be recognized. */
				2432
				2433	{
				2434	if (is_group && IS_DIGIT(*ptr))
				2435	{
				2436	*errorcodeptr = ERR44;
				2437	goto FAILED;
				2438	}
				2439
				2440	while (ptr < ptrend && MAX_255(ptr) && (cb->ctypes[ptr] & ctype_word) != 0)
				2441	{
				2442	ptr++;
				2443	}
				2444	}
				2445
				2446	/* Check name length */
				2447
				2448	if (ptr > *nameptr + MAX_NAME_SIZE)
				2449	{
				2450	*errorcodeptr = ERR48;
				2451	goto FAILED;
				2452	}
				2453	namelenptr = (uint32_t)(ptr - nameptr);
				2454
				2455	/* Subpattern names must not be empty, and their terminator is checked here.
				2456	(What follows a verb or alpha assertion name is checked separately.) */
				2457
				2458	if (is_group)
				2459	{
				2460	if (ptr == *nameptr)
				2461	{
				2462	errorcodeptr = ERR62; / Subpattern name expected */
				2463	goto FAILED;
				2464	}
				2465	if (ptr >= ptrend \|\| *ptr != (PCRE2_UCHAR)terminator)
				2466	{
				2467	*errorcodeptr = ERR42;
				2468	goto FAILED;
				2469	}
				2470	ptr++;
				2471	}
				2472
				2473	*ptrptr = ptr;
				2474	return TRUE;
				2475
				2476	FAILED:
				2477	*ptrptr = ptr;
				2478	return FALSE;
				2479	}
				2480
				2481
				2482
				2483	/*************************************************
				2484	* Manage callouts at start of cycle *
				2485	*************************************************/
				2486
				2487	/* At the start of a new item in parse_regex() we are able to record the
				2488	details of the previous item in a prior callout, and also to set up an
				2489	automatic callout if enabled. Avoid having two adjacent automatic callouts,
				2490	which would otherwise happen for items such as \Q that contribute nothing to
				2491	the parsed pattern.
				2492
				2493	Arguments:
				2494	ptr current pattern pointer
				2495	pcalloutptr points to a pointer to previous callout, or NULL
				2496	auto_callout TRUE if auto_callouts are enabled
				2497	parsed_pattern the parsed pattern pointer
				2498	cb compile block
				2499
				2500	Returns: possibly updated parsed_pattern pointer.
				2501	*/
				2502
				2503	static uint32_t *
				2504	manage_callouts(PCRE2_SPTR ptr, uint32_t **pcalloutptr, BOOL auto_callout,
				2505	uint32_t parsed_pattern, compile_block cb)
				2506	{
				2507	uint32_t previous_callout = pcalloutptr;
				2508
				2509	if (previous_callout != NULL) previous_callout[2] = (uint32_t)(ptr -
				2510	cb->start_pattern - (PCRE2_SIZE)previous_callout[1]);
				2511
				2512	if (!auto_callout) previous_callout = NULL; else
				2513	{
				2514	if (previous_callout == NULL \|\|
				2515	previous_callout != parsed_pattern - 4 \|\|
				2516	previous_callout[3] != 255)
				2517	{
				2518	previous_callout = parsed_pattern; /* Set up new automatic callout */
				2519	parsed_pattern += 4;
				2520	previous_callout[0] = META_CALLOUT_NUMBER;
				2521	previous_callout[2] = 0;
				2522	previous_callout[3] = 255;
				2523	}
				2524	previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
				2525	}
				2526
				2527	*pcalloutptr = previous_callout;
				2528	return parsed_pattern;
				2529	}
				2530
				2531
				2532
				2533	/*************************************************
				2534	* Parse regex and identify named groups *
				2535	*************************************************/
				2536
				2537	/* This function is called first of all. It scans the pattern and does two
				2538	things: (1) It identifies capturing groups and makes a table of named capturing
				2539	groups so that information about them is fully available to both the compiling
				2540	scans. (2) It writes a parsed version of the pattern with comments omitted and
				2541	escapes processed into the parsed_pattern vector.
				2542
				2543	Arguments:
				2544	ptr points to the start of the pattern
				2545	options compiling dynamic options (may change during the scan)
				2546	has_lookbehind points to a boolean, set TRUE if a lookbehind is found
				2547	cb pointer to the compile data block
				2548
				2549	Returns: zero on success or a non-zero error code, with the
				2550	error offset placed in the cb field
				2551	*/
				2552
				2553	/* A structure and some flags for dealing with nested groups. */
				2554
				2555	typedef struct nest_save { /* One saved record for a nested group. parse_regex()
					keeps pointers of this type (top_nest, end_nests) into the compile
					workspace, with end_nests marking the limit of the usable space. */
				2556	uint16_t nest_depth; /* presumably the nesting depth the record belongs to - TODO confirm */
				2557	uint16_t reset_group; /* NOTE(review): looks like a saved group number - confirm against users */
				2558	uint16_t max_group; /* NOTE(review): looks like a highest group number seen - confirm against users */
				2559	uint16_t flags; /* presumably a combination of the NSF_ bits defined below - TODO confirm */
				2560	uint32_t options; /* presumably the option bits saved for the group - TODO confirm */
				2561	} nest_save;
				2562
				2563	#define NSF_RESET 0x0001u /* Distinct single-bit values; presumably held in nest_save.flags - TODO confirm */
				2564	#define NSF_CONDASSERT 0x0002u /* NOTE(review): name suggests a conditional assertion group - confirm */
				2565	#define NSF_ATOMICSR 0x0004u /* NOTE(review): meaning not visible in this part of the file - confirm */
				2566
				/* Options that are changeable within the pattern must be tracked during
				parsing. Some (e.g. PCRE2_EXTENDED) are implemented entirely during parsing,
				but all must be tracked so that META_OPTIONS items set the correct values for
				the main compiling phase. The mask below is the bitwise OR of all of them. */

				#define PARSE_TRACKED_OPTIONS (PCRE2_CASELESS|PCRE2_DOTALL|PCRE2_DUPNAMES| \
				  PCRE2_EXTENDED|PCRE2_EXTENDED_MORE|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE| \
				  PCRE2_UNGREEDY)
				2575
				2576	/* States used for analyzing ranges in character classes. The two OK values
				2577	must be last. */
				2578
				2579	enum { RANGE_NO, RANGE_STARTED, RANGE_OK_ESCAPED, RANGE_OK_LITERAL }; /* Values run
					from 0 upwards in the order listed, so the two OK states are the
					largest. NOTE(review): presumably that lets code test for either OK
					state with a single >= comparison, and parse_regex() presumably keeps
					the current state in class_range_state - confirm both. */
				2580
				/* Only in 32-bit mode can there be literals > META_END. A macro encapsulates
				the storing of literal values in the main parsed pattern, where they can always
				be quantified.

				Both variants are wrapped in do { ... } while (0) so that an invocation
				followed by a semicolon is exactly one statement, and is therefore safe as the
				body of an unbraced if or before an else. The macro stores through p and
				advances it, and sets the variable okquantifier in the invoking scope. In the
				32-bit variant c is evaluated more than once, so it must not have side
				effects. */

				#if PCRE2_CODE_UNIT_WIDTH == 32
				#define PARSED_LITERAL(c, p) \
				  do \
				    { \
				    if ((c) >= META_END) *(p)++ = META_BIGVALUE; \
				    *(p)++ = (c); \
				    okquantifier = TRUE; \
				    } \
				  while (0)
				#else
				#define PARSED_LITERAL(c, p) \
				  do \
				    { \
				    *(p)++ = (c); \
				    okquantifier = TRUE; \
				    } \
				  while (0)
				#endif
				2595
				2596	/* Here's the actual function. */
				2597
				2598	static int parse_regex(PCRE2_SPTR ptr, uint32_t options, BOOL *has_lookbehind,
				2599	compile_block *cb)
				2600	{
				2601	uint32_t c;
				2602	uint32_t delimiter;
				2603	uint32_t namelen;
				2604	uint32_t class_range_state;
				2605	uint32_t verblengthptr = NULL; / Value avoids compiler warning */
				2606	uint32_t *verbstartptr = NULL;
				2607	uint32_t *previous_callout = NULL;
				2608	uint32_t *parsed_pattern = cb->parsed_pattern;
				2609	uint32_t *parsed_pattern_end = cb->parsed_pattern_end;
				2610	uint32_t meta_quantifier = 0;
				2611	uint32_t add_after_mark = 0;
				2612	uint32_t extra_options = cb->cx->extra_options;
				2613	uint16_t nest_depth = 0;
				2614	int after_manual_callout = 0;
				2615	int expect_cond_assert = 0;
				2616	int errorcode = 0;
				2617	int escape;
				2618	int i;
				2619	BOOL inescq = FALSE;
				2620	BOOL inverbname = FALSE;
				2621	BOOL utf = (options & PCRE2_UTF) != 0;
				2622	BOOL auto_callout = (options & PCRE2_AUTO_CALLOUT) != 0;
				2623	BOOL isdupname;
				2624	BOOL negate_class;
				2625	BOOL okquantifier = FALSE;
				2626	PCRE2_SPTR thisptr;
				2627	PCRE2_SPTR name;
				2628	PCRE2_SPTR ptrend = cb->end_pattern;
				2629	PCRE2_SPTR verbnamestart = NULL; /* Value avoids compiler warning */
				2630	named_group *ng;
				2631	nest_save top_nest, end_nests;
				2632
				2633	/* Insert leading items for word and line matching (features provided for the
				2634	benefit of pcre2grep). */
				2635
				2636	if ((extra_options & PCRE2_EXTRA_MATCH_LINE) != 0)
				2637	{
				2638	*parsed_pattern++ = META_CIRCUMFLEX;
				2639	*parsed_pattern++ = META_NOCAPTURE;
				2640	}
				2641	else if ((extra_options & PCRE2_EXTRA_MATCH_WORD) != 0)
				2642	{
				2643	*parsed_pattern++ = META_ESCAPE + ESC_b;
				2644	*parsed_pattern++ = META_NOCAPTURE;
				2645	}
				2646
				2647	/* If the pattern is actually a literal string, process it separately to avoid
				2648	cluttering up the main loop. */
				2649
				2650	if ((options & PCRE2_LITERAL) != 0)
				2651	{
				2652	while (ptr < ptrend)
				2653	{
				2654	if (parsed_pattern >= parsed_pattern_end)
				2655	{
				2656	errorcode = ERR63; /* Internal error (parsed pattern overflow) */
				2657	goto FAILED;
				2658	}
				2659	thisptr = ptr;
				2660	GETCHARINCTEST(c, ptr);
				2661	if (auto_callout)
				2662	parsed_pattern = manage_callouts(thisptr, &previous_callout,
				2663	auto_callout, parsed_pattern, cb);
				2664	PARSED_LITERAL(c, parsed_pattern);
				2665	}
				2666	goto PARSED_END;
				2667	}
				2668
				2669	/* Process a real regex which may contain meta-characters. */
				2670
				2671	top_nest = NULL;
				2672	end_nests = (nest_save *)(cb->start_workspace + cb->workspace_size);
				2673
				2674	/* The size of the nest_save structure might not be a factor of the size of the
				2675	workspace. Therefore we must round down end_nests so as to correctly avoid
				2676	creating a nest_save that spans the end of the workspace. */
				2677
				2678	end_nests = (nest_save )((char )end_nests -
				2679	((cb->workspace_size * sizeof(PCRE2_UCHAR)) % sizeof(nest_save)));
				2680
				2681	/* PCRE2_EXTENDED_MORE implies PCRE2_EXTENDED */
				2682
				2683	if ((options & PCRE2_EXTENDED_MORE) != 0) options \|= PCRE2_EXTENDED;
				2684
				2685	/* Now scan the pattern */
				2686
				2687	while (ptr < ptrend)
				2688	{
				2689	int prev_expect_cond_assert;
				2690	uint32_t min_repeat, max_repeat;
				2691	uint32_t set, unset, *optset;
				2692	uint32_t terminator;
				2693	uint32_t prev_meta_quantifier;
				2694	BOOL prev_okquantifier;
				2695	PCRE2_SPTR tempptr;
				2696	PCRE2_SIZE offset;
				2697
				2698	if (parsed_pattern >= parsed_pattern_end)
				2699	{
				2700	errorcode = ERR63; /* Internal error (parsed pattern overflow) */
				2701	goto FAILED;
				2702	}
				2703
				2704	if (nest_depth > cb->cx->parens_nest_limit)
				2705	{
				2706	errorcode = ERR19;
				2707	goto FAILED; /* Parentheses too deeply nested */
				2708	}
				2709
				2710	/* Get next input character, save its position for callout handling. */
				2711
				2712	thisptr = ptr;
				2713	GETCHARINCTEST(c, ptr);
				2714
				2715	/* Copy quoted literals until \E, allowing for the possibility of automatic
				2716	callouts, except when processing a (VERB) "name". /
				2717
				2718	if (inescq)
				2719	{
				2720	if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
				2721	{
				2722	inescq = FALSE;
				2723	ptr++; /* Skip E */
				2724	}
				2725	else
				2726	{
				2727	if (expect_cond_assert > 0) /* A literal is not allowed if we are */
				2728	{ /* expecting a conditional assertion, */
				2729	ptr--; /* but an empty \Q\E sequence is OK. */
				2730	errorcode = ERR28;
				2731	goto FAILED;
				2732	}
				2733	if (inverbname)
				2734	{ /* Don't use PARSED_LITERAL() because it */
				2735	#if PCRE2_CODE_UNIT_WIDTH == 32 /* sets okquantifier. */
				2736	if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
				2737	#endif
				2738	*parsed_pattern++ = c;
				2739	}
				2740	else
				2741	{
				2742	if (after_manual_callout-- <= 0)
				2743	parsed_pattern = manage_callouts(thisptr, &previous_callout,
				2744	auto_callout, parsed_pattern, cb);
				2745	PARSED_LITERAL(c, parsed_pattern);
				2746	}
				2747	meta_quantifier = 0;
				2748	}
				2749	continue; /* Next character */
				2750	}
				2751
				2752	/* If we are processing the "name" part of a (*VERB:NAME) item, all
				2753	characters up to the closing parenthesis are literals except when
				2754	PCRE2_ALT_VERBNAMES is set. That causes backslash interpretation, but only \Q
				2755	and \E and escaped characters are allowed (no character types such as \d). If
				2756	PCRE2_EXTENDED is also set, we must ignore white space and # comments. Do
				2757	this by not entering the special (*VERB:NAME) processing - they are then
				2758	picked up below. Note that c is a character, not a code unit, so we must not
				2759	use MAX_255 to test its size because MAX_255 tests code units and is assumed
				2760	TRUE in 8-bit mode. */
				2761
				2762	if (inverbname &&
				2763	(
				2764	/* EITHER: not both options set */
				2765	((options & (PCRE2_EXTENDED \| PCRE2_ALT_VERBNAMES)) !=
				2766	(PCRE2_EXTENDED \| PCRE2_ALT_VERBNAMES)) \|\|
				2767	#ifdef SUPPORT_UNICODE
				2768	/* OR: character > 255 AND not Unicode Pattern White Space */
				2769	(c > 255 && (c\|1) != 0x200f && (c\|1) != 0x2029) \|\|
				2770	#endif
				2771	/* OR: not a # comment or isspace() white space */
				2772	(c < 256 && c != CHAR_NUMBER_SIGN && (cb->ctypes[c] & ctype_space) == 0
				2773	#ifdef SUPPORT_UNICODE
				2774	/* and not CHAR_NEL when Unicode is supported */
				2775	&& c != CHAR_NEL
				2776	#endif
				2777	)))
				2778	{
				2779	PCRE2_SIZE verbnamelength;
				2780
				2781	switch(c)
				2782	{
				2783	default: /* Don't use PARSED_LITERAL() because it */
				2784	#if PCRE2_CODE_UNIT_WIDTH == 32 /* sets okquantifier. */
				2785	if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
				2786	#endif
				2787	*parsed_pattern++ = c;
				2788	break;
				2789
				2790	case CHAR_RIGHT_PARENTHESIS:
				2791	inverbname = FALSE;
				2792	/* This is the length in characters */
				2793	verbnamelength = (PCRE2_SIZE)(parsed_pattern - verblengthptr - 1);
				2794	/* But the limit on the length is in code units */
				2795	if (ptr - verbnamestart - 1 > (int)MAX_MARK)
				2796	{
				2797	ptr--;
				2798	errorcode = ERR76;
				2799	goto FAILED;
				2800	}
				2801	*verblengthptr = (uint32_t)verbnamelength;
				2802
				2803	/* If this name was on a verb such as (*ACCEPT) which does not continue,
				2804	a (*MARK) was generated for the name. We now add the original verb as the
				2805	next item. */
				2806
				2807	if (add_after_mark != 0)
				2808	{
				2809	*parsed_pattern++ = add_after_mark;
				2810	add_after_mark = 0;
				2811	}
				2812	break;
				2813
				2814	case CHAR_BACKSLASH:
				2815	if ((options & PCRE2_ALT_VERBNAMES) != 0)
				2816	{
				2817	escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
				2818	cb->cx->extra_options, FALSE, cb);
				2819	if (errorcode != 0) goto FAILED;
				2820	}
				2821	else escape = 0; /* Treat all as literal */
				2822
				2823	switch(escape)
				2824	{
				2825	case 0: /* Don't use PARSED_LITERAL() because it */
				2826	#if PCRE2_CODE_UNIT_WIDTH == 32 /* sets okquantifier. */
				2827	if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
				2828	#endif
				2829	*parsed_pattern++ = c;
				2830	break;
				2831
				2832	case ESC_Q:
				2833	inescq = TRUE;
				2834	break;
				2835
				2836	case ESC_E: /* Ignore */
				2837	break;
				2838
				2839	default:
				2840	errorcode = ERR40; /* Invalid in verb name */
				2841	goto FAILED;
				2842	}
				2843	}
				2844	continue; /* Next character in pattern */
				2845	}
				2846
				2847	/* Not a verb name character. At this point we must process everything that
				2848	must not change the quantification state. This is mainly comments, but we
				2849	handle \Q and \E here as well, so that an item such as A\Q\E+ is treated as
				2850	A+, as in Perl. An isolated \E is ignored. */
				2851
				2852	if (c == CHAR_BACKSLASH && ptr < ptrend)
				2853	{
				2854	if (ptr == CHAR_Q \|\| ptr == CHAR_E)
				2855	{
				2856	inescq = *ptr == CHAR_Q;
				2857	ptr++;
				2858	continue;
				2859	}
				2860	}
				2861
				2862	/* Skip over whitespace and # comments in extended mode. Note that c is a
				2863	character, not a code unit, so we must not use MAX_255 to test its size
				2864	because MAX_255 tests code units and is assumed TRUE in 8-bit mode. The
				2865	whitespace characters are those designated as "Pattern White Space" by
				2866	Unicode, which are the isspace() characters plus CHAR_NEL (newline), which is
				2867	U+0085 in Unicode, plus U+200E, U+200F, U+2028, and U+2029. These are a
				2868	subset of space characters that match \h and \v. */
				2869
				2870	if ((options & PCRE2_EXTENDED) != 0)
				2871	{
				2872	if (c < 256 && (cb->ctypes[c] & ctype_space) != 0) continue;
				2873	#ifdef SUPPORT_UNICODE
				2874	if (c == CHAR_NEL \|\| (c\|1) == 0x200f \|\| (c\|1) == 0x2029) continue;
				2875	#endif
				2876	if (c == CHAR_NUMBER_SIGN)
				2877	{
				2878	while (ptr < ptrend)
				2879	{
				2880	if (IS_NEWLINE(ptr)) /* For non-fixed-length newline cases, */
				2881	{ /* IS_NEWLINE sets cb->nllen. */
				2882	ptr += cb->nllen;
				2883	break;
				2884	}
				2885	ptr++;
				2886	#ifdef SUPPORT_UNICODE
				2887	if (utf) FORWARDCHARTEST(ptr, ptrend);
				2888	#endif
				2889	}
				2890	continue; /* Next character in pattern */
				2891	}
				2892	}
				2893
				2894	/* Skip over bracketed comments */
				2895
				2896	if (c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 2 &&
				2897	ptr[0] == CHAR_QUESTION_MARK && ptr[1] == CHAR_NUMBER_SIGN)
				2898	{
				2899	while (++ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS);
				2900	if (ptr >= ptrend)
				2901	{
				2902	errorcode = ERR18; /* A special error for missing ) in a comment */
				2903	goto FAILED; /* to make it easier to debug. */
				2904	}
				2905	ptr++;
				2906	continue; /* Next character in pattern */
				2907	}
				2908
				2909	/* If the next item is not a quantifier, fill in length of any previous
				2910	callout and create an auto callout if required. */
				2911
				2912	if (c != CHAR_ASTERISK && c != CHAR_PLUS && c != CHAR_QUESTION_MARK &&
				2913	(c != CHAR_LEFT_CURLY_BRACKET \|\|
				2914	(tempptr = ptr,
				2915	!read_repeat_counts(&tempptr, ptrend, NULL, NULL, &errorcode))))
				2916	{
				2917	if (after_manual_callout-- <= 0)
				2918	parsed_pattern = manage_callouts(thisptr, &previous_callout, auto_callout,
				2919	parsed_pattern, cb);
				2920	}
				2921
				2922	/* If expect_cond_assert is 2, we have just passed (?( and are expecting an
				2923	assertion, possibly preceded by a callout. If the value is 1, we have just
				2924	had the callout and expect an assertion. There must be at least 3 more
				2925	characters in all cases. When expect_cond_assert is 2, we know that the
				2926	current character is an opening parenthesis, as otherwise we wouldn't be
				2927	here. However, when it is 1, we need to check, and it's easiest just to check
				2928	always. Note that expect_cond_assert may be negative, since all callouts just
				2929	decrement it. */
				2930
				2931	if (expect_cond_assert > 0)
				2932	{
				2933	BOOL ok = c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 3 &&
				2934	(ptr[0] == CHAR_QUESTION_MARK \|\| ptr[0] == CHAR_ASTERISK);
				2935	if (ok)
				2936	{
				2937	if (ptr[0] == CHAR_ASTERISK) /* New alpha assertion format, possibly */
				2938	{
				2939	ok = MAX_255(ptr[1]) && (cb->ctypes[ptr[1]] & ctype_lcletter) != 0;
				2940	}
				2941	else switch(ptr[1]) /* Traditional symbolic format */
				2942	{
				2943	case CHAR_C:
				2944	ok = expect_cond_assert == 2;
				2945	break;
				2946
				2947	case CHAR_EQUALS_SIGN:
				2948	case CHAR_EXCLAMATION_MARK:
				2949	break;
				2950
				2951	case CHAR_LESS_THAN_SIGN:
				2952	ok = ptr[2] == CHAR_EQUALS_SIGN \|\| ptr[2] == CHAR_EXCLAMATION_MARK;
				2953	break;
				2954
				2955	default:
				2956	ok = FALSE;
				2957	}
				2958	}
				2959
				2960	if (!ok)
				2961	{
				2962	ptr--; /* Adjust error offset */
				2963	errorcode = ERR28;
				2964	goto FAILED;
				2965	}
				2966	}
				2967
				2968	/* Remember whether we are expecting a conditional assertion, and set the
				2969	default for this item. */
				2970
				2971	prev_expect_cond_assert = expect_cond_assert;
				2972	expect_cond_assert = 0;
				2973
				2974	/* Remember quantification status for the previous significant item, then set
				2975	default for this item. */
				2976
				2977	prev_okquantifier = okquantifier;
				2978	prev_meta_quantifier = meta_quantifier;
				2979	okquantifier = FALSE;
				2980	meta_quantifier = 0;
				2981
				2982	/* If the previous significant item was a quantifier, adjust the parsed code
				2983	if there is a following modifier. The base meta value is always followed by
				2984	the PLUS and QUERY values, in that order. We do this here rather than after
				2985	reading a quantifier so that intervening comments and /x whitespace can be
				2986	ignored without having to replicate code. */
				2987
				2988	if (prev_meta_quantifier != 0 && (c == CHAR_QUESTION_MARK \|\| c == CHAR_PLUS))
				2989	{
				2990	parsed_pattern[(prev_meta_quantifier == META_MINMAX)? -3 : -1] =
				2991	prev_meta_quantifier + ((c == CHAR_QUESTION_MARK)?
				2992	0x00020000u : 0x00010000u);
				2993	continue; /* Next character in pattern */
				2994	}
				2995
				2996
				2997	/* Process the next item in the main part of a pattern. */
				2998
				2999	switch(c)
				3000	{
				3001	default: /* Non-special character */
				3002	PARSED_LITERAL(c, parsed_pattern);
				3003	break;
				3004
				3005
				3006	/* ---- Escape sequence ---- */
				3007
				3008	case CHAR_BACKSLASH:
				3009	tempptr = ptr;
				3010	escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
				3011	cb->cx->extra_options, FALSE, cb);
				3012	if (errorcode != 0)
				3013	{
				3014	ESCAPE_FAILED:
				3015	if ((extra_options & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
				3016	goto FAILED;
				3017	ptr = tempptr;
				3018	if (ptr >= ptrend) c = CHAR_BACKSLASH; else
				3019	{
				3020	GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
				3021	}
				3022	escape = 0; /* Treat as literal character */
				3023	}
				3024
				3025	/* The escape was a data escape or literal character. */
				3026
				3027	if (escape == 0)
				3028	{
				3029	PARSED_LITERAL(c, parsed_pattern);
				3030	}
				3031
				3032	/* The escape was a back (or forward) reference. We keep the offset in
				3033	order to give a more useful diagnostic for a bad forward reference. For
				3034	references to groups numbered less than 10 we can't use more than two items
				3035	in parsed_pattern because they may be just two characters in the input (and
				3036	in a 64-bit world an offset may need two elements). So for them, the offset
				3037	of the first occurrent is held in a special vector. */
				3038
				3039	else if (escape < 0)
				3040	{
				3041	offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 1);
				3042	escape = -escape;
				3043	*parsed_pattern++ = META_BACKREF \| (uint32_t)escape;
				3044	if (escape < 10)
				3045	{
				3046	if (cb->small_ref_offset[escape] == PCRE2_UNSET)
				3047	cb->small_ref_offset[escape] = offset;
				3048	}
				3049	else
				3050	{
				3051	PUTOFFSET(offset, parsed_pattern);
				3052	}
				3053	okquantifier = TRUE;
				3054	}
				3055
				3056	/* The escape was a character class such as \d etc. or other special
				3057	escape indicator such as \A or \X. Most of them generate just a single
				3058	parsed item, but \P and \p are followed by a 16-bit type and a 16-bit
				3059	value. They are supported only when Unicode is available. The type and
				3060	value are packed into a single 32-bit value so that the whole sequences
				3061	uses only two elements in the parsed_vector. This is because the same
				3062	coding is used if \d (for example) is turned into \p{Nd} when PCRE2_UCP is
				3063	set.
				3064
				3065	There are also some cases where the escape sequence is followed by a name:
				3066	\k{name}, \k<name>, and \k'name' are backreferences by name, and \g<name>
				3067	and \g'name' are subroutine calls by name; \g{name} is a synonym for
				3068	\k{name}. Note that \g<number> and \g'number' are handled by check_escape()
				3069	and returned as a negative value (handled above). A name is coded as an
				3070	offset into the pattern and a length. */
				3071
				3072	else switch (escape)
				3073	{
				3074	case ESC_C:
				3075	#ifdef NEVER_BACKSLASH_C
				3076	errorcode = ERR85;
				3077	goto ESCAPE_FAILED;
				3078	#else
				3079	if ((options & PCRE2_NEVER_BACKSLASH_C) != 0)
				3080	{
				3081	errorcode = ERR83;
				3082	goto ESCAPE_FAILED;
				3083	}
				3084	#endif
				3085	okquantifier = TRUE;
				3086	*parsed_pattern++ = META_ESCAPE + escape;
				3087	break;
				3088
				3089	case ESC_X:
				3090	#ifndef SUPPORT_UNICODE
				3091	errorcode = ERR45; /* Supported only with Unicode support */
				3092	goto ESCAPE_FAILED;
				3093	#endif
				3094	case ESC_H:
				3095	case ESC_h:
				3096	case ESC_N:
				3097	case ESC_R:
				3098	case ESC_V:
				3099	case ESC_v:
				3100	okquantifier = TRUE;
				3101	*parsed_pattern++ = META_ESCAPE + escape;
				3102	break;
				3103
				3104	default: /* \A, \B, \b, \G, \K, \Z, \z cannot be quantified. */
				3105	*parsed_pattern++ = META_ESCAPE + escape;
				3106	break;
				3107
				3108	/* Escapes that change in UCP mode. Note that PCRE2_UCP will never be set
				3109	without Unicode support because it is checked when pcre2_compile() is
				3110	called. */
				3111
				3112	case ESC_d:
				3113	case ESC_D:
				3114	case ESC_s:
				3115	case ESC_S:
				3116	case ESC_w:
				3117	case ESC_W:
				3118	okquantifier = TRUE;
				3119	if ((options & PCRE2_UCP) == 0)
				3120	{
				3121	*parsed_pattern++ = META_ESCAPE + escape;
				3122	}
				3123	else
				3124	{
				3125	*parsed_pattern++ = META_ESCAPE +
				3126	((escape == ESC_d \|\| escape == ESC_s \|\| escape == ESC_w)?
				3127	ESC_p : ESC_P);
				3128	switch(escape)
				3129	{
				3130	case ESC_d:
				3131	case ESC_D:
				3132	*parsed_pattern++ = (PT_PC << 16) \| ucp_Nd;
				3133	break;
				3134
				3135	case ESC_s:
				3136	case ESC_S:
				3137	*parsed_pattern++ = PT_SPACE << 16;
				3138	break;
				3139
				3140	case ESC_w:
				3141	case ESC_W:
				3142	*parsed_pattern++ = PT_WORD << 16;
				3143	break;
				3144	}
				3145	}
				3146	break;
				3147
				3148	/* Unicode property matching */
				3149
				3150	case ESC_P:
				3151	case ESC_p:
				3152	#ifdef SUPPORT_UNICODE
				3153	{
				3154	BOOL negated;
				3155	uint16_t ptype = 0, pdata = 0;
				3156	if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb))
				3157	goto ESCAPE_FAILED;
				3158	if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
				3159	*parsed_pattern++ = META_ESCAPE + escape;
				3160	*parsed_pattern++ = (ptype << 16) \| pdata;
				3161	okquantifier = TRUE;
				3162	}
				3163	#else
				3164	errorcode = ERR45;
				3165	goto ESCAPE_FAILED;
				3166	#endif
				3167	break; /* End \P and \p */
				3168
				3169	/* When \g is used with quotes or angle brackets as delimiters, it is a
				3170	numerical or named subroutine call, and control comes here. When used
				3171	with brace delimiters it is a numberical back reference and does not come
				3172	here because check_escape() returns it directly as a reference. \k is
				3173	always a named back reference. */
				3174
				3175	case ESC_g:
				3176	case ESC_k:
				3177	if (ptr >= ptrend \|\| (*ptr != CHAR_LEFT_CURLY_BRACKET &&
				3178	ptr != CHAR_LESS_THAN_SIGN && ptr != CHAR_APOSTROPHE))
				3179	{
				3180	errorcode = (escape == ESC_g)? ERR57 : ERR69;
				3181	goto ESCAPE_FAILED;
				3182	}
				3183	terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
				3184	CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
				3185	CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
				3186
				3187	/* For a non-braced \g, check for a numerical recursion. */
				3188
				3189	if (escape == ESC_g && terminator != CHAR_RIGHT_CURLY_BRACKET)
				3190	{
				3191	PCRE2_SPTR p = ptr + 1;
				3192
				3193	if (read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
				3194	&errorcode))
				3195	{
				3196	if (p >= ptrend \|\| *p != terminator)
				3197	{
				3198	errorcode = ERR57;
				3199	goto ESCAPE_FAILED;
				3200	}
				3201	ptr = p;
				3202	goto SET_RECURSION;
				3203	}
				3204	if (errorcode != 0) goto ESCAPE_FAILED;
				3205	}
				3206
				3207	/* Not a numerical recursion */
				3208
				3209	if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
				3210	&errorcode, cb)) goto ESCAPE_FAILED;
				3211
				3212	/* \k and \g when used with braces are back references, whereas \g used
				3213	with quotes or angle brackets is a recursion */
				3214
				3215	*parsed_pattern++ =
				3216	(escape == ESC_k \|\| terminator == CHAR_RIGHT_CURLY_BRACKET)?
				3217	META_BACKREF_BYNAME : META_RECURSE_BYNAME;
				3218	*parsed_pattern++ = namelen;
				3219
				3220	PUTOFFSET(offset, parsed_pattern);
				3221	okquantifier = TRUE;
				3222	break; /* End special escape processing */
				3223	}
				3224	break; /* End escape sequence processing */
				3225
				3226
				3227	/* ---- Single-character special items ---- */
				3228
				3229	case CHAR_CIRCUMFLEX_ACCENT:
				3230	*parsed_pattern++ = META_CIRCUMFLEX;
				3231	break;
				3232
				3233	case CHAR_DOLLAR_SIGN:
				3234	*parsed_pattern++ = META_DOLLAR;
				3235	break;
				3236
				3237	case CHAR_DOT:
				3238	*parsed_pattern++ = META_DOT;
				3239	okquantifier = TRUE;
				3240	break;
				3241
				3242
				3243	/* ---- Single-character quantifiers ---- */
				3244
				3245	case CHAR_ASTERISK:
				3246	meta_quantifier = META_ASTERISK;
				3247	goto CHECK_QUANTIFIER;
				3248
				3249	case CHAR_PLUS:
				3250	meta_quantifier = META_PLUS;
				3251	goto CHECK_QUANTIFIER;
				3252
				3253	case CHAR_QUESTION_MARK:
				3254	meta_quantifier = META_QUERY;
				3255	goto CHECK_QUANTIFIER;
				3256
				3257
				3258	/* ---- Potential {n,m} quantifier ---- */
				3259
				3260	case CHAR_LEFT_CURLY_BRACKET:
				3261	if (!read_repeat_counts(&ptr, ptrend, &min_repeat, &max_repeat,
				3262	&errorcode))
				3263	{
				3264	if (errorcode != 0) goto FAILED; /* Error in quantifier. */
				3265	PARSED_LITERAL(c, parsed_pattern); /* Not a quantifier */
				3266	break; /* No more quantifier processing */
				3267	}
				3268	meta_quantifier = META_MINMAX;
				3269	/* Fall through */
				3270
				3271
				3272	/* ---- Quantifier post-processing ---- */
				3273
				3274	/* Check that a quantifier is allowed after the previous item. */
				3275
				3276	CHECK_QUANTIFIER:
				3277	if (!prev_okquantifier)
				3278	{
				3279	errorcode = ERR9;
				3280	goto FAILED_BACK;
				3281	}
				3282
				3283	/* Most (*VERB)s are not allowed to be quantified, but an ungreedy
				3284	quantifier can be useful for (*ACCEPT) - meaning "succeed on backtrack", a
				3285	sort of negated (*COMMIT). We therefore allow (*ACCEPT) to be quantified by
				3286	wrapping it in non-capturing brackets, but we have to allow for a preceding
				3287	(*MARK) for when (*ACCEPT) has an argument. */
				3288
				3289	if (parsed_pattern[-1] == META_ACCEPT)
				3290	{
				3291	uint32_t *p;
				3292	for (p = parsed_pattern - 1; p >= verbstartptr; p--) p[1] = p[0];
				3293	*verbstartptr = META_NOCAPTURE;
				3294	parsed_pattern[1] = META_KET;
				3295	parsed_pattern += 2;
				3296	}
				3297
				3298	/* Now we can put the quantifier into the parsed pattern vector. At this
				3299	stage, we have only the basic quantifier. The check for a following + or ?
				3300	modifier happens at the top of the loop, after any intervening comments
				3301	have been removed. */
				3302
				3303	*parsed_pattern++ = meta_quantifier;
				3304	if (c == CHAR_LEFT_CURLY_BRACKET)
				3305	{
				3306	*parsed_pattern++ = min_repeat;
				3307	*parsed_pattern++ = max_repeat;
				3308	}
				3309	break;
				3310
				3311
				3312	/* ---- Character class ---- */
				3313
				3314	case CHAR_LEFT_SQUARE_BRACKET:
				3315	okquantifier = TRUE;
				3316
				3317	/* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
				3318	used for "start of word" and "end of word". As these are otherwise illegal
				3319	sequences, we don't break anything by recognizing them. They are replaced
				3320	by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
				3321	erroneous and are handled by the normal code below. */
				3322
				3323	if (ptrend - ptr >= 6 &&
				3324	(PRIV(strncmp_c8)(ptr, STRING_WEIRD_STARTWORD, 6) == 0 \|\|
				3325	PRIV(strncmp_c8)(ptr, STRING_WEIRD_ENDWORD, 6) == 0))
				3326	{
				3327	*parsed_pattern++ = META_ESCAPE + ESC_b;
				3328
				3329	if (ptr[2] == CHAR_LESS_THAN_SIGN)
				3330	{
				3331	*parsed_pattern++ = META_LOOKAHEAD;
				3332	}
				3333	else
				3334	{
				3335	*parsed_pattern++ = META_LOOKBEHIND;
				3336	*has_lookbehind = TRUE;
				3337
				3338	/* The offset is used only for the "non-fixed length" error; this won't
				3339	occur here, so just store zero. */
				3340
				3341	PUTOFFSET((PCRE2_SIZE)0, parsed_pattern);
				3342	}
				3343
				3344	if ((options & PCRE2_UCP) == 0)
				3345	*parsed_pattern++ = META_ESCAPE + ESC_w;
				3346	else
				3347	{
				3348	*parsed_pattern++ = META_ESCAPE + ESC_p;
				3349	*parsed_pattern++ = PT_WORD << 16;
				3350	}
				3351	*parsed_pattern++ = META_KET;
				3352	ptr += 6;
				3353	break;
				3354	}
				3355
				3356	/* PCRE supports POSIX class stuff inside a class. Perl gives an error if
				3357	they are encountered at the top level, so we'll do that too. */
				3358
				3359	if (ptr < ptrend && (ptr == CHAR_COLON \|\| ptr == CHAR_DOT \|\|
				3360	*ptr == CHAR_EQUALS_SIGN) &&
				3361	check_posix_syntax(ptr, ptrend, &tempptr))
				3362	{
				3363	errorcode = (*ptr-- == CHAR_COLON)? ERR12 : ERR13;
				3364	goto FAILED;
				3365	}
				3366
				3367	/* Process a regular character class. If the first character is '^', set
				3368	the negation flag. If the first few characters (either before or after ^)
				3369	are \Q\E or \E or space or tab in extended-more mode, we skip them too.
				3370	This makes for compatibility with Perl. */
				3371
				3372	negate_class = FALSE;
				3373	while (ptr < ptrend)
				3374	{
				3375	GETCHARINCTEST(c, ptr);
				3376	if (c == CHAR_BACKSLASH)
				3377	{
				3378	if (ptr < ptrend && *ptr == CHAR_E) ptr++;
				3379	else if (ptrend - ptr >= 3 &&
				3380	PRIV(strncmp_c8)(ptr, STR_Q STR_BACKSLASH STR_E, 3) == 0)
				3381	ptr += 3;
				3382	else
				3383	break;
				3384	}
				3385	else if ((options & PCRE2_EXTENDED_MORE) != 0 &&
				3386	(c == CHAR_SPACE \|\| c == CHAR_HT)) /* Note: just these two */
				3387	continue;
				3388	else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
				3389	negate_class = TRUE;
				3390	else break;
				3391	}
				3392
				3393	/* Now the real contents of the class; c has the first "real" character.
				3394	Empty classes are permitted only if the option is set. */
				3395
				3396	if (c == CHAR_RIGHT_SQUARE_BRACKET &&
				3397	(cb->external_options & PCRE2_ALLOW_EMPTY_CLASS) != 0)
				3398	{
				3399	*parsed_pattern++ = negate_class? META_CLASS_EMPTY_NOT : META_CLASS_EMPTY;
				3400	break; /* End of class processing */
				3401	}
				3402
				3403	/* Process a non-empty class. */
				3404
				3405	*parsed_pattern++ = negate_class? META_CLASS_NOT : META_CLASS;
				3406	class_range_state = RANGE_NO;
				3407
				3408	/* In an EBCDIC environment, Perl treats alphabetic ranges specially
				3409	because there are holes in the encoding, and simply using the range A-Z
				3410	(for example) would include the characters in the holes. This applies only
				3411	to ranges where both values are literal; [\xC1-\xE9] is different to [A-Z]
				3412	in this respect. In order to accommodate this, we keep track of whether
				3413	character values are literal or not, and a state variable for handling
				3414	ranges. */
				3415
				3416	/* Loop for the contents of the class */
				3417
				3418	for (;;)
				3419	{
				3420	BOOL char_is_literal = TRUE;
				3421
				3422	/* Inside \Q...\E everything is literal except \E */
				3423
				3424	if (inescq)
				3425	{
				3426	if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
				3427	{
				3428	inescq = FALSE; /* Reset literal state */
				3429	ptr++; /* Skip the 'E' */
				3430	goto CLASS_CONTINUE;
				3431	}
				3432	goto CLASS_LITERAL;
				3433	}
				3434
				3435	/* Skip over space and tab (only) in extended-more mode. */
				3436
				3437	if ((options & PCRE2_EXTENDED_MORE) != 0 &&
				3438	(c == CHAR_SPACE \|\| c == CHAR_HT))
				3439	goto CLASS_CONTINUE;
				3440
				3441	/* Handle POSIX class names. Perl allows a negation extension of the
				3442	form [:^name:]. A square bracket that doesn't match the syntax is
				3443	treated as a literal. We also recognize the POSIX constructions
				3444	[.ch.] and [=ch=] ("collating elements") and fault them, as Perl
				3445	5.6 and 5.8 do. */
				3446
				3447	if (c == CHAR_LEFT_SQUARE_BRACKET &&
				3448	ptrend - ptr >= 3 &&
				3449	(ptr == CHAR_COLON \|\| ptr == CHAR_DOT \|\|
				3450	*ptr == CHAR_EQUALS_SIGN) &&
				3451	check_posix_syntax(ptr, ptrend, &tempptr))
				3452	{
				3453	BOOL posix_negate = FALSE;
				3454	int posix_class;
				3455
				3456	/* Perl treats a hyphen before a POSIX class as a literal, not the
				3457	start of a range. However, it gives a warning in its warning mode. PCRE
				3458	does not have a warning mode, so we give an error, because this is
				3459	likely an error on the user's part. */
				3460
				3461	if (class_range_state == RANGE_STARTED)
				3462	{
				3463	errorcode = ERR50;
				3464	goto FAILED;
				3465	}
				3466
				3467	if (*ptr != CHAR_COLON)
				3468	{
				3469	errorcode = ERR13;
				3470	goto FAILED_BACK;
				3471	}
				3472
				3473	if (*(++ptr) == CHAR_CIRCUMFLEX_ACCENT)
				3474	{
				3475	posix_negate = TRUE;
				3476	ptr++;
				3477	}
				3478
				3479	posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
				3480	if (posix_class < 0)
				3481	{
				3482	errorcode = ERR30;
				3483	goto FAILED;
				3484	}
				3485	ptr = tempptr + 2;
				3486
				3487	/* Perl treats a hyphen after a POSIX class as a literal, not the
				3488	start of a range. However, it gives a warning in its warning mode
				3489	unless the hyphen is the last character in the class. PCRE does not
				3490	have a warning mode, so we give an error, because this is likely an
				3491	error on the user's part. */
				3492
				3493	if (ptr < ptrend - 1 && *ptr == CHAR_MINUS &&
				3494	ptr[1] != CHAR_RIGHT_SQUARE_BRACKET)
				3495	{
				3496	errorcode = ERR50;
				3497	goto FAILED;
				3498	}
				3499
				3500	/* Set "a hyphen is not the start of a range" for the -] case, and also
				3501	in case the POSIX class is followed by \E or \Q\E (possibly repeated -
				3502	fuzzers do that kind of thing) and then a hyphen. This causes that
				3503	hyphen to be treated as a literal. I don't think it's worth setting up
				3504	special apparatus to do otherwise. */
				3505
				3506	class_range_state = RANGE_NO;
				3507
				3508	/* When PCRE2_UCP is set, some of the POSIX classes are converted to
				3509	use Unicode properties \p or \P or, in one case, \h or \H. The
				3510	substitutes table has two values per class, containing the type and
				3511	value of a \p or \P item. The special cases are specified with a
				3512	negative type: a non-zero value causes \h or \H to be used, and a zero
				3513	value falls through to behave like a non-UCP POSIX class. */
				3514
				3515	#ifdef SUPPORT_UNICODE
				3516	if ((options & PCRE2_UCP) != 0)
				3517	{
				3518	int ptype = posix_substitutes[2*posix_class];
				3519	int pvalue = posix_substitutes[2*posix_class + 1];
				3520	if (ptype >= 0)
				3521	{
				3522	*parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_P : ESC_p);
				3523	*parsed_pattern++ = (ptype << 16) \| pvalue;
				3524	goto CLASS_CONTINUE;
				3525	}
				3526
				3527	if (pvalue != 0)
				3528	{
				3529	*parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_H : ESC_h);
				3530	goto CLASS_CONTINUE;
				3531	}
				3532
				3533	/* Fall through */
				3534	}
				3535	#endif /* SUPPORT_UNICODE */
				3536
				3537	/* Non-UCP POSIX class */
				3538
				3539	*parsed_pattern++ = posix_negate? META_POSIX_NEG : META_POSIX;
				3540	*parsed_pattern++ = posix_class;
				3541	}
				3542
				3543	/* Handle potential start of range */
				3544
				3545	else if (c == CHAR_MINUS && class_range_state >= RANGE_OK_ESCAPED)
				3546	{
				3547	*parsed_pattern++ = (class_range_state == RANGE_OK_LITERAL)?
				3548	META_RANGE_LITERAL : META_RANGE_ESCAPED;
				3549	class_range_state = RANGE_STARTED;
				3550	}
				3551
				3552	/* Handle a literal character */
				3553
				3554	else if (c != CHAR_BACKSLASH)
				3555	{
				3556	CLASS_LITERAL:
				3557	if (class_range_state == RANGE_STARTED)
				3558	{
				3559	if (c == parsed_pattern[-2]) /* Optimize one-char range */
				3560	parsed_pattern--;
				3561	else if (parsed_pattern[-2] > c) /* Check range is in order */
				3562	{
				3563	errorcode = ERR8;
				3564	goto FAILED_BACK;
				3565	}
				3566	else
				3567	{
				3568	if (!char_is_literal && parsed_pattern[-1] == META_RANGE_LITERAL)
				3569	parsed_pattern[-1] = META_RANGE_ESCAPED;
				3570	PARSED_LITERAL(c, parsed_pattern);
				3571	}
				3572	class_range_state = RANGE_NO;
				3573	}
				3574	else /* Potential start of range */
				3575	{
				3576	class_range_state = char_is_literal?
				3577	RANGE_OK_LITERAL : RANGE_OK_ESCAPED;
				3578	PARSED_LITERAL(c, parsed_pattern);
				3579	}
				3580	}
				3581
				3582	/* Handle escapes in a class */
				3583
				3584	else
				3585	{
				3586	tempptr = ptr;
				3587	escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
				3588	cb->cx->extra_options, TRUE, cb);
				3589
				3590	if (errorcode != 0)
				3591	{
				3592	if ((extra_options & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
				3593	goto FAILED;
				3594	ptr = tempptr;
				3595	if (ptr >= ptrend) c = CHAR_BACKSLASH; else
				3596	{
				3597	GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
				3598	}
				3599	escape = 0; /* Treat as literal character */
				3600	}
				3601
				3602	switch(escape)
				3603	{
				3604	case 0: /* Escaped character code point is in c */
				3605	char_is_literal = FALSE;
				3606	goto CLASS_LITERAL;
				3607
				3608	case ESC_b:
				3609	c = CHAR_BS; /* \b is backspace in a class */
				3610	char_is_literal = FALSE;
				3611	goto CLASS_LITERAL;
				3612
				3613	case ESC_Q:
				3614	inescq = TRUE; /* Enter literal mode */
				3615	goto CLASS_CONTINUE;
				3616
				3617	case ESC_E: /* Ignore orphan \E */
				3618	goto CLASS_CONTINUE;
				3619
				3620	case ESC_B: /* Always an error in a class */
				3621	case ESC_R:
				3622	case ESC_X:
				3623	errorcode = ERR7;
				3624	ptr--;
				3625	goto FAILED;
				3626	}
				3627
				3628	/* The second part of a range can be a single-character escape
				3629	sequence (detected above), but not any of the other escapes. Perl
				3630	treats a hyphen as a literal in such circumstances. However, in Perl's
				3631	warning mode, a warning is given, so PCRE now faults it, as it is
				3632	almost certainly a mistake on the user's part. */
				3633
				3634	if (class_range_state == RANGE_STARTED)
				3635	{
				3636	errorcode = ERR50;
				3637	goto FAILED; /* Not CLASS_ESCAPE_FAILED; always an error */
				3638	}
				3639
				3640	/* Of the remaining escapes, only those that define characters are
				3641	allowed in a class. None may start a range. */
				3642
				3643	class_range_state = RANGE_NO;
				3644	switch(escape)
				3645	{
				3646	case ESC_N:
				3647	errorcode = ERR71;
				3648	goto FAILED;
				3649
				3650	case ESC_H:
				3651	case ESC_h:
				3652	case ESC_V:
				3653	case ESC_v:
				3654	*parsed_pattern++ = META_ESCAPE + escape;
				3655	break;
				3656
				3657	/* These escapes are converted to Unicode property tests when
				3658	PCRE2_UCP is set. */
				3659
				3660	case ESC_d:
				3661	case ESC_D:
				3662	case ESC_s:
				3663	case ESC_S:
				3664	case ESC_w:
				3665	case ESC_W:
				3666	if ((options & PCRE2_UCP) == 0)
				3667	{
				3668	*parsed_pattern++ = META_ESCAPE + escape;
				3669	}
				3670	else
				3671	{
				3672	*parsed_pattern++ = META_ESCAPE +
				3673	((escape == ESC_d \|\| escape == ESC_s \|\| escape == ESC_w)?
				3674	ESC_p : ESC_P);
				3675	switch(escape)
				3676	{
				3677	case ESC_d:
				3678	case ESC_D:
				3679	*parsed_pattern++ = (PT_PC << 16) \| ucp_Nd;
				3680	break;
				3681
				3682	case ESC_s:
				3683	case ESC_S:
				3684	*parsed_pattern++ = PT_SPACE << 16;
				3685	break;
				3686
				3687	case ESC_w:
				3688	case ESC_W:
				3689	*parsed_pattern++ = PT_WORD << 16;
				3690	break;
				3691	}
				3692	}
				3693	break;
				3694
				3695	/* Explicit Unicode property matching */
				3696
				3697	case ESC_P:
				3698	case ESC_p:
				3699	#ifdef SUPPORT_UNICODE
				3700	{
				3701	BOOL negated;
				3702	uint16_t ptype = 0, pdata = 0;
				3703	if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb))
				3704	goto FAILED;
				3705	if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
				3706	*parsed_pattern++ = META_ESCAPE + escape;
				3707	*parsed_pattern++ = (ptype << 16) \| pdata;
				3708	}
				3709	#else
				3710	errorcode = ERR45;
				3711	goto FAILED;
				3712	#endif
				3713	break; /* End \P and \p */
				3714
				3715	default: /* All others are not allowed in a class */
				3716	errorcode = ERR7;
				3717	ptr--;
				3718	goto FAILED;
				3719	}
				3720
				3721	/* Perl gives a warning unless a following hyphen is the last character
				3722	in the class. PCRE throws an error. */
				3723
				3724	if (ptr < ptrend - 1 && *ptr == CHAR_MINUS &&
				3725	ptr[1] != CHAR_RIGHT_SQUARE_BRACKET)
				3726	{
				3727	errorcode = ERR50;
				3728	goto FAILED;
				3729	}
				3730	}
				3731
				3732	/* Proceed to next thing in the class. */
				3733
				3734	CLASS_CONTINUE:
				3735	if (ptr >= ptrend)
				3736	{
				3737	errorcode = ERR6; /* Missing terminating ']' */
				3738	goto FAILED;
				3739	}
				3740	GETCHARINCTEST(c, ptr);
				3741	if (c == CHAR_RIGHT_SQUARE_BRACKET && !inescq) break;
				3742	} /* End of class-processing loop */
				3743
				3744	/* -] at the end of a class is a literal '-' */
				3745
				3746	if (class_range_state == RANGE_STARTED)
				3747	{
				3748	parsed_pattern[-1] = CHAR_MINUS;
				3749	class_range_state = RANGE_NO;
				3750	}
				3751
				3752	*parsed_pattern++ = META_CLASS_END;
				3753	break; /* End of character class */
				3754
				3755
				3756	/* ---- Opening parenthesis ---- */
				3757
				3758	case CHAR_LEFT_PARENTHESIS:
				3759	if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
				3760
				3761	/* If ( is not followed by ? it is either a capture or a special verb or an
				3762	alpha assertion or a positive non-atomic lookahead. */
				3763
				3764	if (*ptr != CHAR_QUESTION_MARK)
				3765	{
				3766	const char *vn;
				3767
				3768	/* Handle capturing brackets (or non-capturing if auto-capture is turned
				3769	off). */
				3770
				3771	if (*ptr != CHAR_ASTERISK)
				3772	{
				3773	nest_depth++;
				3774	if ((options & PCRE2_NO_AUTO_CAPTURE) == 0)
				3775	{
				3776	if (cb->bracount >= MAX_GROUP_NUMBER)
				3777	{
				3778	errorcode = ERR97;
				3779	goto FAILED;
				3780	}
				3781	cb->bracount++;
				3782	*parsed_pattern++ = META_CAPTURE \| cb->bracount;
				3783	}
				3784	else *parsed_pattern++ = META_NOCAPTURE;
				3785	}
				3786
				3787	/* Do nothing for (* followed by end of pattern or ) so it gives a "bad
				3788	quantifier" error rather than "(*MARK) must have an argument". */
				3789
				3790	else if (ptrend - ptr <= 1 \|\| (c = ptr[1]) == CHAR_RIGHT_PARENTHESIS)
				3791	break;
				3792
				3793	/* Handle "alpha assertions" such as (*pla:...). Most of these are
				3794	synonyms for the historical symbolic assertions, but the script run and
				3795	non-atomic lookaround ones are new. They are distinguished by starting
				3796	with a lower case letter. Checking both ends of the alphabet makes this
				3797	work in all character codes. */
				3798
				3799	else if (CHMAX_255(c) && (cb->ctypes[c] & ctype_lcletter) != 0)
				3800	{
				3801	uint32_t meta;
				3802
				3803	vn = alasnames;
				3804	if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,
				3805	&errorcode, cb)) goto FAILED;
				3806	if (ptr >= ptrend \|\| *ptr != CHAR_COLON)
				3807	{
				3808	errorcode = ERR95; /* Malformed */
				3809	goto FAILED;
				3810	}
				3811
				3812	/* Scan the table of alpha assertion names */
				3813
				3814	for (i = 0; i < alascount; i++)
				3815	{
				3816	if (namelen == alasmeta[i].len &&
				3817	PRIV(strncmp_c8)(name, vn, namelen) == 0)
				3818	break;
				3819	vn += alasmeta[i].len + 1;
				3820	}
				3821
				3822	if (i >= alascount)
				3823	{
				3824	errorcode = ERR95; /* Alpha assertion not recognized */
				3825	goto FAILED;
				3826	}
				3827
				3828	/* Check for expecting an assertion condition. If so, only atomic
				3829	lookaround assertions are valid. */
				3830
				3831	meta = alasmeta[i].meta;
				3832	if (prev_expect_cond_assert > 0 &&
				3833	(meta < META_LOOKAHEAD \|\| meta > META_LOOKBEHINDNOT))
				3834	{
				3835	errorcode = (meta == META_LOOKAHEAD_NA \|\| meta == META_LOOKBEHIND_NA)?
				3836	ERR98 : ERR28; /* (Atomic) assertion expected */
				3837	goto FAILED;
				3838	}
				3839
				3840	/* The lookaround alphabetic synonyms can mostly be handled by jumping
				3841	to the code that handles the traditional symbolic forms. */
				3842
				3843	switch(meta)
				3844	{
				3845	default:
				3846	errorcode = ERR89; /* Unknown code; should never occur because */
				3847	goto FAILED; /* the meta values come from a table above. */
				3848
				3849	case META_ATOMIC:
				3850	goto ATOMIC_GROUP;
				3851
				3852	case META_LOOKAHEAD:
				3853	goto POSITIVE_LOOK_AHEAD;
				3854
				3855	case META_LOOKAHEAD_NA:
				3856	goto POSITIVE_NONATOMIC_LOOK_AHEAD;
				3857
				3858	case META_LOOKAHEADNOT:
				3859	goto NEGATIVE_LOOK_AHEAD;
				3860
				3861	case META_LOOKBEHIND:
				3862	case META_LOOKBEHINDNOT:
				3863	case META_LOOKBEHIND_NA:
				3864	*parsed_pattern++ = meta;
				3865	ptr--;
				3866	goto POST_LOOKBEHIND;
				3867
				3868	/* The script run facilities are handled here. Unicode support is
				3869	required (give an error if not, as this is a security issue). Always
				3870	record a META_SCRIPT_RUN item. Then, for the atomic version, insert
				3871	META_ATOMIC and remember that we need two META_KETs at the end. */
				3872
				3873	case META_SCRIPT_RUN:
				3874	case META_ATOMIC_SCRIPT_RUN:
				3875	#ifdef SUPPORT_UNICODE
				3876	*parsed_pattern++ = META_SCRIPT_RUN;
				3877	nest_depth++;
				3878	ptr++;
				3879	if (meta == META_ATOMIC_SCRIPT_RUN)
				3880	{
				3881	*parsed_pattern++ = META_ATOMIC;
				3882	if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
				3883	else if (++top_nest >= end_nests)
				3884	{
				3885	errorcode = ERR84;
				3886	goto FAILED;
				3887	}
				3888	top_nest->nest_depth = nest_depth;
				3889	top_nest->flags = NSF_ATOMICSR;
				3890	top_nest->options = options & PARSE_TRACKED_OPTIONS;
				3891	}
				3892	break;
				3893	#else /* SUPPORT_UNICODE */
				3894	errorcode = ERR96;
				3895	goto FAILED;
				3896	#endif
				3897	}
				3898	}
				3899
				3900
				3901	/* ---- Handle (*VERB) and (*VERB:NAME) ---- */
				3902
				3903	else
				3904	{
				3905	vn = verbnames;
				3906	if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,
				3907	&errorcode, cb)) goto FAILED;
				3908	if (ptr >= ptrend \|\| (*ptr != CHAR_COLON &&
				3909	*ptr != CHAR_RIGHT_PARENTHESIS))
				3910	{
				3911	errorcode = ERR60; /* Malformed */
				3912	goto FAILED;
				3913	}
				3914
				3915	/* Scan the table of verb names */
				3916
				3917	for (i = 0; i < verbcount; i++)
				3918	{
				3919	if (namelen == verbs[i].len &&
				3920	PRIV(strncmp_c8)(name, vn, namelen) == 0)
				3921	break;
				3922	vn += verbs[i].len + 1;
				3923	}
				3924
				3925	if (i >= verbcount)
				3926	{
				3927	errorcode = ERR60; /* Verb not recognized */
				3928	goto FAILED;
				3929	}
				3930
				3931	/* An empty argument is treated as no argument. */
				3932
				3933	if (*ptr == CHAR_COLON && ptr + 1 < ptrend &&
				3934	ptr[1] == CHAR_RIGHT_PARENTHESIS)
				3935	ptr++; /* Advance to the closing parens */
				3936
				3937	/* Check for mandatory non-empty argument; this is (*MARK) */
				3938
				3939	if (verbs[i].has_arg > 0 && *ptr != CHAR_COLON)
				3940	{
				3941	errorcode = ERR66;
				3942	goto FAILED;
				3943	}
				3944
				3945	/* Remember where this verb, possibly with a preceding (*MARK), starts,
				3946	for handling quantified (*ACCEPT). */
				3947
				3948	verbstartptr = parsed_pattern;
				3949	okquantifier = (verbs[i].meta == META_ACCEPT);
				3950
				3951	/* It appears that Perl allows any characters whatsoever, other than a
				3952	closing parenthesis, to appear in arguments ("names"), so we no longer
				3953	insist on letters, digits, and underscores. Perl does not, however, do
				3954	any interpretation within arguments, and has no means of including a
				3955	closing parenthesis. PCRE supports escape processing but only when it
				3956	is requested by an option. We set inverbname TRUE here, and let the
				3957	main loop take care of this so that escape and \x processing is done by
				3958	the main code above. */
				3959
				3960	if (ptr++ == CHAR_COLON) / Skip past : or ) */
				3961	{
				3962	/* Some optional arguments can be treated as a preceding (*MARK) */
				3963
				3964	if (verbs[i].has_arg < 0)
				3965	{
				3966	add_after_mark = verbs[i].meta;
				3967	*parsed_pattern++ = META_MARK;
				3968	}
				3969
				3970	/* The remaining verbs with arguments (except *MARK) need a different
				3971	opcode. */
				3972
				3973	else
				3974	{
				3975	*parsed_pattern++ = verbs[i].meta +
				3976	((verbs[i].meta != META_MARK)? 0x00010000u:0);
				3977	}
				3978
				3979	/* Set up for reading the name in the main loop. */
				3980
				3981	verblengthptr = parsed_pattern++;
				3982	verbnamestart = ptr;
				3983	inverbname = TRUE;
				3984	}
				3985	else /* No verb "name" argument */
				3986	{
				3987	*parsed_pattern++ = verbs[i].meta;
				3988	}
				3989	} /* End of (*VERB) handling */
				3990	break; /* Done with this parenthesis */
				3991	} /* End of groups that don't start with (? */
				3992
				3993
				3994	/* ---- Items starting (? ---- */
				3995
				3996	/* The type of item is determined by what follows (?. Handle (?| and option
				3997	changes under "default" because both need a new block on the nest stack.
				3998	Comments starting with (?# are handled above. Note that there is some
				3999	ambiguity about the sequence (?- because if a digit follows it's a relative
				4000	recursion or subroutine call whereas otherwise it's an option unsetting. */
				4001
				4002	if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
				4003
				4004	switch(*ptr)
				4005	{
				4006	default:
				4007	if (*ptr == CHAR_MINUS && ptrend - ptr > 1 && IS_DIGIT(ptr[1]))
				4008	goto RECURSION_BYNUMBER; /* The + case is handled by CHAR_PLUS */
				4009
				4010	/* We now have either (?| or a (possibly empty) option setting,
				4011	optionally followed by a non-capturing group. */
				4012
				4013	nest_depth++;
				4014	if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
				4015	else if (++top_nest >= end_nests)
				4016	{
				4017	errorcode = ERR84;
				4018	goto FAILED;
				4019	}
				4020	top_nest->nest_depth = nest_depth;
				4021	top_nest->flags = 0;
				4022	top_nest->options = options & PARSE_TRACKED_OPTIONS;
				4023
				4024	/* Start of non-capturing group that resets the capture count for each
				4025	branch. */
				4026
				4027	if (*ptr == CHAR_VERTICAL_LINE)
				4028	{
				4029	top_nest->reset_group = (uint16_t)cb->bracount;
				4030	top_nest->max_group = (uint16_t)cb->bracount;
				4031	top_nest->flags \|= NSF_RESET;
				4032	cb->external_flags \|= PCRE2_DUPCAPUSED;
				4033	*parsed_pattern++ = META_NOCAPTURE;
				4034	ptr++;
				4035	}
				4036
				4037	/* Scan for options imnsxJU to be set or unset. */
				4038
				4039	else
				4040	{
				4041	BOOL hyphenok = TRUE;
				4042	uint32_t oldoptions = options;
				4043
				4044	top_nest->reset_group = 0;
				4045	top_nest->max_group = 0;
				4046	set = unset = 0;
				4047	optset = &set;
				4048
				4049	/* ^ at the start unsets imnsx and disables the subsequent use of - */
				4050
				4051	if (ptr < ptrend && *ptr == CHAR_CIRCUMFLEX_ACCENT)
				4052	{
				4053	options &= ~(PCRE2_CASELESS\|PCRE2_MULTILINE\|PCRE2_NO_AUTO_CAPTURE\|
				4054	PCRE2_DOTALL\|PCRE2_EXTENDED\|PCRE2_EXTENDED_MORE);
				4055	hyphenok = FALSE;
				4056	ptr++;
				4057	}
				4058
				4059	while (ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS &&
				4060	*ptr != CHAR_COLON)
				4061	{
				4062	switch (*ptr++)
				4063	{
				4064	case CHAR_MINUS:
				4065	if (!hyphenok)
				4066	{
				4067	errorcode = ERR94;
				4068	ptr--; /* Correct the offset */
				4069	goto FAILED;
				4070	}
				4071	optset = &unset;
				4072	hyphenok = FALSE;
				4073	break;
				4074
				4075	case CHAR_J: /* Record that it changed in the external options */
				4076	*optset \|= PCRE2_DUPNAMES;
				4077	cb->external_flags \|= PCRE2_JCHANGED;
				4078	break;
				4079
				4080	case CHAR_i: *optset \|= PCRE2_CASELESS; break;
				4081	case CHAR_m: *optset \|= PCRE2_MULTILINE; break;
				4082	case CHAR_n: *optset \|= PCRE2_NO_AUTO_CAPTURE; break;
				4083	case CHAR_s: *optset \|= PCRE2_DOTALL; break;
				4084	case CHAR_U: *optset \|= PCRE2_UNGREEDY; break;
				4085
				4086	/* If x appears twice it sets the extended extended option. */
				4087
				4088	case CHAR_x:
				4089	*optset \|= PCRE2_EXTENDED;
				4090	if (ptr < ptrend && *ptr == CHAR_x)
				4091	{
				4092	*optset \|= PCRE2_EXTENDED_MORE;
				4093	ptr++;
				4094	}
				4095	break;
				4096
				4097	default:
				4098	errorcode = ERR11;
				4099	ptr--; /* Correct the offset */
				4100	goto FAILED;
				4101	}
				4102	}
				4103
				4104	/* If we are setting extended without extended-more, ensure that any
				4105	existing extended-more gets unset. Also, unsetting extended must also
				4106	unset extended-more. */
				4107
				4108	if ((set & (PCRE2_EXTENDED\|PCRE2_EXTENDED_MORE)) == PCRE2_EXTENDED \|\|
				4109	(unset & PCRE2_EXTENDED) != 0)
				4110	unset \|= PCRE2_EXTENDED_MORE;
				4111
				4112	options = (options \| set) & (~unset);
				4113
				4114	/* If the options ended with ')' this is not the start of a nested
				4115	group with option changes, so the options change at this level.
				4116	In this case, if the previous level set up a nest block, discard the
				4117	one we have just created. Otherwise adjust it for the previous level.
				4118	If the options ended with ':' we are starting a non-capturing group,
				4119	possibly with an options setting. */
				4120
				4121	if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
				4122	if (*ptr++ == CHAR_RIGHT_PARENTHESIS)
				4123	{
				4124	nest_depth--; /* This is not a nested group after all. */
				4125	if (top_nest > (nest_save *)(cb->start_workspace) &&
				4126	(top_nest-1)->nest_depth == nest_depth) top_nest--;
				4127	else top_nest->nest_depth = nest_depth;
				4128	}
				4129	else *parsed_pattern++ = META_NOCAPTURE;
				4130
				4131	/* If nothing changed, no need to record. */
				4132
				4133	if (options != oldoptions)
				4134	{
				4135	*parsed_pattern++ = META_OPTIONS;
				4136	*parsed_pattern++ = options;
				4137	}
				4138	} /* End options processing */
				4139	break; /* End default case after (? */
				4140
				4141
				4142	/* ---- Python syntax support ---- */
				4143
				4144	case CHAR_P:
				4145	if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
				4146
				4147	/* (?P<name> is the same as (?<name>, which defines a named group. */
				4148
				4149	if (*ptr == CHAR_LESS_THAN_SIGN)
				4150	{
				4151	terminator = CHAR_GREATER_THAN_SIGN;
				4152	goto DEFINE_NAME;
				4153	}
				4154
				4155	/* (?P>name) is the same as (?&name), which is a recursion or subroutine
				4156	call. */
				4157
				4158	if (*ptr == CHAR_GREATER_THAN_SIGN) goto RECURSE_BY_NAME;
				4159
				4160	/* (?P=name) is the same as \k<name>, a back reference by name. Anything
				4161	else after (?P is an error. */
				4162
				4163	if (*ptr != CHAR_EQUALS_SIGN)
				4164	{
				4165	errorcode = ERR41;
				4166	goto FAILED;
				4167	}
				4168	if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name,
				4169	&namelen, &errorcode, cb)) goto FAILED;
				4170	*parsed_pattern++ = META_BACKREF_BYNAME;
				4171	*parsed_pattern++ = namelen;
				4172	PUTOFFSET(offset, parsed_pattern);
				4173	okquantifier = TRUE;
				4174	break; /* End of (?P processing */
				4175
				4176
				4177	/* ---- Recursion/subroutine calls by number ---- */
				4178
				4179	case CHAR_R:
				4180	i = 0; /* (?R) == (?R0) */
				4181	ptr++;
				4182	if (ptr >= ptrend \|\| *ptr != CHAR_RIGHT_PARENTHESIS)
				4183	{
				4184	errorcode = ERR58;
				4185	goto FAILED;
				4186	}
				4187	goto SET_RECURSION;
				4188
				4189	/* An item starting (?- followed by a digit comes here via the "default"
				4190	case because (?- followed by a non-digit is an options setting. */
				4191
				4192	case CHAR_PLUS:
				4193	if (ptrend - ptr < 2 \|\| !IS_DIGIT(ptr[1]))
				4194	{
				4195	errorcode = ERR29; /* Missing number */
				4196	goto FAILED;
				4197	}
				4198	/* Fall through */
				4199
				4200	case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
				4201	case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
				4202	RECURSION_BYNUMBER:
				4203	if (!read_number(&ptr, ptrend,
				4204	(IS_DIGIT(ptr))? -1:(int)(cb->bracount), / + and - are relative */
				4205	MAX_GROUP_NUMBER, ERR61,
				4206	&i, &errorcode)) goto FAILED;
				4207	if (i < 0) /* NB (?0) is permitted */
				4208	{
				4209	errorcode = ERR15; /* Unknown group */
				4210	goto FAILED_BACK;
				4211	}
				4212	if (ptr >= ptrend \|\| *ptr != CHAR_RIGHT_PARENTHESIS)
				4213	goto UNCLOSED_PARENTHESIS;
				4214
				4215	SET_RECURSION:
				4216	*parsed_pattern++ = META_RECURSE \| (uint32_t)i;
				4217	offset = (PCRE2_SIZE)(ptr - cb->start_pattern);
				4218	ptr++;
				4219	PUTOFFSET(offset, parsed_pattern);
				4220	okquantifier = TRUE;
				4221	break; /* End of recursive call by number handling */
				4222
				4223
				4224	/* ---- Recursion/subroutine calls by name ---- */
				4225
				4226	case CHAR_AMPERSAND:
				4227	RECURSE_BY_NAME:
				4228	if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name,
				4229	&namelen, &errorcode, cb)) goto FAILED;
				4230	*parsed_pattern++ = META_RECURSE_BYNAME;
				4231	*parsed_pattern++ = namelen;
				4232	PUTOFFSET(offset, parsed_pattern);
				4233	okquantifier = TRUE;
				4234	break;
				4235
				4236	/* ---- Callout with numerical or string argument ---- */
				4237
				4238	case CHAR_C:
				4239	if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
				4240
				4241	/* If the previous item was a condition starting (?(? an assertion,
				4242	optionally preceded by a callout, is expected. This is checked later on,
				4243	during actual compilation. However we need to identify this kind of
				4244	assertion in this pass because it must not be qualified. The value of
				4245	expect_cond_assert is set to 2 after (?(? is processed. We decrement it
				4246	for a callout - still leaving a positive value that identifies the
				4247	assertion. Multiple callouts or any other items will make it zero or
				4248	less, which doesn't matter because they will cause an error later. */
				4249
				4250	expect_cond_assert = prev_expect_cond_assert - 1;
				4251
				4252	/* If previous_callout is not NULL, it means this follows a previous
				4253	callout. If it was a manual callout, do nothing; this means its "length
				4254	of next pattern item" field will remain zero. If it was an automatic
				4255	callout, abolish it. */
				4256
				4257	if (previous_callout != NULL && (options & PCRE2_AUTO_CALLOUT) != 0 &&
				4258	previous_callout == parsed_pattern - 4 &&
				4259	parsed_pattern[-1] == 255)
				4260	parsed_pattern = previous_callout;
				4261
				4262	/* Save for updating next pattern item length, and skip one item before
				4263	completing. */
				4264
				4265	previous_callout = parsed_pattern;
				4266	after_manual_callout = 1;
				4267
				4268	/* Handle a string argument; specific delimiter is required. */
				4269
				4270	if (ptr != CHAR_RIGHT_PARENTHESIS && !IS_DIGIT(ptr))
				4271	{
				4272	PCRE2_SIZE calloutlength;
				4273	PCRE2_SPTR startptr = ptr;
				4274
				4275	delimiter = 0;
				4276	for (i = 0; PRIV(callout_start_delims)[i] != 0; i++)
				4277	{
				4278	if (*ptr == PRIV(callout_start_delims)[i])
				4279	{
				4280	delimiter = PRIV(callout_end_delims)[i];
				4281	break;
				4282	}
				4283	}
				4284	if (delimiter == 0)
				4285	{
				4286	errorcode = ERR82;
				4287	goto FAILED;
				4288	}
				4289
				4290	*parsed_pattern = META_CALLOUT_STRING;
				4291	parsed_pattern += 3; /* Skip pattern info */
				4292
				4293	for (;;)
				4294	{
				4295	if (++ptr >= ptrend)
				4296	{
				4297	errorcode = ERR81;
				4298	ptr = startptr; /* To give a more useful message */
				4299	goto FAILED;
				4300	}
				4301	if (ptr == delimiter && (++ptr >= ptrend \|\| ptr != delimiter))
				4302	break;
				4303	}
				4304
				4305	calloutlength = (PCRE2_SIZE)(ptr - startptr);
				4306	if (calloutlength > UINT32_MAX)
				4307	{
				4308	errorcode = ERR72;
				4309	goto FAILED;
				4310	}
				4311	*parsed_pattern++ = (uint32_t)calloutlength;
				4312	offset = (PCRE2_SIZE)(startptr - cb->start_pattern);
				4313	PUTOFFSET(offset, parsed_pattern);
				4314	}
				4315
				4316	/* Handle a callout with an optional numerical argument, which must be
				4317	less than or equal to 255. A missing argument gives 0. */
				4318
				4319	else
				4320	{
				4321	int n = 0;
				4322	parsed_pattern = META_CALLOUT_NUMBER; / Numerical callout */
				4323	parsed_pattern += 3; /* Skip pattern info */
				4324	while (ptr < ptrend && IS_DIGIT(*ptr))
				4325	{
				4326	n = n * 10 + *ptr++ - CHAR_0;
				4327	if (n > 255)
				4328	{
				4329	errorcode = ERR38;
				4330	goto FAILED;
				4331	}
				4332	}
				4333	*parsed_pattern++ = n;
				4334	}
				4335
				4336	/* Both formats must have a closing parenthesis */
				4337
				4338	if (ptr >= ptrend \|\| *ptr != CHAR_RIGHT_PARENTHESIS)
				4339	{
				4340	errorcode = ERR39;
				4341	goto FAILED;
				4342	}
				4343	ptr++;
				4344
				4345	/* Remember the offset to the next item in the pattern, and set a default
				4346	length. This should get updated after the next item is read. */
				4347
				4348	previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
				4349	previous_callout[2] = 0;
				4350	break; /* End callout */
				4351
				4352
				4353	/* ---- Conditional group ---- */
				4354
				4355	/* A condition can be an assertion, a number (referring to a numbered
				4356	group's having been set), a name (referring to a named group), or 'R',
				4357	referring to overall recursion. R<digits> and R&name are also permitted
				4358	for recursion state tests. Numbers may be preceded by + or - to specify a
				4359	relative group number.
				4360
				4361	There are several syntaxes for testing a named group: (?(name)) is used
				4362	by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
				4363
				4364	There are two unfortunate ambiguities. 'R' can be the recursive thing or
				4365	the name 'R' (and similarly for 'R' followed by digits). 'DEFINE' can be
				4366	the Perl DEFINE feature or the Python named test. We look for a name
				4367	first; if not found, we try the other case.
				4368
				4369	For compatibility with auto-callouts, we allow a callout to be specified
				4370	before a condition that is an assertion. */
				4371
				4372	case CHAR_LEFT_PARENTHESIS:
				4373	if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
				4374	nest_depth++;
				4375
				4376	/* If the next character is ? or * there must be an assertion next
				4377	(optionally preceded by a callout). We do not check this here, but
				4378	instead we set expect_cond_assert to 2. If this is still greater than
				4379	zero (callouts decrement it) when the next assertion is read, it will be
				4380	marked as a condition that must not be repeated. A value greater than
				4381	zero also causes checking that an assertion (possibly with callout)
				4382	follows. */
				4383
				4384	if (ptr == CHAR_QUESTION_MARK \|\| ptr == CHAR_ASTERISK)
				4385	{
				4386	*parsed_pattern++ = META_COND_ASSERT;
				4387	ptr--; /* Pull pointer back to the opening parenthesis. */
				4388	expect_cond_assert = 2;
				4389	break; /* End of conditional */
				4390	}
				4391
				4392	/* Handle (?([+-]number)... */
				4393
				4394	if (read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
				4395	&errorcode))
				4396	{
				4397	if (i <= 0)
				4398	{
				4399	errorcode = ERR15;
				4400	goto FAILED;
				4401	}
				4402	*parsed_pattern++ = META_COND_NUMBER;
				4403	offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
				4404	PUTOFFSET(offset, parsed_pattern);
				4405	*parsed_pattern++ = i;
				4406	}
				4407	else if (errorcode != 0) goto FAILED; /* Number too big */
				4408
				4409	/* No number found. Handle the special case (?(VERSION[>]=n.m)... */
				4410
				4411	else if (ptrend - ptr >= 10 &&
				4412	PRIV(strncmp_c8)(ptr, STRING_VERSION, 7) == 0 &&
				4413	ptr[7] != CHAR_RIGHT_PARENTHESIS)
				4414	{
				4415	uint32_t ge = 0;
				4416	int major = 0;
				4417	int minor = 0;
				4418
				4419	ptr += 7;
				4420	if (*ptr == CHAR_GREATER_THAN_SIGN)
				4421	{
				4422	ge = 1;
				4423	ptr++;
				4424	}
				4425
				4426	/* NOTE: cannot write IS_DIGIT(*(++ptr)) here because IS_DIGIT
				4427	references its argument twice. */
				4428
				4429	if (ptr != CHAR_EQUALS_SIGN \|\| (ptr++, !IS_DIGIT(ptr)))
				4430	goto BAD_VERSION_CONDITION;
				4431
				4432	if (!read_number(&ptr, ptrend, -1, 1000, ERR79, &major, &errorcode))
				4433	goto FAILED;
				4434
				4435	if (ptr >= ptrend) goto BAD_VERSION_CONDITION;
				4436	if (*ptr == CHAR_DOT)
				4437	{
				4438	if (++ptr >= ptrend \|\| !IS_DIGIT(*ptr)) goto BAD_VERSION_CONDITION;
				4439	minor = (ptr++ - CHAR_0) 10;
				4440	if (ptr >= ptrend) goto BAD_VERSION_CONDITION;
				4441	if (IS_DIGIT(ptr)) minor += ptr++ - CHAR_0;
				4442	if (ptr >= ptrend \|\| *ptr != CHAR_RIGHT_PARENTHESIS)
				4443	goto BAD_VERSION_CONDITION;
				4444	}
				4445
				4446	*parsed_pattern++ = META_COND_VERSION;
				4447	*parsed_pattern++ = ge;
				4448	*parsed_pattern++ = major;
				4449	*parsed_pattern++ = minor;
				4450	}
				4451
				4452	/* All the remaining cases now require us to read a name. We cannot at
				4453	this stage distinguish ambiguous cases such as (?(R12) which might be a
				4454	recursion test by number or a name, because the named groups have not yet
				4455	all been identified. Those cases are treated as names, but given a
				4456	different META code. */
				4457
				4458	else
				4459	{
				4460	BOOL was_r_ampersand = FALSE;
				4461
				4462	if (*ptr == CHAR_R && ptrend - ptr > 1 && ptr[1] == CHAR_AMPERSAND)
				4463	{
				4464	terminator = CHAR_RIGHT_PARENTHESIS;
				4465	was_r_ampersand = TRUE;
				4466	ptr++;
				4467	}
				4468	else if (*ptr == CHAR_LESS_THAN_SIGN)
				4469	terminator = CHAR_GREATER_THAN_SIGN;
				4470	else if (*ptr == CHAR_APOSTROPHE)
				4471	terminator = CHAR_APOSTROPHE;
				4472	else
				4473	{
				4474	terminator = CHAR_RIGHT_PARENTHESIS;
				4475	ptr--; /* Point to char before name */
				4476	}
				4477	if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
				4478	&errorcode, cb)) goto FAILED;
				4479
				4480	/* Handle (?(R&name) */
				4481
				4482	if (was_r_ampersand)
				4483	{
				4484	*parsed_pattern = META_COND_RNAME;
				4485	ptr--; /* Back to closing parens */
				4486	}
				4487
				4488	/* Handle (?(name). If the name is "DEFINE" we identify it with a
				4489	special code. Likewise if the name consists of R followed only by
				4490	digits. Otherwise, handle it like a quoted name. */
				4491
				4492	else if (terminator == CHAR_RIGHT_PARENTHESIS)
				4493	{
				4494	if (namelen == 6 && PRIV(strncmp_c8)(name, STRING_DEFINE, 6) == 0)
				4495	*parsed_pattern = META_COND_DEFINE;
				4496	else
				4497	{
				4498	for (i = 1; i < (int)namelen; i++)
				4499	if (!IS_DIGIT(name[i])) break;
				4500	parsed_pattern = (name == CHAR_R && i >= (int)namelen)?
				4501	META_COND_RNUMBER : META_COND_NAME;
				4502	}
				4503	ptr--; /* Back to closing parens */
				4504	}
				4505
				4506	/* Handle (?('name') or (?(<name>) */
				4507
				4508	else *parsed_pattern = META_COND_NAME;
				4509
				4510	/* All these cases except DEFINE end with the name length and offset;
				4511	DEFINE just has an offset (for the "too many branches" error). */
				4512
				4513	if (parsed_pattern++ != META_COND_DEFINE) parsed_pattern++ = namelen;
				4514	PUTOFFSET(offset, parsed_pattern);
				4515	} /* End cases that read a name */
				4516
				4517	/* Check the closing parenthesis of the condition */
				4518
				4519	if (ptr >= ptrend \|\| *ptr != CHAR_RIGHT_PARENTHESIS)
				4520	{
				4521	errorcode = ERR24;
				4522	goto FAILED;
				4523	}
				4524	ptr++;
				4525	break; /* End of condition processing */
				4526
				4527
				4528	/* ---- Atomic group ---- */
				4529
				4530	case CHAR_GREATER_THAN_SIGN:
				4531	ATOMIC_GROUP: /* Come from (atomic: /
				4532	*parsed_pattern++ = META_ATOMIC;
				4533	nest_depth++;
				4534	ptr++;
				4535	break;
				4536
				4537
				4538	/* ---- Lookahead assertions ---- */
				4539
				4540	case CHAR_EQUALS_SIGN:
				4541	POSITIVE_LOOK_AHEAD: /* Come from (pla: /
				4542	*parsed_pattern++ = META_LOOKAHEAD;
				4543	ptr++;
				4544	goto POST_ASSERTION;
				4545
				4546	case CHAR_ASTERISK:
				4547	POSITIVE_NONATOMIC_LOOK_AHEAD: /* Come from (?* */
				4548	*parsed_pattern++ = META_LOOKAHEAD_NA;
				4549	ptr++;
				4550	goto POST_ASSERTION;
				4551
				4552	case CHAR_EXCLAMATION_MARK:
				4553	NEGATIVE_LOOK_AHEAD: /* Come from (nla: /
				4554	*parsed_pattern++ = META_LOOKAHEADNOT;
				4555	ptr++;
				4556	goto POST_ASSERTION;
				4557
				4558
				4559	/* ---- Lookbehind assertions ---- */
				4560
				4561	/* (?< followed by = or ! or * is a lookbehind assertion. Otherwise (?<
				4562	is the start of the name of a capturing group. */
				4563
				4564	case CHAR_LESS_THAN_SIGN:
				4565	if (ptrend - ptr <= 1 \|\|
				4566	(ptr[1] != CHAR_EQUALS_SIGN &&
				4567	ptr[1] != CHAR_EXCLAMATION_MARK &&
				4568	ptr[1] != CHAR_ASTERISK))
				4569	{
				4570	terminator = CHAR_GREATER_THAN_SIGN;
				4571	goto DEFINE_NAME;
				4572	}
				4573	*parsed_pattern++ = (ptr[1] == CHAR_EQUALS_SIGN)?
				4574	META_LOOKBEHIND : (ptr[1] == CHAR_EXCLAMATION_MARK)?
				4575	META_LOOKBEHINDNOT : META_LOOKBEHIND_NA;
				4576
				4577	POST_LOOKBEHIND: /* Come from (plb: (naplb: and (nlb: /
				4578	*has_lookbehind = TRUE;
				4579	offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
				4580	PUTOFFSET(offset, parsed_pattern);
				4581	ptr += 2;
				4582	/* Fall through */
				4583
				4584	/* If the previous item was a condition starting (?(? an assertion,
				4585	optionally preceded by a callout, is expected. This is checked later on,
				4586	during actual compilation. However we need to identify this kind of
				4587	assertion in this pass because it must not be qualified. The value of
				4588	expect_cond_assert is set to 2 after (?(? is processed. We decrement it
				4589	for a callout - still leaving a positive value that identifies the
				4590	assertion. Multiple callouts or any other items will make it zero or
				4591	less, which doesn't matter because they will cause an error later. */
				4592
				4593	POST_ASSERTION:
				4594	nest_depth++;
				4595	if (prev_expect_cond_assert > 0)
				4596	{
				4597	if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
				4598	else if (++top_nest >= end_nests)
				4599	{
				4600	errorcode = ERR84;
				4601	goto FAILED;
				4602	}
				4603	top_nest->nest_depth = nest_depth;
				4604	top_nest->flags = NSF_CONDASSERT;
				4605	top_nest->options = options & PARSE_TRACKED_OPTIONS;
				4606	}
				4607	break;
				4608
				4609
				4610	/* ---- Define a named group ---- */
				4611
				4612	/* A named group may be defined as (?'name') or (?<name>). In the latter
				4613	case we jump to DEFINE_NAME from the disambiguation of (?< above with the
				4614	terminator set to '>'. */
				4615
				4616	case CHAR_APOSTROPHE:
				4617	terminator = CHAR_APOSTROPHE; /* Terminator */
				4618
				4619	DEFINE_NAME:
				4620	if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
				4621	&errorcode, cb)) goto FAILED;
				4622
				4623	/* We have a name for this capturing group. It is also assigned a number,
				4624	which is its primary means of identification. */
				4625
				4626	if (cb->bracount >= MAX_GROUP_NUMBER)
				4627	{
				4628	errorcode = ERR97;
				4629	goto FAILED;
				4630	}
				4631	cb->bracount++;
				4632	*parsed_pattern++ = META_CAPTURE \| cb->bracount;
				4633	nest_depth++;
				4634
				4635	/* Check not too many names */
				4636
				4637	if (cb->names_found >= MAX_NAME_COUNT)
				4638	{
				4639	errorcode = ERR49;
				4640	goto FAILED;
				4641	}
				4642
				4643	/* Adjust the entry size to accommodate the longest name found. */
				4644
				4645	if (namelen + IMM2_SIZE + 1 > cb->name_entry_size)
				4646	cb->name_entry_size = (uint16_t)(namelen + IMM2_SIZE + 1);
				4647
				4648	/* Scan the list to check for duplicates. For duplicate names, if the
				4649	number is the same, break the loop, which causes the name to be
				4650	discarded; otherwise, if DUPNAMES is not set, give an error.
				4651	If it is set, allow the name with a different number, but continue
				4652	scanning in case this is a duplicate with the same number. For
				4653	non-duplicate names, give an error if the number is duplicated. */
				4654
				4655	isdupname = FALSE;
				4656	ng = cb->named_groups;
				4657	for (i = 0; i < cb->names_found; i++, ng++)
				4658	{
				4659	if (namelen == ng->length &&
				4660	PRIV(strncmp)(name, ng->name, (PCRE2_SIZE)namelen) == 0)
				4661	{
				4662	if (ng->number == cb->bracount) break;
				4663	if ((options & PCRE2_DUPNAMES) == 0)
				4664	{
				4665	errorcode = ERR43;
				4666	goto FAILED;
				4667	}
				4668	isdupname = ng->isdup = TRUE; /* Mark as a duplicate */
				4669	cb->dupnames = TRUE; /* Duplicate names exist */
				4670	}
				4671	else if (ng->number == cb->bracount)
				4672	{
				4673	errorcode = ERR65;
				4674	goto FAILED;
				4675	}
				4676	}
				4677
				4678	if (i < cb->names_found) break; /* Ignore duplicate with same number */
				4679
				4680	/* Increase the list size if necessary */
				4681
				4682	if (cb->names_found >= cb->named_group_list_size)
				4683	{
				4684	uint32_t newsize = cb->named_group_list_size * 2;
				4685	named_group *newspace =
				4686	cb->cx->memctl.malloc(newsize * sizeof(named_group),
				4687	cb->cx->memctl.memory_data);
				4688	if (newspace == NULL)
				4689	{
				4690	errorcode = ERR21;
				4691	goto FAILED;
				4692	}
				4693
				4694	memcpy(newspace, cb->named_groups,
				4695	cb->named_group_list_size * sizeof(named_group));
				4696	if (cb->named_group_list_size > NAMED_GROUP_LIST_SIZE)
				4697	cb->cx->memctl.free((void *)cb->named_groups,
				4698	cb->cx->memctl.memory_data);
				4699	cb->named_groups = newspace;
				4700	cb->named_group_list_size = newsize;
				4701	}
				4702
				4703	/* Add this name to the list */
				4704
				4705	cb->named_groups[cb->names_found].name = name;
				4706	cb->named_groups[cb->names_found].length = (uint16_t)namelen;
				4707	cb->named_groups[cb->names_found].number = cb->bracount;
				4708	cb->named_groups[cb->names_found].isdup = (uint16_t)isdupname;
				4709	cb->names_found++;
				4710	break;
				4711	} /* End of (? switch */
				4712	break; /* End of ( handling */
				4713
				4714
				4715	/* ---- Branch terminators ---- */
				4716
				4717	/* Alternation: reset the capture count if we are in a (?\| group. */
				4718
				4719	case CHAR_VERTICAL_LINE:
				4720	if (top_nest != NULL && top_nest->nest_depth == nest_depth &&
				4721	(top_nest->flags & NSF_RESET) != 0)
				4722	{
				4723	if (cb->bracount > top_nest->max_group)
				4724	top_nest->max_group = (uint16_t)cb->bracount;
				4725	cb->bracount = top_nest->reset_group;
				4726	}
				4727	*parsed_pattern++ = META_ALT;
				4728	break;
				4729
				4730	/* End of group; reset the capture count to the maximum if we are in a (?\|
				4731	group and/or reset the options that are tracked during parsing. Disallow
				4732	quantifier for a condition that is an assertion. */
				4733
				4734	case CHAR_RIGHT_PARENTHESIS:
				4735	okquantifier = TRUE;
				4736	if (top_nest != NULL && top_nest->nest_depth == nest_depth)
				4737	{
				4738	options = (options & ~PARSE_TRACKED_OPTIONS) \| top_nest->options;
				4739	if ((top_nest->flags & NSF_RESET) != 0 &&
				4740	top_nest->max_group > cb->bracount)
				4741	cb->bracount = top_nest->max_group;
				4742	if ((top_nest->flags & NSF_CONDASSERT) != 0)
				4743	okquantifier = FALSE;
				4744
				4745	if ((top_nest->flags & NSF_ATOMICSR) != 0)
				4746	{
				4747	*parsed_pattern++ = META_KET;
				4748	}
				4749
				4750	if (top_nest == (nest_save *)(cb->start_workspace)) top_nest = NULL;
				4751	else top_nest--;
				4752	}
				4753	if (nest_depth == 0) /* Unmatched closing parenthesis */
				4754	{
				4755	errorcode = ERR22;
				4756	goto FAILED_BACK;
				4757	}
				4758	nest_depth--;
				4759	*parsed_pattern++ = META_KET;
				4760	break;
				4761	} /* End of switch on pattern character */
				4762	} /* End of main character scan loop */
				4763
				4764	/* End of pattern reached. Check for missing ) at the end of a verb name. */
				4765
				4766	if (inverbname && ptr >= ptrend)
				4767	{
				4768	errorcode = ERR60;
				4769	goto FAILED;
				4770	}
				4771
				4772	/* Manage callout for the final item */
				4773
				4774	PARSED_END:
				4775	parsed_pattern = manage_callouts(ptr, &previous_callout, auto_callout,
				4776	parsed_pattern, cb);
				4777
				4778	/* Insert trailing items for word and line matching (features provided for the
				4779	benefit of pcre2grep). */
				4780
				4781	if ((extra_options & PCRE2_EXTRA_MATCH_LINE) != 0)
				4782	{
				4783	*parsed_pattern++ = META_KET;
				4784	*parsed_pattern++ = META_DOLLAR;
				4785	}
				4786	else if ((extra_options & PCRE2_EXTRA_MATCH_WORD) != 0)
				4787	{
				4788	*parsed_pattern++ = META_KET;
				4789	*parsed_pattern++ = META_ESCAPE + ESC_b;
				4790	}
				4791
				4792	/* Terminate the parsed pattern, then return success if all groups are closed.
				4793	Otherwise we have unclosed parentheses. */
				4794
				4795	if (parsed_pattern >= parsed_pattern_end)
				4796	{
				4797	errorcode = ERR63; /* Internal error (parsed pattern overflow) */
				4798	goto FAILED;
				4799	}
				4800
				4801	*parsed_pattern = META_END;
				4802	if (nest_depth == 0) return 0;
				4803
				4804	UNCLOSED_PARENTHESIS:
				4805	errorcode = ERR14;
				4806
				4807	/* Come here for all failures. */
				4808
				4809	FAILED:
				4810	cb->erroroffset = (PCRE2_SIZE)(ptr - cb->start_pattern);
				4811	return errorcode;
				4812
				4813	/* Some errors need to indicate the previous character. */
				4814
				4815	FAILED_BACK:
				4816	ptr--;
				4817	goto FAILED;
				4818
				4819	/* This failure happens several times. */
				4820
				4821	BAD_VERSION_CONDITION:
				4822	errorcode = ERR79;
				4823	goto FAILED;
				4824	}
				4825
				4826
				4827
				4828	/*************************************************
				4829	* Find first significant opcode *
				4830	*************************************************/
				4831
				4832	/* This is called by several functions that scan a compiled expression looking
				4833	for a fixed first character, or an anchoring opcode etc. It skips over things
				4834	that do not influence this. For some calls, it makes sense to skip negative
				4835	forward and all backward assertions, and also the \b assertion; for others it
				4836	does not.
				4837
				4838	Arguments:
				4839	code pointer to the start of the group
				4840	skipassert TRUE if certain assertions are to be skipped
				4841
				4842	Returns: pointer to the first significant opcode
				4843	*/
				4844
				4845	static const PCRE2_UCHAR*
				4846	first_significant_code(PCRE2_SPTR code, BOOL skipassert)
				4847	{
				4848	for (;;)
				4849	{
				4850	switch ((int)*code)
				4851	{
				4852	case OP_ASSERT_NOT:
				4853	case OP_ASSERTBACK:
				4854	case OP_ASSERTBACK_NOT:
				4855	case OP_ASSERTBACK_NA:
				4856	if (!skipassert) return code;
				4857	do code += GET(code, 1); while (*code == OP_ALT);
				4858	code += PRIV(OP_lengths)[*code];
				4859	break;
				4860
				4861	case OP_WORD_BOUNDARY:
				4862	case OP_NOT_WORD_BOUNDARY:
				4863	if (!skipassert) return code;
				4864	/* Fall through */
				4865
				4866	case OP_CALLOUT:
				4867	case OP_CREF:
				4868	case OP_DNCREF:
				4869	case OP_RREF:
				4870	case OP_DNRREF:
				4871	case OP_FALSE:
				4872	case OP_TRUE:
				4873	code += PRIV(OP_lengths)[*code];
				4874	break;
				4875
				4876	case OP_CALLOUT_STR:
				4877	code += GET(code, 1 + 2*LINK_SIZE);
				4878	break;
				4879
				4880	case OP_SKIPZERO:
				4881	code += 2 + GET(code, 2) + LINK_SIZE;
				4882	break;
				4883
				4884	case OP_COND:
				4885	case OP_SCOND:
				4886	if (code[1+LINK_SIZE] != OP_FALSE \|\| /* Not DEFINE */
				4887	code[GET(code, 1)] != OP_KET) /* More than one branch */
				4888	return code;
				4889	code += GET(code, 1) + 1 + LINK_SIZE;
				4890	break;
				4891
				4892	case OP_MARK:
				4893	case OP_COMMIT_ARG:
				4894	case OP_PRUNE_ARG:
				4895	case OP_SKIP_ARG:
				4896	case OP_THEN_ARG:
				4897	code += code[1] + PRIV(OP_lengths)[*code];
				4898	break;
				4899
				4900	default:
				4901	return code;
				4902	}
				4903	}
				4904	/* Control never reaches here */
				4905	}
				4906
				4907
				4908
				#ifdef SUPPORT_UNICODE
				/*************************************************
				*           Get othercase range                  *
				*************************************************/

				/* This function is passed the start and end of a class range in UCP mode. It
				searches up the characters, looking for ranges of characters in the "other"
				case. Each call returns the next one, updating the start address. A character
				with multiple other cases is returned on its own with a special return value.

				Arguments:
				  cptr        points to starting character value; updated
				  d           end value
				  ocptr       where to put start of othercase range
				  odptr       where to put end of othercase range

				Yield:        -1 when no more
				               0 when a range is returned
				              >0 the CASESET offset for char with multiple other cases
				                in this case, ocptr contains the original
				*/

				static int
				get_othercase_range(uint32_t *cptr, uint32_t d, uint32_t *ocptr,
				  uint32_t *odptr)
				{
				uint32_t c, othercase, next;
				unsigned int co;

				/* Find the first character that has an other case. If it has multiple other
				cases, return its case offset value. */

				for (c = *cptr; c <= d; c++)
				  {
				  if ((co = UCD_CASESET(c)) != 0)
				    {
				    *ocptr = c++;   /* Character that has the set */
				    *cptr = c;      /* Rest of input range */
				    return (int)co;
				    }
				  if ((othercase = UCD_OTHERCASE(c)) != c) break;
				  }

				if (c > d) return -1;  /* Reached end of range */

				/* Found a character that has a single other case. Search for the end of the
				range, which is either the end of the input range, or a character that has zero
				or more than one other cases. */

				*ocptr = othercase;
				next = othercase + 1;

				for (++c; c <= d; c++)
				  {
				  if ((co = UCD_CASESET(c)) != 0 || UCD_OTHERCASE(c) != next) break;
				  next++;
				  }

				*odptr = next - 1;     /* End of othercase range */
				*cptr = c;             /* Rest of input range */
				return 0;
				}
				#endif  /* SUPPORT_UNICODE */
				4972
				4973
				4974
				4975	/*************************************************
				4976	* Add a character or range to a class (internal) *
				4977	*************************************************/
				4978
				4979	/* This function packages up the logic of adding a character or range of
				4980	characters to a class. The character values in the arguments will be within the
				4981	valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
				4982	called only from within the "add to class" group of functions, some of which
				4983	are recursive and mutually recursive. The external entry point is
				4984	add_to_class().
				4985
				4986	Arguments:
				4987	classbits the bit map for characters < 256
				4988	uchardptr points to the pointer for extra data
				4989	options the options word
				4990	cb compile data
				4991	start start of range character
				4992	end end of range character
				4993
				4994	Returns: the number of < 256 characters added
				4995	the pointer to extra data is updated
				4996	*/
				4997
				4998	static unsigned int
				4999	add_to_class_internal(uint8_t classbits, PCRE2_UCHAR *uchardptr,
				5000	uint32_t options, compile_block *cb, uint32_t start, uint32_t end)
				5001	{
				5002	uint32_t c;
				5003	uint32_t classbits_end = (end <= 0xff ? end : 0xff);
				5004	unsigned int n8 = 0;
				5005
				5006	/* If caseless matching is required, scan the range and process alternate
				5007	cases. In Unicode, there are 8-bit characters that have alternate cases that
				5008	are greater than 255 and vice-versa. Sometimes we can just extend the original
				5009	range. */
				5010
				5011	if ((options & PCRE2_CASELESS) != 0)
				5012	{
				5013	#ifdef SUPPORT_UNICODE
				5014	if ((options & (PCRE2_UTF\|PCRE2_UCP)) != 0)
				5015	{
				5016	int rc;
				5017	uint32_t oc, od;
				5018
				5019	options &= ~PCRE2_CASELESS; /* Remove for recursive calls */
				5020	c = start;
				5021
				5022	while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0)
				5023	{
				5024	/* Handle a single character that has more than one other case. */
				5025
				5026	if (rc > 0) n8 += add_list_to_class_internal(classbits, uchardptr, options, cb,
				5027	PRIV(ucd_caseless_sets) + rc, oc);
				5028
				5029	/* Do nothing if the other case range is within the original range. */
				5030
				5031	else if (oc >= cb->class_range_start && od <= cb->class_range_end) continue;
				5032
				5033	/* Extend the original range if there is overlap, noting that if oc < c, we
				5034	can't have od > end because a subrange is always shorter than the basic
				5035	range. Otherwise, use a recursive call to add the additional range. */
				5036
				5037	else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
				5038	else if (od > end && oc <= end + 1)
				5039	{
				5040	end = od; /* Extend upwards */
				5041	if (end > classbits_end) classbits_end = (end <= 0xff ? end : 0xff);
				5042	}
				5043	else n8 += add_to_class_internal(classbits, uchardptr, options, cb, oc, od);
				5044	}
				5045	}
				5046	else
				5047	#endif /* SUPPORT_UNICODE */
				5048
				5049	/* Not UTF mode */
				5050
				5051	for (c = start; c <= classbits_end; c++)
				5052	{
				5053	SETBIT(classbits, cb->fcc[c]);
				5054	n8++;
				5055	}
				5056	}
				5057
				5058	/* Now handle the originally supplied range. Adjust the final value according
				5059	to the bit length - this means that the same lists of (e.g.) horizontal spaces
				5060	can be used in all cases. */
				5061
				5062	if ((options & PCRE2_UTF) == 0 && end > MAX_NON_UTF_CHAR)
				5063	end = MAX_NON_UTF_CHAR;
				5064
				5065	if (start > cb->class_range_start && end < cb->class_range_end) return n8;
				5066
				5067	/* Use the bitmap for characters < 256. Otherwise use extra data.*/
				5068
				5069	for (c = start; c <= classbits_end; c++)
				5070	{
				5071	/* Regardless of start, c will always be <= 255. */
				5072	SETBIT(classbits, c);
				5073	n8++;
				5074	}
				5075
				5076	#ifdef SUPPORT_WIDE_CHARS
				5077	if (start <= 0xff) start = 0xff + 1;
				5078
				5079	if (end >= start)
				5080	{
				5081	PCRE2_UCHAR uchardata = uchardptr;
				5082
				5083	#ifdef SUPPORT_UNICODE
				5084	if ((options & PCRE2_UTF) != 0)
				5085	{
				5086	if (start < end)
				5087	{
				5088	*uchardata++ = XCL_RANGE;
				5089	uchardata += PRIV(ord2utf)(start, uchardata);
				5090	uchardata += PRIV(ord2utf)(end, uchardata);
				5091	}
				5092	else if (start == end)
				5093	{
				5094	*uchardata++ = XCL_SINGLE;
				5095	uchardata += PRIV(ord2utf)(start, uchardata);
				5096	}
				5097	}
				5098	else
				5099	#endif /* SUPPORT_UNICODE */
				5100
				5101	/* Without UTF support, character values are constrained by the bit length,
				5102	and can only be > 256 for 16-bit and 32-bit libraries. */
				5103
				5104	#if PCRE2_CODE_UNIT_WIDTH == 8
				5105	{}
				5106	#else
				5107	if (start < end)
				5108	{
				5109	*uchardata++ = XCL_RANGE;
				5110	*uchardata++ = start;
				5111	*uchardata++ = end;
				5112	}
				5113	else if (start == end)
				5114	{
				5115	*uchardata++ = XCL_SINGLE;
				5116	*uchardata++ = start;
				5117	}
				5118	#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
				5119	uchardptr = uchardata; / Updata extra data pointer */
				5120	}
				5121	#else /* SUPPORT_WIDE_CHARS */
				5122	(void)uchardptr; /* Avoid compiler warning */
				5123	#endif /* SUPPORT_WIDE_CHARS */
				5124
				5125	return n8; /* Number of 8-bit characters */
				5126	}
				5127
				5128
				5129
				5130	#ifdef SUPPORT_UNICODE
				5131	/*************************************************
				5132	* Add a list of characters to a class (internal) *
				5133	*************************************************/
				5134
				5135	/* This function is used for adding a list of case-equivalent characters to a
				5136	class when in UTF mode. This function is called only from within
				5137	add_to_class_internal(), with which it is mutually recursive.
				5138
				5139	Arguments:
				5140	classbits the bit map for characters < 256
				5141	uchardptr points to the pointer for extra data
				5142	options the options word
				5143	cb contains pointers to tables etc.
				5144	p points to row of 32-bit values, terminated by NOTACHAR
				5145	except character to omit; this is used when adding lists of
				5146	case-equivalent characters to avoid including the one we
				5147	already know about
				5148
				5149	Returns: the number of < 256 characters added
				5150	the pointer to extra data is updated
				5151	*/
				5152
				5153	static unsigned int
				5154	add_list_to_class_internal(uint8_t classbits, PCRE2_UCHAR *uchardptr,
				5155	uint32_t options, compile_block cb, const uint32_t p, unsigned int except)
				5156	{
				5157	unsigned int n8 = 0;
				5158	while (p[0] < NOTACHAR)
				5159	{
				5160	unsigned int n = 0;
				5161	if (p[0] != except)
				5162	{
				5163	while(p[n+1] == p[0] + n + 1) n++;
				5164	n8 += add_to_class_internal(classbits, uchardptr, options, cb, p[0], p[n]);
				5165	}
				5166	p += n + 1;
				5167	}
				5168	return n8;
				5169	}
				5170	#endif
				5171
				5172
				5173
				5174	/*************************************************
				5175	* External entry point for add range to class *
				5176	*************************************************/
				5177
				5178	/* This function sets the overall range so that the internal functions can try
				5179	to avoid duplication when handling case-independence.
				5180
				5181	Arguments:
				5182	classbits the bit map for characters < 256
				5183	uchardptr points to the pointer for extra data
				5184	options the options word
				5185	cb compile data
				5186	start start of range character
				5187	end end of range character
				5188
				5189	Returns: the number of < 256 characters added
				5190	the pointer to extra data is updated
				5191	*/
				5192
				5193	static unsigned int
				5194	add_to_class(uint8_t classbits, PCRE2_UCHAR *uchardptr, uint32_t options,
				5195	compile_block *cb, uint32_t start, uint32_t end)
				5196	{
				5197	cb->class_range_start = start;
				5198	cb->class_range_end = end;
				5199	return add_to_class_internal(classbits, uchardptr, options, cb, start, end);
				5200	}
				5201
				5202
				5203	/*************************************************
				5204	* External entry point for add list to class *
				5205	*************************************************/
				5206
				5207	/* This function is used for adding a list of horizontal or vertical whitespace
				5208	characters to a class. The list must be in order so that ranges of characters
				5209	can be detected and handled appropriately. This function sets the overall range
				5210	so that the internal functions can try to avoid duplication when handling
				5211	case-independence.
				5212
				5213	Arguments:
				5214	classbits the bit map for characters < 256
				5215	uchardptr points to the pointer for extra data
				5216	options the options word
				5217	cb contains pointers to tables etc.
				5218	p points to row of 32-bit values, terminated by NOTACHAR
				5219	except character to omit; this is used when adding lists of
				5220	case-equivalent characters to avoid including the one we
				5221	already know about
				5222
				5223	Returns: the number of < 256 characters added
				5224	the pointer to extra data is updated
				5225	*/
				5226
				5227	static unsigned int
				5228	add_list_to_class(uint8_t classbits, PCRE2_UCHAR *uchardptr, uint32_t options,
				5229	compile_block cb, const uint32_t p, unsigned int except)
				5230	{
				5231	unsigned int n8 = 0;
				5232	while (p[0] < NOTACHAR)
				5233	{
				5234	unsigned int n = 0;
				5235	if (p[0] != except)
				5236	{
				5237	while(p[n+1] == p[0] + n + 1) n++;
				5238	cb->class_range_start = p[0];
				5239	cb->class_range_end = p[n];
				5240	n8 += add_to_class_internal(classbits, uchardptr, options, cb, p[0], p[n]);
				5241	}
				5242	p += n + 1;
				5243	}
				5244	return n8;
				5245	}
				5246
				5247
				5248
				5249	/*************************************************
				5250	* Add characters not in a list to a class *
				5251	*************************************************/
				5252
				5253	/* This function is used for adding the complement of a list of horizontal or
				5254	vertical whitespace to a class. The list must be in order.
				5255
				5256	Arguments:
				5257	classbits the bit map for characters < 256
				5258	uchardptr points to the pointer for extra data
				5259	options the options word
				5260	cb contains pointers to tables etc.
				5261	p points to row of 32-bit values, terminated by NOTACHAR
				5262
				5263	Returns: the number of < 256 characters added
				5264	the pointer to extra data is updated
				5265	*/
				5266
				5267	static unsigned int
				5268	add_not_list_to_class(uint8_t classbits, PCRE2_UCHAR *uchardptr,
				5269	uint32_t options, compile_block cb, const uint32_t p)
				5270	{
				5271	BOOL utf = (options & PCRE2_UTF) != 0;
				5272	unsigned int n8 = 0;
				5273	if (p[0] > 0)
				5274	n8 += add_to_class(classbits, uchardptr, options, cb, 0, p[0] - 1);
				5275	while (p[0] < NOTACHAR)
				5276	{
				5277	while (p[1] == p[0] + 1) p++;
				5278	n8 += add_to_class(classbits, uchardptr, options, cb, p[0] + 1,
				5279	(p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1);
				5280	p++;
				5281	}
				5282	return n8;
				5283	}
				5284
				5285
				5286
				5287	/*************************************************
				5288	* Find details of duplicate group names *
				5289	*************************************************/
				5290
				5291	/* This is called from compile_branch() when it needs to know the index and
				5292	count of duplicates in the names table when processing named backreferences,
				5293	either directly, or as conditions.
				5294
				5295	Arguments:
				5296	name points to the name
				5297	length the length of the name
				5298	indexptr where to put the index
				5299	countptr where to put the count of duplicates
				5300	errorcodeptr where to put an error code
				5301	cb the compile block
				5302
				5303	Returns: TRUE if OK, FALSE if not, error code set
				5304	*/
				5305
				5306	static BOOL
				5307	find_dupname_details(PCRE2_SPTR name, uint32_t length, int *indexptr,
				5308	int countptr, int errorcodeptr, compile_block *cb)
				5309	{
				5310	uint32_t i, groupnumber;
				5311	int count;
				5312	PCRE2_UCHAR *slot = cb->name_table;
				5313
				5314	/* Find the first entry in the table */
				5315
				5316	for (i = 0; i < cb->names_found; i++)
				5317	{
				5318	if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) == 0 &&
				5319	slot[IMM2_SIZE+length] == 0) break;
				5320	slot += cb->name_entry_size;
				5321	}
				5322
				5323	/* This should not occur, because this function is called only when we know we
				5324	have duplicate names. Give an internal error. */
				5325
				5326	if (i >= cb->names_found)
				5327	{
				5328	*errorcodeptr = ERR53;
				5329	cb->erroroffset = name - cb->start_pattern;
				5330	return FALSE;
				5331	}
				5332
				5333	/* Record the index and then see how many duplicates there are, updating the
				5334	backref map and maximum back reference as we do. */
				5335
				5336	*indexptr = i;
				5337	count = 0;
				5338
				5339	for (;;)
				5340	{
				5341	count++;
				5342	groupnumber = GET2(slot,0);
				5343	cb->backref_map \|= (groupnumber < 32)? (1u << groupnumber) : 1;
				5344	if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
				5345	if (++i >= cb->names_found) break;
				5346	slot += cb->name_entry_size;
				5347	if (PRIV(strncmp)(name, slot+IMM2_SIZE, length) != 0 \|\|
				5348	(slot+IMM2_SIZE)[length] != 0) break;
				5349	}
				5350
				5351	*countptr = count;
				5352	return TRUE;
				5353	}
				5354
				5355
				5356
				5357	/*************************************************
				5358	* Compile one branch *
				5359	*************************************************/
				5360
/* Scan the parsed pattern, compiling it into a vector of PCRE2_UCHAR. If
				5362	the options are changed during the branch, the pointer is used to change the
				5363	external options bits. This function is used during the pre-compile phase when
				5364	we are trying to find out the amount of memory needed, as well as during the
				5365	real compile phase. The value of lengthptr distinguishes the two phases.
				5366
				5367	Arguments:
				5368	optionsptr pointer to the option bits
				5369	codeptr points to the pointer to the current code point
				5370	pptrptr points to the current parsed pattern pointer
				5371	errorcodeptr points to error code variable
				5372	firstcuptr place to put the first required code unit
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	5373	firstcuflagsptr place to put the first code unit flags
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	5374	reqcuptr place to put the last required code unit
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	5375	reqcuflagsptr place to put the last required code unit flags
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	5376	bcptr points to current branch chain
				5377	cb contains pointers to tables etc.
				5378	lengthptr NULL during the real compile phase
				5379	points to length accumulator during pre-compile phase
				5380
				5381	Returns: 0 There's been an error, *errorcodeptr is non-zero
				5382	+1 Success, this branch must match at least one character
				5383	-1 Success, this branch may match an empty string
				5384	*/
				5385
				5386	static int
				5387	compile_branch(uint32_t optionsptr, PCRE2_UCHAR codeptr, uint32_t *pptrptr,
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	5388	int errorcodeptr, uint32_t firstcuptr, uint32_t *firstcuflagsptr,
				5389	uint32_t reqcuptr, uint32_t reqcuflagsptr, branch_chain *bcptr,
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	5390	compile_block cb, PCRE2_SIZE lengthptr)
				5391	{
				5392	int bravalue = 0;
				5393	int okreturn = -1;
				5394	int group_return = 0;
				5395	uint32_t repeat_min = 0, repeat_max = 0; /* To please picky compilers */
				5396	uint32_t greedy_default, greedy_non_default;
				5397	uint32_t repeat_type, op_type;
				5398	uint32_t options = optionsptr; / May change dynamically */
				5399	uint32_t firstcu, reqcu;
				5400	uint32_t zeroreqcu, zerofirstcu;
				5401	uint32_t escape;
				5402	uint32_t pptr = pptrptr;
				5403	uint32_t meta, meta_arg;
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	5404	uint32_t firstcuflags, reqcuflags;
				5405	uint32_t zeroreqcuflags, zerofirstcuflags;
				5406	uint32_t req_caseopt, reqvary, tempreqvary;
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	5407	PCRE2_SIZE offset = 0;
				5408	PCRE2_SIZE length_prevgroup = 0;
				5409	PCRE2_UCHAR code = codeptr;
				5410	PCRE2_UCHAR *last_code = code;
				5411	PCRE2_UCHAR *orig_code = code;
				5412	PCRE2_UCHAR *tempcode;
				5413	PCRE2_UCHAR *previous = NULL;
				5414	PCRE2_UCHAR op_previous;
				5415	BOOL groupsetfirstcu = FALSE;
				5416	BOOL had_accept = FALSE;
				5417	BOOL matched_char = FALSE;
				5418	BOOL previous_matched_char = FALSE;
				5419	BOOL reset_caseful = FALSE;
				5420	const uint8_t *cbits = cb->cbits;
				5421	uint8_t classbits[32];
				5422
				5423	/* We can fish out the UTF setting once and for all into a BOOL, but we must
				5424	not do this for other options (e.g. PCRE2_EXTENDED) because they may change
				5425	dynamically as we process the pattern. */
				5426
				5427	#ifdef SUPPORT_UNICODE
				5428	BOOL utf = (options & PCRE2_UTF) != 0;
				5429	BOOL ucp = (options & PCRE2_UCP) != 0;
				5430	#else /* No Unicode support */
				5431	BOOL utf = FALSE;
				5432	#endif
				5433
				5434	/* Helper variables for OP_XCLASS opcode (for characters > 255). We define
				5435	class_uchardata always so that it can be passed to add_to_class() always,
				5436	though it will not be used in non-UTF 8-bit cases. This avoids having to supply
				5437	alternative calls for the different cases. */
				5438
				5439	PCRE2_UCHAR *class_uchardata;
				5440	#ifdef SUPPORT_WIDE_CHARS
				5441	BOOL xclass;
				5442	PCRE2_UCHAR *class_uchardata_base;
				5443	#endif
				5444
				5445	/* Set up the default and non-default settings for greediness */
				5446
				5447	greedy_default = ((options & PCRE2_UNGREEDY) != 0);
				5448	greedy_non_default = greedy_default ^ 1;
				5449
				5450	/* Initialize no first unit, no required unit. REQ_UNSET means "no char
				5451	matching encountered yet". It gets changed to REQ_NONE if we hit something that
				5452	matches a non-fixed first unit; reqcu just remains unset if we never find one.
				5453
				5454	When we hit a repeat whose minimum is zero, we may have to adjust these values
				5455	to take the zero repeat into account. This is implemented by setting them to
				5456	zerofirstcu and zeroreqcu when such a repeat is encountered. The individual
				5457	item types that can be repeated set these backoff variables appropriately. */
				5458
				5459	firstcu = reqcu = zerofirstcu = zeroreqcu = 0;
				5460	firstcuflags = reqcuflags = zerofirstcuflags = zeroreqcuflags = REQ_UNSET;
				5461
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	5462	/* The variable req_caseopt contains either the REQ_CASELESS bit or zero,
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	5463	according to the current setting of the caseless flag. The REQ_CASELESS value
				5464	leaves the lower 28 bit empty. It is added into the firstcu or reqcu variables
				5465	to record the case status of the value. This is used only for ASCII characters.
				5466	*/
				5467
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	5468	req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0;
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	5469
				5470	/* Switch on next META item until the end of the branch */
				5471
				5472	for (;; pptr++)
				5473	{
				5474	#ifdef SUPPORT_WIDE_CHARS
				5475	BOOL xclass_has_prop;
				5476	#endif
				5477	BOOL negate_class;
				5478	BOOL should_flip_negation;
				5479	BOOL match_all_or_no_wide_chars;
				5480	BOOL possessive_quantifier;
				5481	BOOL note_group_empty;
				5482	int class_has_8bitchar;
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	5483	uint32_t mclength;
				5484	uint32_t skipunits;
				5485	uint32_t subreqcu, subfirstcu;
				5486	uint32_t groupnumber;
				5487	uint32_t verbarglen, verbculen;
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	5488	uint32_t subreqcuflags, subfirstcuflags;
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	5489	open_capitem *oc;
				5490	PCRE2_UCHAR mcbuffer[8];
				5491
				5492	/* Get next META item in the pattern and its potential argument. */
				5493
				5494	meta = META_CODE(*pptr);
				5495	meta_arg = META_DATA(*pptr);
				5496
				5497	/* If we are in the pre-compile phase, accumulate the length used for the
				5498	previous cycle of this loop, unless the next item is a quantifier. */
				5499
				5500	if (lengthptr != NULL)
				5501	{
				5502	if (code > cb->start_workspace + cb->workspace_size -
				5503	WORK_SIZE_SAFETY_MARGIN) /* Check for overrun */
				5504	{
				5505	*errorcodeptr = (code >= cb->start_workspace + cb->workspace_size)?
				5506	ERR52 : ERR86;
				5507	return 0;
				5508	}
				5509
				5510	/* There is at least one situation where code goes backwards: this is the
				5511	case of a zero quantifier after a class (e.g. [ab]{0}). When the quantifier
				5512	is processed, the whole class is eliminated. However, it is created first,
				5513	so we have to allow memory for it. Therefore, don't ever reduce the length
				5514	at this point. */
				5515
				5516	if (code < last_code) code = last_code;
				5517
				5518	/* If the next thing is not a quantifier, we add the length of the previous
				5519	item into the total, and reset the code pointer to the start of the
				5520	workspace. Otherwise leave the previous item available to be quantified. */
				5521
				5522	if (meta < META_ASTERISK \|\| meta > META_MINMAX_QUERY)
				5523	{
				5524	if (OFLOW_MAX - *lengthptr < (PCRE2_SIZE)(code - orig_code))
				5525	{
				5526	errorcodeptr = ERR20; / Integer overflow */
				5527	return 0;
				5528	}
				5529	*lengthptr += (PCRE2_SIZE)(code - orig_code);
				5530	if (*lengthptr > MAX_PATTERN_SIZE)
				5531	{
				5532	errorcodeptr = ERR20; / Pattern is too large */
				5533	return 0;
				5534	}
				5535	code = orig_code;
				5536	}
				5537
				5538	/* Remember where this code item starts so we can catch the "backwards"
				5539	case above next time round. */
				5540
				5541	last_code = code;
				5542	}
				5543
				5544	/* Process the next parsed pattern item. If it is not a quantifier, remember
				5545	where it starts so that it can be quantified when a quantifier follows.
				5546	Checking for the legality of quantifiers happens in parse_regex(), except for
				5547	a quantifier after an assertion that is a condition. */
				5548
				5549	if (meta < META_ASTERISK \|\| meta > META_MINMAX_QUERY)
				5550	{
				5551	previous = code;
				5552	if (matched_char && !had_accept) okreturn = 1;
				5553	}
				5554
				5555	previous_matched_char = matched_char;
				5556	matched_char = FALSE;
				5557	note_group_empty = FALSE;
				5558	skipunits = 0; /* Default value for most subgroups */
				5559
				5560	switch(meta)
				5561	{
				5562	/* ===================================================================*/
				5563	/* The branch terminates at pattern end or \| or ) */
				5564
				5565	case META_END:
				5566	case META_ALT:
				5567	case META_KET:
				5568	*firstcuptr = firstcu;
				5569	*firstcuflagsptr = firstcuflags;
				5570	*reqcuptr = reqcu;
				5571	*reqcuflagsptr = reqcuflags;
				5572	*codeptr = code;
				5573	*pptrptr = pptr;
				5574	return okreturn;
				5575
				5576
				5577	/* ===================================================================*/
				5578	/* Handle single-character metacharacters. In multiline mode, ^ disables
				5579	the setting of any following char as a first character. */
				5580
				5581	case META_CIRCUMFLEX:
				5582	if ((options & PCRE2_MULTILINE) != 0)
				5583	{
				5584	if (firstcuflags == REQ_UNSET)
				5585	zerofirstcuflags = firstcuflags = REQ_NONE;
				5586	*code++ = OP_CIRCM;
				5587	}
				5588	else *code++ = OP_CIRC;
				5589	break;
				5590
				5591	case META_DOLLAR:
				5592	*code++ = ((options & PCRE2_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
				5593	break;
				5594
				5595	/* There can never be a first char if '.' is first, whatever happens about
				5596	repeats. The value of reqcu doesn't change either. */
				5597
				5598	case META_DOT:
				5599	matched_char = TRUE;
				5600	if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
				5601	zerofirstcu = firstcu;
				5602	zerofirstcuflags = firstcuflags;
				5603	zeroreqcu = reqcu;
				5604	zeroreqcuflags = reqcuflags;
				5605	*code++ = ((options & PCRE2_DOTALL) != 0)? OP_ALLANY: OP_ANY;
				5606	break;
				5607
				5608
				5609	/* ===================================================================*/
				5610	/* Empty character classes are allowed if PCRE2_ALLOW_EMPTY_CLASS is set.
				5611	Otherwise, an initial ']' is taken as a data character. When empty classes
				5612	are allowed, [] must always fail, so generate OP_FAIL, whereas [^] must
				5613	match any character, so generate OP_ALLANY. */
				5614
				5615	case META_CLASS_EMPTY:
				5616	case META_CLASS_EMPTY_NOT:
				5617	matched_char = TRUE;
				5618	*code++ = (meta == META_CLASS_EMPTY_NOT)? OP_ALLANY : OP_FAIL;
				5619	if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
				5620	zerofirstcu = firstcu;
				5621	zerofirstcuflags = firstcuflags;
				5622	break;
				5623
				5624
				5625	/* ===================================================================*/
				5626	/* Non-empty character class. If the included characters are all < 256, we
				5627	build a 32-byte bitmap of the permitted characters, except in the special
				5628	case where there is only one such character. For negated classes, we build
				5629	the map as usual, then invert it at the end. However, we use a different
				5630	opcode so that data characters > 255 can be handled correctly.
				5631
				5632	If the class contains characters outside the 0-255 range, a different
				5633	opcode is compiled. It may optionally have a bit map for characters < 256,
but those above are explicitly listed afterwards. A flag code unit
				5635	tells whether the bitmap is present, and whether this is a negated class or
				5636	not. */
				5637
				5638	case META_CLASS_NOT:
				5639	case META_CLASS:
				5640	matched_char = TRUE;
				5641	negate_class = meta == META_CLASS_NOT;
				5642
				5643	/* We can optimize the case of a single character in a class by generating
				5644	OP_CHAR or OP_CHARI if it's positive, or OP_NOT or OP_NOTI if it's
				5645	negative. In the negative case there can be no first char if this item is
				5646	first, whatever repeat count may follow. In the case of reqcu, save the
				5647	previous value for reinstating. */
				5648
				5649	/* NOTE: at present this optimization is not effective if the only
				5650	character in a class in 32-bit, non-UCP mode has its top bit set. */
				5651
				5652	if (pptr[1] < META_END && pptr[2] == META_CLASS_END)
				5653	{
				5654	#ifdef SUPPORT_UNICODE
				5655	uint32_t d;
				5656	#endif
				5657	uint32_t c = pptr[1];
				5658
				5659	pptr += 2; /* Move on to class end */
				5660	if (meta == META_CLASS) /* A positive one-char class can be */
				5661	{ /* handled as a normal literal character. */
				5662	meta = c; /* Set up the character */
				5663	goto NORMAL_CHAR_SET;
				5664	}
				5665
				5666	/* Handle a negative one-character class */
				5667
				5668	zeroreqcu = reqcu;
				5669	zeroreqcuflags = reqcuflags;
				5670	if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
				5671	zerofirstcu = firstcu;
				5672	zerofirstcuflags = firstcuflags;
				5673
				5674	/* For caseless UTF or UCP mode, check whether this character has more
				5675	than one other case. If so, generate a special OP_NOTPROP item instead of
				5676	OP_NOTI. */
				5677
				5678	#ifdef SUPPORT_UNICODE
				5679	if ((utf\|\|ucp) && (options & PCRE2_CASELESS) != 0 &&
				5680	(d = UCD_CASESET(c)) != 0)
				5681	{
				5682	*code++ = OP_NOTPROP;
				5683	*code++ = PT_CLIST;
				5684	*code++ = d;
				5685	break; /* We are finished with this class */
				5686	}
				5687	#endif
				5688	/* Char has only one other case, or UCP not available */
				5689
				5690	*code++ = ((options & PCRE2_CASELESS) != 0)? OP_NOTI: OP_NOT;
				5691	code += PUTCHAR(c, code);
				5692	break; /* We are finished with this class */
				5693	} /* End of 1-char optimization */
				5694
				5695	/* Handle character classes that contain more than just one literal
				5696	character. If there are exactly two characters in a positive class, see if
				5697	they are case partners. This can be optimized to generate a caseless single
				5698	character match (which also sets first/required code units if relevant). */
				5699
				5700	if (meta == META_CLASS && pptr[1] < META_END && pptr[2] < META_END &&
				5701	pptr[3] == META_CLASS_END)
				5702	{
				5703	uint32_t c = pptr[1];
				5704
				5705	#ifdef SUPPORT_UNICODE
				5706	if (UCD_CASESET(c) == 0)
				5707	#endif
				5708	{
				5709	uint32_t d;
				5710
				5711	#ifdef SUPPORT_UNICODE
				5712	if ((utf \|\| ucp) && c > 127) d = UCD_OTHERCASE(c); else
				5713	#endif
				5714	{
				5715	#if PCRE2_CODE_UNIT_WIDTH != 8
				5716	if (c > 255) d = c; else
				5717	#endif
				5718	d = TABLE_GET(c, cb->fcc, c);
				5719	}
				5720
				5721	if (c != d && pptr[2] == d)
				5722	{
				5723	pptr += 3; /* Move on to class end */
				5724	meta = c;
				5725	if ((options & PCRE2_CASELESS) == 0)
				5726	{
				5727	reset_caseful = TRUE;
				5728	options \|= PCRE2_CASELESS;
				5729	req_caseopt = REQ_CASELESS;
				5730	}
				5731	goto CLASS_CASELESS_CHAR;
				5732	}
				5733	}
				5734	}
				5735
				5736	/* If a non-extended class contains a negative special such as \S, we need
				5737	to flip the negation flag at the end, so that support for characters > 255
				5738	works correctly (they are all included in the class). An extended class may
				5739	need to insert specific matching or non-matching code for wide characters.
				5740	*/
				5741
				5742	should_flip_negation = match_all_or_no_wide_chars = FALSE;
				5743
				5744	/* Extended class (xclass) will be used when characters > 255
				5745	might match. */
				5746
				5747	#ifdef SUPPORT_WIDE_CHARS
				5748	xclass = FALSE;
				5749	class_uchardata = code + LINK_SIZE + 2; /* For XCLASS items */
				5750	class_uchardata_base = class_uchardata; /* Save the start */
				5751	#endif
				5752
				5753	/* For optimization purposes, we track some properties of the class:
				5754	class_has_8bitchar will be non-zero if the class contains at least one
				5755	character with a code point less than 256; xclass_has_prop will be TRUE if
				5756	Unicode property checks are present in the class. */
				5757
				5758	class_has_8bitchar = 0;
				5759	#ifdef SUPPORT_WIDE_CHARS
				5760	xclass_has_prop = FALSE;
				5761	#endif
				5762
				5763	/* Initialize the 256-bit (32-byte) bit map to all zeros. We build the map
				5764	in a temporary bit of memory, in case the class contains fewer than two
				5765	8-bit characters because in that case the compiled code doesn't use the bit
				5766	map. */
				5767
				5768	memset(classbits, 0, 32 * sizeof(uint8_t));
				5769
				5770	/* Process items until META_CLASS_END is reached. */
				5771
				5772	while ((meta = *(++pptr)) != META_CLASS_END)
				5773	{
				5774	/* Handle POSIX classes such as [:alpha:] etc. */
				5775
				5776	if (meta == META_POSIX \|\| meta == META_POSIX_NEG)
				5777	{
				5778	BOOL local_negate = (meta == META_POSIX_NEG);
				5779	int posix_class = *(++pptr);
				5780	int taboffset, tabopt;
				5781	uint8_t pbits[32];
				5782
				5783	should_flip_negation = local_negate; /* Note negative special */
				5784
				5785	/* If matching is caseless, upper and lower are converted to alpha.
				5786	This relies on the fact that the class table starts with alpha,
				5787	lower, upper as the first 3 entries. */
				5788
				5789	if ((options & PCRE2_CASELESS) != 0 && posix_class <= 2)
				5790	posix_class = 0;
				5791
				5792	/* When PCRE2_UCP is set, some of the POSIX classes are converted to
				5793	different escape sequences that use Unicode properties \p or \P.
				5794	Others that are not available via \p or \P have to generate
				5795	XCL_PROP/XCL_NOTPROP directly, which is done here. */
				5796
				5797	#ifdef SUPPORT_UNICODE
				5798	if ((options & PCRE2_UCP) != 0) switch(posix_class)
				5799	{
				5800	case PC_GRAPH:
				5801	case PC_PRINT:
				5802	case PC_PUNCT:
				5803	*class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
				5804	*class_uchardata++ = (PCRE2_UCHAR)
				5805	((posix_class == PC_GRAPH)? PT_PXGRAPH :
				5806	(posix_class == PC_PRINT)? PT_PXPRINT : PT_PXPUNCT);
				5807	*class_uchardata++ = 0;
				5808	xclass_has_prop = TRUE;
				5809	goto CONTINUE_CLASS;
				5810
				5811	/* For the other POSIX classes (ascii, xdigit) we are going to
				5812	fall through to the non-UCP case and build a bit map for
				5813	characters with code points less than 256. However, if we are in
				5814	a negated POSIX class, characters with code points greater than
				5815	255 must either all match or all not match, depending on whether
				5816	the whole class is not or is negated. For example, for
				5817	[[:^ascii:]... they must all match, whereas for [^[:^xdigit:]...
				5818	they must not.
				5819
				5820	In the special case where there are no xclass items, this is
				5821	automatically handled by the use of OP_CLASS or OP_NCLASS, but an
				5822	explicit range is needed for OP_XCLASS. Setting a flag here
				5823	causes the range to be generated later when it is known that
				5824	OP_XCLASS is required. In the 8-bit library this is relevant only in
				5825	utf mode, since no wide characters can exist otherwise. */
				5826
				5827	default:
				5828	#if PCRE2_CODE_UNIT_WIDTH == 8
				5829	if (utf)
				5830	#endif
				5831	match_all_or_no_wide_chars \|= local_negate;
				5832	break;
				5833	}
				5834	#endif /* SUPPORT_UNICODE */
				5835
				5836	/* In the non-UCP case, or when UCP makes no difference, we build the
				5837	bit map for the POSIX class in a chunk of local store because we may
				5838	be adding and subtracting from it, and we don't want to subtract bits
				5839	that may be in the main map already. At the end we or the result into
				5840	the bit map that is being built. */
				5841
				5842	posix_class *= 3;
				5843
				5844	/* Copy in the first table (always present) */
				5845
				5846	memcpy(pbits, cbits + posix_class_maps[posix_class],
				5847	32 * sizeof(uint8_t));
				5848
				5849	/* If there is a second table, add or remove it as required. */
				5850
				5851	taboffset = posix_class_maps[posix_class + 1];
				5852	tabopt = posix_class_maps[posix_class + 2];
				5853
				5854	if (taboffset >= 0)
				5855	{
				5856	if (tabopt >= 0)
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	5857	for (int i = 0; i < 32; i++) pbits[i] \|= cbits[(int)i + taboffset];
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	5858	else
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	5859	for (int i = 0; i < 32; i++) pbits[i] &= ~cbits[(int)i + taboffset];
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	5860	}
				5861
				5862	/* Now see if we need to remove any special characters. An option
				5863	value of 1 removes vertical space and 2 removes underscore. */
				5864
				5865	if (tabopt < 0) tabopt = -tabopt;
				5866	if (tabopt == 1) pbits[1] &= ~0x3c;
				5867	else if (tabopt == 2) pbits[11] &= 0x7f;
				5868
				5869	/* Add the POSIX table or its complement into the main table that is
				5870	being built and we are done. */
				5871
				5872	if (local_negate)
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	5873	for (int i = 0; i < 32; i++) classbits[i] \|= (uint8_t)(~pbits[i]);
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	5874	else
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	5875	for (int i = 0; i < 32; i++) classbits[i] \|= pbits[i];
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	5876
				5877	/* Every class contains at least one < 256 character. */
				5878
				5879	class_has_8bitchar = 1;
				5880	goto CONTINUE_CLASS; /* End of POSIX handling */
				5881	}
				5882
				5883	/* Other than POSIX classes, the only items we should encounter are
				5884	\d-type escapes and literal characters (possibly as ranges). */
				5885
				5886	if (meta == META_BIGVALUE)
				5887	{
				5888	meta = *(++pptr);
				5889	goto CLASS_LITERAL;
				5890	}
				5891
				5892	/* Any other non-literal must be an escape */
				5893
				5894	if (meta >= META_END)
				5895	{
				5896	if (META_CODE(meta) != META_ESCAPE)
				5897	{
				5898	#ifdef DEBUG_SHOW_PARSED
				5899	fprintf(stderr, "** Unrecognized parsed pattern item 0x%.8x "
				5900	"in character class\n", meta);
				5901	#endif
				5902	errorcodeptr = ERR89; / Internal error - unrecognized. */
				5903	return 0;
				5904	}
				5905	escape = META_DATA(meta);
				5906
				5907	/* Every class contains at least one < 256 character. */
				5908
				5909	class_has_8bitchar++;
				5910
				5911	switch(escape)
				5912	{
				5913	case ESC_d:
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	5914	for (int i = 0; i < 32; i++) classbits[i] \|= cbits[i+cbit_digit];
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	5915	break;
				5916
				5917	case ESC_D:
				5918	should_flip_negation = TRUE;
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	5919	for (int i = 0; i < 32; i++)
				5920	classbits[i] \|= (uint8_t)(~cbits[i+cbit_digit]);
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	5921	break;
				5922
				5923	case ESC_w:
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	5924	for (int i = 0; i < 32; i++) classbits[i] \|= cbits[i+cbit_word];
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	5925	break;
				5926
				5927	case ESC_W:
				5928	should_flip_negation = TRUE;
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	5929	for (int i = 0; i < 32; i++)
				5930	classbits[i] \|= (uint8_t)(~cbits[i+cbit_word]);
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	5931	break;
				5932
				5933	/* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
				5934	5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
				5935	previously set by something earlier in the character class.
				5936	Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
				5937	we could just adjust the appropriate bit. From PCRE 8.34 we no
				5938	longer treat \s and \S specially. */
				5939
				5940	case ESC_s:
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	5941	for (int i = 0; i < 32; i++) classbits[i] \|= cbits[i+cbit_space];
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	5942	break;
				5943
				5944	case ESC_S:
				5945	should_flip_negation = TRUE;
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	5946	for (int i = 0; i < 32; i++)
				5947	classbits[i] \|= (uint8_t)(~cbits[i+cbit_space]);
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	5948	break;
				5949
				5950	/* When adding the horizontal or vertical space lists to a class, or
				5951	their complements, disable PCRE2_CASELESS, because it just wastes
				5952	time, and in the "not-x" UTF cases can create unwanted duplicates in
				5953	the XCLASS list (provoked by characters that have more than one other
				5954	case and by both cases being in the same "not-x" sublist). */
				5955
				5956	case ESC_h:
				5957	(void)add_list_to_class(classbits, &class_uchardata,
				5958	options & ~PCRE2_CASELESS, cb, PRIV(hspace_list), NOTACHAR);
				5959	break;
				5960
				5961	case ESC_H:
				5962	(void)add_not_list_to_class(classbits, &class_uchardata,
				5963	options & ~PCRE2_CASELESS, cb, PRIV(hspace_list));
				5964	break;
				5965
				5966	case ESC_v:
				5967	(void)add_list_to_class(classbits, &class_uchardata,
				5968	options & ~PCRE2_CASELESS, cb, PRIV(vspace_list), NOTACHAR);
				5969	break;
				5970
				5971	case ESC_V:
				5972	(void)add_not_list_to_class(classbits, &class_uchardata,
				5973	options & ~PCRE2_CASELESS, cb, PRIV(vspace_list));
				5974	break;
				5975
				5976	/* If Unicode is not supported, \P and \p are not allowed and are
				5977	faulted at parse time, so will never appear here. */
				5978
				5979	#ifdef SUPPORT_UNICODE
				5980	case ESC_p:
				5981	case ESC_P:
				5982	{
				5983	uint32_t ptype = *(++pptr) >> 16;
				5984	uint32_t pdata = *pptr & 0xffff;
				5985	*class_uchardata++ = (escape == ESC_p)? XCL_PROP : XCL_NOTPROP;
				5986	*class_uchardata++ = ptype;
				5987	*class_uchardata++ = pdata;
				5988	xclass_has_prop = TRUE;
				5989	class_has_8bitchar--; /* Undo! */
				5990	}
				5991	break;
				5992	#endif
				5993	}
				5994
				5995	goto CONTINUE_CLASS;
				5996	} /* End handling \d-type escapes */
				5997
				5998	/* A literal character may be followed by a range meta. At parse time
				5999	there are checks for out-of-order characters, for ranges where the two
				6000	characters are equal, and for hyphens that cannot indicate a range. At
				6001	this point, therefore, no checking is needed. */
				6002
				6003	else
				6004	{
				6005	uint32_t c, d;
				6006
				6007	CLASS_LITERAL:
				6008	c = d = meta;
				6009
				6010	/* Remember if \r or \n were explicitly used */
				6011
				6012	if (c == CHAR_CR \|\| c == CHAR_NL) cb->external_flags \|= PCRE2_HASCRORLF;
				6013
				6014	/* Process a character range */
				6015
				6016	if (pptr[1] == META_RANGE_LITERAL \|\| pptr[1] == META_RANGE_ESCAPED)
				6017	{
				6018	#ifdef EBCDIC
				6019	BOOL range_is_literal = (pptr[1] == META_RANGE_LITERAL);
				6020	#endif
				6021	pptr += 2;
				6022	d = *pptr;
				6023	if (d == META_BIGVALUE) d = *(++pptr);
				6024
				6025	/* Remember an explicit \r or \n, and add the range to the class. */
				6026
				6027	if (d == CHAR_CR \|\| d == CHAR_NL) cb->external_flags \|= PCRE2_HASCRORLF;
				6028
				6029	/* In an EBCDIC environment, Perl treats alphabetic ranges specially
				6030	because there are holes in the encoding, and simply using the range
				6031	A-Z (for example) would include the characters in the holes. This
				6032	applies only to literal ranges; [\xC1-\xE9] is different to [A-Z]. */
				6033
				6034	#ifdef EBCDIC
				6035	if (range_is_literal &&
				6036	(cb->ctypes[c] & ctype_letter) != 0 &&
				6037	(cb->ctypes[d] & ctype_letter) != 0 &&
				6038	(c <= CHAR_z) == (d <= CHAR_z))
				6039	{
				6040	uint32_t uc = (d <= CHAR_z)? 0 : 64;
				6041	uint32_t C = c - uc;
				6042	uint32_t D = d - uc;
				6043
				6044	if (C <= CHAR_i)
				6045	{
				6046	class_has_8bitchar +=
				6047	add_to_class(classbits, &class_uchardata, options, cb, C + uc,
				6048	((D < CHAR_i)? D : CHAR_i) + uc);
				6049	C = CHAR_j;
				6050	}
				6051
				6052	if (C <= D && C <= CHAR_r)
				6053	{
				6054	class_has_8bitchar +=
				6055	add_to_class(classbits, &class_uchardata, options, cb, C + uc,
				6056	((D < CHAR_r)? D : CHAR_r) + uc);
				6057	C = CHAR_s;
				6058	}
				6059
				6060	if (C <= D)
				6061	{
				6062	class_has_8bitchar +=
				6063	add_to_class(classbits, &class_uchardata, options, cb, C + uc,
				6064	D + uc);
				6065	}
				6066	}
				6067	else
				6068	#endif
				6069	/* Not an EBCDIC special range */
				6070
				6071	class_has_8bitchar +=
				6072	add_to_class(classbits, &class_uchardata, options, cb, c, d);
				6073	goto CONTINUE_CLASS; /* Go get the next char in the class */
				6074	} /* End of range handling */
				6075
				6076
				6077	/* Handle a single character. */
				6078
				6079	class_has_8bitchar +=
				6080	add_to_class(classbits, &class_uchardata, options, cb, meta, meta);
				6081	}
				6082
				6083	/* Continue to the next item in the class. */
				6084
				6085	CONTINUE_CLASS:
				6086
				6087	#ifdef SUPPORT_WIDE_CHARS
				6088	/* If any wide characters or Unicode properties have been encountered,
				6089	set xclass = TRUE. Then, in the pre-compile phase, accumulate the length
				6090	of the extra data and reset the pointer. This is so that very large
				6091	classes that contain a zillion wide characters or Unicode property tests
				6092	do not overwrite the workspace (which is on the stack). */
				6093
				6094	if (class_uchardata > class_uchardata_base)
				6095	{
				6096	xclass = TRUE;
				6097	if (lengthptr != NULL)
				6098	{
				6099	*lengthptr += class_uchardata - class_uchardata_base;
				6100	class_uchardata = class_uchardata_base;
				6101	}
				6102	}
				6103	#endif
				6104
				6105	continue; /* Needed to avoid error when not supporting wide chars */
				6106	} /* End of main class-processing loop */
				6107
				6108	/* If this class is the first thing in the branch, there can be no first
				6109	char setting, whatever the repeat count. Any reqcu setting must remain
				6110	unchanged after any kind of repeat. */
				6111
				6112	if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
				6113	zerofirstcu = firstcu;
				6114	zerofirstcuflags = firstcuflags;
				6115	zeroreqcu = reqcu;
				6116	zeroreqcuflags = reqcuflags;
				6117
				6118	/* If there are characters with values > 255, or Unicode property settings
				6119	(\p or \P), we have to compile an extended class, with its own opcode,
				6120	unless there were no property settings and there was a negated special such
				6121	as \S in the class, and PCRE2_UCP is not set, because in that case all
				6122	characters > 255 are in or not in the class, so any that were explicitly
				6123	given as well can be ignored.
				6124
				6125	In the UCP case, if certain negated POSIX classes ([:^ascii:] or
				6126	[:^xdigit:]) were present in a class, we either have to match or not match
				6127	all wide characters (depending on whether the whole class is or is not
				6128	negated). This requirement is indicated by match_all_or_no_wide_chars being
				6129	true. We do this by including an explicit range, which works in both cases.
				6130	This applies only in UTF and 16-bit and 32-bit non-UTF modes, since there
				6131	cannot be any wide characters in 8-bit non-UTF mode.
				6132
				6133	When there are properties in a positive UTF-8 or any 16-bit or 32-bit
				6134	class where \S etc is present without PCRE2_UCP, causing an extended class
				6135	to be compiled, we make sure that all characters > 255 are included by
				6136	forcing match_all_or_no_wide_chars to be true.
				6137
				6138	If, when generating an xclass, there are no characters < 256, we can omit
				6139	the bitmap in the actual compiled code. */
				6140
				6141	#ifdef SUPPORT_WIDE_CHARS /* Defined for 16/32 bits, or 8-bit with Unicode */
				6142	if (xclass && (
				6143	#ifdef SUPPORT_UNICODE
				6144	(options & PCRE2_UCP) != 0 \|\|
				6145	#endif
				6146	xclass_has_prop \|\| !should_flip_negation))
				6147	{
				6148	if (match_all_or_no_wide_chars \|\| (
				6149	#if PCRE2_CODE_UNIT_WIDTH == 8
				6150	utf &&
				6151	#endif
				6152	should_flip_negation && !negate_class && (options & PCRE2_UCP) == 0))
				6153	{
				6154	*class_uchardata++ = XCL_RANGE;
				6155	if (utf) /* Will always be utf in the 8-bit library */
				6156	{
				6157	class_uchardata += PRIV(ord2utf)(0x100, class_uchardata);
				6158	class_uchardata += PRIV(ord2utf)(MAX_UTF_CODE_POINT, class_uchardata);
				6159	}
				6160	else /* Can only happen for the 16-bit & 32-bit libraries */
				6161	{
				6162	#if PCRE2_CODE_UNIT_WIDTH == 16
				6163	*class_uchardata++ = 0x100;
				6164	*class_uchardata++ = 0xffffu;
				6165	#elif PCRE2_CODE_UNIT_WIDTH == 32
				6166	*class_uchardata++ = 0x100;
				6167	*class_uchardata++ = 0xffffffffu;
				6168	#endif
				6169	}
				6170	}
				6171	class_uchardata++ = XCL_END; / Marks the end of extra data */
				6172	*code++ = OP_XCLASS;
				6173	code += LINK_SIZE;
				6174	*code = negate_class? XCL_NOT:0;
				6175	if (xclass_has_prop) *code \|= XCL_HASPROP;
				6176
				6177	/* If the map is required, move up the extra data to make room for it;
				6178	otherwise just move the code pointer to the end of the extra data. */
				6179
				6180	if (class_has_8bitchar > 0)
				6181	{
				6182	*code++ \|= XCL_MAP;
				6183	(void)memmove(code + (32 / sizeof(PCRE2_UCHAR)), code,
				6184	CU2BYTES(class_uchardata - code));
				6185	if (negate_class && !xclass_has_prop)
				6186	{
				6187	/* Using 255 ^ instead of ~ avoids clang sanitize warning. */
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	6188	for (int i = 0; i < 32; i++) classbits[i] = 255 ^ classbits[i];
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	6189	}
				6190	memcpy(code, classbits, 32);
				6191	code = class_uchardata + (32 / sizeof(PCRE2_UCHAR));
				6192	}
				6193	else code = class_uchardata;
				6194
				6195	/* Now fill in the complete length of the item */
				6196
				6197	PUT(previous, 1, (int)(code - previous));
				6198	break; /* End of class handling */
				6199	}
				6200	#endif /* SUPPORT_WIDE_CHARS */
				6201
				6202	/* If there are no characters > 255, or they are all to be included or
				6203	excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
				6204	whole class was negated and whether there were negative specials such as \S
				6205	(non-UCP) in the class. Then copy the 32-byte map into the code vector,
				6206	negating it if necessary. */
				6207
				6208	*code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
				6209	if (lengthptr == NULL) /* Save time in the pre-compile phase */
				6210	{
				6211	if (negate_class)
				6212	{
				6213	/* Using 255 ^ instead of ~ avoids clang sanitize warning. */
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	6214	for (int i = 0; i < 32; i++) classbits[i] = 255 ^ classbits[i];
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	6215	}
				6216	memcpy(code, classbits, 32);
				6217	}
				6218	code += 32 / sizeof(PCRE2_UCHAR);
				6219	break; /* End of class processing */
				6220
				6221
				6222	/* ===================================================================*/
				6223	/* Deal with (VERB)s. */
				6224
				6225	/* Check for open captures before ACCEPT and close those that are within
				6226	the same assertion level, also converting ACCEPT to ASSERT_ACCEPT in an
				6227	assertion. In the first pass, just accumulate the length required;
				6228	otherwise hitting (*ACCEPT) inside many nested parentheses can cause
				6229	workspace overflow. Do not set firstcu after ACCEPT. */
				6230
				6231	case META_ACCEPT:
				6232	cb->had_accept = had_accept = TRUE;
				6233	for (oc = cb->open_caps;
				6234	oc != NULL && oc->assert_depth >= cb->assert_depth;
				6235	oc = oc->next)
				6236	{
				6237	if (lengthptr != NULL)
				6238	{
				6239	*lengthptr += CU2BYTES(1) + IMM2_SIZE;
				6240	}
				6241	else
				6242	{
				6243	*code++ = OP_CLOSE;
				6244	PUT2INC(code, 0, oc->number);
				6245	}
				6246	}
				6247	*code++ = (cb->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
				6248	if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
				6249	break;
				6250
				6251	case META_PRUNE:
				6252	case META_SKIP:
				6253	cb->had_pruneorskip = TRUE;
				6254	/* Fall through */
				6255	case META_COMMIT:
				6256	case META_FAIL:
				6257	*code++ = verbops[(meta - META_MARK) >> 16];
				6258	break;
				6259
				6260	case META_THEN:
				6261	cb->external_flags \|= PCRE2_HASTHEN;
				6262	*code++ = OP_THEN;
				6263	break;
				6264
				6265	/* Handle verbs with arguments. Arguments can be very long, especially in
				6266	16- and 32-bit modes, and can overflow the workspace in the first pass.
				6267	However, the argument length is constrained to be small enough to fit in
				6268	one code unit. This check happens in parse_regex(). In the first pass,
				6269	instead of putting the argument into memory, we just update the length
				6270	counter and set up an empty argument. */
				6271
				6272	case META_THEN_ARG:
				6273	cb->external_flags \|= PCRE2_HASTHEN;
				6274	goto VERB_ARG;
				6275
				6276	case META_PRUNE_ARG:
				6277	case META_SKIP_ARG:
				6278	cb->had_pruneorskip = TRUE;
				6279	/* Fall through */
				6280	case META_MARK:
				6281	case META_COMMIT_ARG:
				6282	VERB_ARG:
				6283	*code++ = verbops[(meta - META_MARK) >> 16];
				6284	/* The length is in characters. */
				6285	verbarglen = *(++pptr);
				6286	verbculen = 0;
				6287	tempcode = code++;
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	6288	for (int i = 0; i < (int)verbarglen; i++)
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	6289	{
				6290	meta = *(++pptr);
				6291	#ifdef SUPPORT_UNICODE
				6292	if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
				6293	#endif
				6294	{
				6295	mclength = 1;
				6296	mcbuffer[0] = meta;
				6297	}
				6298	if (lengthptr != NULL) *lengthptr += mclength; else
				6299	{
				6300	memcpy(code, mcbuffer, CU2BYTES(mclength));
				6301	code += mclength;
				6302	verbculen += mclength;
				6303	}
				6304	}
				6305
				6306	tempcode = verbculen; / Fill in the code unit length */
				6307	code++ = 0; / Terminating zero */
				6308	break;
				6309
				6310
				6311	/* ===================================================================*/
				6312	/* Handle options change. The new setting must be passed back for use in
				6313	subsequent branches. Reset the greedy defaults and the case value for
				6314	firstcu and reqcu. */
				6315
				6316	case META_OPTIONS:
				6317	optionsptr = options = (++pptr);
				6318	greedy_default = ((options & PCRE2_UNGREEDY) != 0);
				6319	greedy_non_default = greedy_default ^ 1;
				6320	req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0;
				6321	break;
				6322
				6323
				6324	/* ===================================================================*/
				6325	/* Handle conditional subpatterns. The case of (?(Rdigits) is ambiguous
				6326	because it could be a numerical check on recursion, or a name check on a
				6327	group's being set. The pre-pass sets up META_COND_RNUMBER as a name so that
				6328	we can handle it either way. We first try for a name; if not found, process
				6329	the number. */
				6330
				6331	case META_COND_RNUMBER: /* (?(Rdigits) */
				6332	case META_COND_NAME: /* (?(name) or (?'name') or ?(<name>) */
				6333	case META_COND_RNAME: /* (?(R&name) - test for recursion */
				6334	bravalue = OP_COND;
				6335	{
				6336	int count, index;
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	6337	unsigned int i;
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	6338	PCRE2_SPTR name;
				6339	named_group *ng = cb->named_groups;
				6340	uint32_t length = *(++pptr);
				6341
				6342	GETPLUSOFFSET(offset, pptr);
				6343	name = cb->start_pattern + offset;
				6344
				6345	/* In the first pass, the names generated in the pre-pass are available,
				6346	but the main name table has not yet been created. Scan the list of names
				6347	generated in the pre-pass in order to get a number and whether or not
				6348	this name is duplicated. If it is not duplicated, we can handle it as a
				6349	numerical group. */
				6350
				6351	for (i = 0; i < cb->names_found; i++, ng++)
				6352	{
				6353	if (length == ng->length &&
				6354	PRIV(strncmp)(name, ng->name, length) == 0)
				6355	{
				6356	if (!ng->isdup)
				6357	{
				6358	code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF;
				6359	PUT2(code, 2+LINK_SIZE, ng->number);
				6360	if (ng->number > cb->top_backref) cb->top_backref = ng->number;
				6361	skipunits = 1+IMM2_SIZE;
				6362	goto GROUP_PROCESS_NOTE_EMPTY;
				6363	}
				6364	break; /* Found a duplicated name */
				6365	}
				6366	}
				6367
				6368	/* If the name was not found we have a bad reference, unless we are
				6369	dealing with R<digits>, which is treated as a recursion test by number.
				6370	*/
				6371
				6372	if (i >= cb->names_found)
				6373	{
				6374	groupnumber = 0;
				6375	if (meta == META_COND_RNUMBER)
				6376	{
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	6377	for (i = 1; i < length; i++)
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	6378	{
				6379	groupnumber = groupnumber * 10 + name[i] - CHAR_0;
				6380	if (groupnumber > MAX_GROUP_NUMBER)
				6381	{
				6382	*errorcodeptr = ERR61;
				6383	cb->erroroffset = offset + i;
				6384	return 0;
				6385	}
				6386	}
				6387	}
				6388
				6389	if (meta != META_COND_RNUMBER \|\| groupnumber > cb->bracount)
				6390	{
				6391	*errorcodeptr = ERR15;
				6392	cb->erroroffset = offset;
				6393	return 0;
				6394	}
				6395
				6396	/* (?(Rdigits) is treated as a recursion test by number. A value of
				6397	zero (which is the result of both (?(R) and (?(R0)) means "any", and is
				6398	translated into RREF_ANY (which is 0xffff). */
				6399
				6400	if (groupnumber == 0) groupnumber = RREF_ANY;
				6401	code[1+LINK_SIZE] = OP_RREF;
				6402	PUT2(code, 2+LINK_SIZE, groupnumber);
				6403	skipunits = 1+IMM2_SIZE;
				6404	goto GROUP_PROCESS_NOTE_EMPTY;
				6405	}
				6406
				6407	/* A duplicated name was found. Note that if an R<digits> name is found
				6408	(META_COND_RNUMBER), it is a reference test, not a recursion test. */
				6409
				6410	code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF;
				6411
				6412	/* We have a duplicated name. In the compile pass we have to search the
				6413	main table in order to get the index and count values. */
				6414
				6415	count = 0; /* Values for first pass (avoids compiler warning) */
				6416	index = 0;
				6417	if (lengthptr == NULL && !find_dupname_details(name, length, &index,
				6418	&count, errorcodeptr, cb)) return 0;
				6419
				6420	/* Add one to the opcode to change CREF/RREF into DNCREF/DNRREF and
				6421	insert appropriate data values. */
				6422
				6423	code[1+LINK_SIZE]++;
				6424	skipunits = 1+2*IMM2_SIZE;
				6425	PUT2(code, 2+LINK_SIZE, index);
				6426	PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
				6427	}
				6428	goto GROUP_PROCESS_NOTE_EMPTY;
				6429
				6430	/* The DEFINE condition is always false. Its internal groups may never
				6431	be called, so matched_char must remain false, hence the jump to
				6432	GROUP_PROCESS rather than GROUP_PROCESS_NOTE_EMPTY. */
				6433
				6434	case META_COND_DEFINE:
				6435	bravalue = OP_COND;
				6436	GETPLUSOFFSET(offset, pptr);
				6437	code[1+LINK_SIZE] = OP_DEFINE;
				6438	skipunits = 1;
				6439	goto GROUP_PROCESS;
				6440
				6441	/* Conditional test of a group's being set. */
				6442
				6443	case META_COND_NUMBER:
				6444	bravalue = OP_COND;
				6445	GETPLUSOFFSET(offset, pptr);
				6446	groupnumber = *(++pptr);
				6447	if (groupnumber > cb->bracount)
				6448	{
				6449	*errorcodeptr = ERR15;
				6450	cb->erroroffset = offset;
				6451	return 0;
				6452	}
				6453	if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
				6454	offset -= 2; /* Point at initial ( for too many branches error */
				6455	code[1+LINK_SIZE] = OP_CREF;
				6456	skipunits = 1+IMM2_SIZE;
				6457	PUT2(code, 2+LINK_SIZE, groupnumber);
				6458	goto GROUP_PROCESS_NOTE_EMPTY;
				6459
				6460	/* Test for the PCRE2 version. */
				6461
				6462	case META_COND_VERSION:
				6463	bravalue = OP_COND;
				6464	if (pptr[1] > 0)
				6465	code[1+LINK_SIZE] = ((PCRE2_MAJOR > pptr[2]) \|\|
				6466	(PCRE2_MAJOR == pptr[2] && PCRE2_MINOR >= pptr[3]))?
				6467	OP_TRUE : OP_FALSE;
				6468	else
				6469	code[1+LINK_SIZE] = (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR == pptr[3])?
				6470	OP_TRUE : OP_FALSE;
				6471	skipunits = 1;
				6472	pptr += 3;
				6473	goto GROUP_PROCESS_NOTE_EMPTY;
				6474
				6475	/* The condition is an assertion, possibly preceded by a callout. */
				6476
				6477	case META_COND_ASSERT:
				6478	bravalue = OP_COND;
				6479	goto GROUP_PROCESS_NOTE_EMPTY;
				6480
				6481
				6482	/* ===================================================================*/
				6483	/* Handle all kinds of nested bracketed groups. The non-capturing,
				6484	non-conditional cases are here; others come to GROUP_PROCESS via goto. */
				6485
				6486	case META_LOOKAHEAD:
				6487	bravalue = OP_ASSERT;
				6488	cb->assert_depth += 1;
				6489	goto GROUP_PROCESS;
				6490
				6491	case META_LOOKAHEAD_NA:
				6492	bravalue = OP_ASSERT_NA;
				6493	cb->assert_depth += 1;
				6494	goto GROUP_PROCESS;
				6495
				6496	/* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird
				6497	thing to do, but Perl allows all assertions to be quantified, and when
				6498	they contain capturing parentheses there may be a potential use for
				6499	this feature. Not that that applies to a quantified (?!) but we allow
				6500	it for uniformity. */
				6501
				6502	case META_LOOKAHEADNOT:
				6503	if (pptr[1] == META_KET &&
				6504	(pptr[2] < META_ASTERISK \|\| pptr[2] > META_MINMAX_QUERY))
				6505	{
				6506	*code++ = OP_FAIL;
				6507	pptr++;
				6508	}
				6509	else
				6510	{
				6511	bravalue = OP_ASSERT_NOT;
				6512	cb->assert_depth += 1;
				6513	goto GROUP_PROCESS;
				6514	}
				6515	break;
				6516
				6517	case META_LOOKBEHIND:
				6518	bravalue = OP_ASSERTBACK;
				6519	cb->assert_depth += 1;
				6520	goto GROUP_PROCESS;
				6521
				6522	case META_LOOKBEHINDNOT:
				6523	bravalue = OP_ASSERTBACK_NOT;
				6524	cb->assert_depth += 1;
				6525	goto GROUP_PROCESS;
				6526
				6527	case META_LOOKBEHIND_NA:
				6528	bravalue = OP_ASSERTBACK_NA;
				6529	cb->assert_depth += 1;
				6530	goto GROUP_PROCESS;
				6531
				6532	case META_ATOMIC:
				6533	bravalue = OP_ONCE;
				6534	goto GROUP_PROCESS_NOTE_EMPTY;
				6535
				6536	case META_SCRIPT_RUN:
				6537	bravalue = OP_SCRIPT_RUN;
				6538	goto GROUP_PROCESS_NOTE_EMPTY;
				6539
				6540	case META_NOCAPTURE:
				6541	bravalue = OP_BRA;
				6542	/* Fall through */
				6543
				6544	/* Process nested bracketed regex. The nesting depth is maintained for the
				6545	benefit of the stackguard function. The test for too deep nesting is now
				6546	done in parse_regex(). Assertion and DEFINE groups come to GROUP_PROCESS;
				6547	others come to GROUP_PROCESS_NOTE_EMPTY, to indicate that we need to take
				6548	note of whether or not they may match an empty string. */
				6549
				6550	GROUP_PROCESS_NOTE_EMPTY:
				6551	note_group_empty = TRUE;
				6552
				6553	GROUP_PROCESS:
				6554	cb->parens_depth += 1;
				6555	*code = bravalue;
				6556	pptr++;
				6557	tempcode = code;
				6558	tempreqvary = cb->req_varyopt; /* Save value before group */
				6559	length_prevgroup = 0; /* Initialize for pre-compile phase */
				6560
				6561	if ((group_return =
				6562	compile_regex(
				6563	options, /* The option state */
				6564	&tempcode, /* Where to put code (updated) */
				6565	&pptr, /* Input pointer (updated) */
				6566	errorcodeptr, /* Where to put an error message */
				6567	skipunits, /* Skip over bracket number */
				6568	&subfirstcu, /* For possible first char */
				6569	&subfirstcuflags,
				6570	&subreqcu, /* For possible last char */
				6571	&subreqcuflags,
				6572	bcptr, /* Current branch chain */
				6573	cb, /* Compile data block */
				6574	(lengthptr == NULL)? NULL : /* Actual compile phase */
				6575	&length_prevgroup /* Pre-compile phase */
				6576	)) == 0)
				6577	return 0; /* Error */
				6578
				6579	cb->parens_depth -= 1;
				6580
				6581	/* If that was a non-conditional significant group (not an assertion, not a
				6582	DEFINE) that matches at least one character, then the current item matches
				6583	a character. Conditionals are handled below. */
				6584
				6585	if (note_group_empty && bravalue != OP_COND && group_return > 0)
				6586	matched_char = TRUE;
				6587
				6588	/* If we've just compiled an assertion, pop the assert depth. */
				6589
				6590	if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NA)
				6591	cb->assert_depth -= 1;
				6592
				6593	/* At the end of compiling, code is still pointing to the start of the
				6594	group, while tempcode has been updated to point past the end of the group.
				6595	The parsed pattern pointer (pptr) is on the closing META_KET.
				6596
				6597	If this is a conditional bracket, check that there are no more than
				6598	two branches in the group, or just one if it's a DEFINE group. We do this
				6599	in the real compile phase, not in the pre-pass, where the whole group may
				6600	not be available. */
				6601
				6602	if (bravalue == OP_COND && lengthptr == NULL)
				6603	{
				6604	PCRE2_UCHAR *tc = code;
				6605	int condcount = 0;
				6606
				6607	do {
				6608	condcount++;
				6609	tc += GET(tc,1);
				6610	}
				6611	while (*tc != OP_KET);
				6612
				6613	/* A DEFINE group is never obeyed inline (the "condition" is always
				6614	false). It must have only one branch. Having checked this, change the
				6615	opcode to OP_FALSE. */
				6616
				6617	if (code[LINK_SIZE+1] == OP_DEFINE)
				6618	{
				6619	if (condcount > 1)
				6620	{
				6621	cb->erroroffset = offset;
				6622	*errorcodeptr = ERR54;
				6623	return 0;
				6624	}
				6625	code[LINK_SIZE+1] = OP_FALSE;
				6626	bravalue = OP_DEFINE; /* A flag to suppress char handling below */
				6627	}
				6628
				6629	/* A "normal" conditional group. If there is just one branch, we must not
				6630	make use of its firstcu or reqcu, because this is equivalent to an
				6631	empty second branch. Also, it may match an empty string. If there are two
				6632	branches, this item must match a character if the group must. */
				6633
				6634	else
				6635	{
				6636	if (condcount > 2)
				6637	{
				6638	cb->erroroffset = offset;
				6639	*errorcodeptr = ERR27;
				6640	return 0;
				6641	}
				6642	if (condcount == 1) subfirstcuflags = subreqcuflags = REQ_NONE;
				6643	else if (group_return > 0) matched_char = TRUE;
				6644	}
				6645	}
				6646
				6647	/* In the pre-compile phase, update the length by the length of the group,
				6648	less the brackets at either end. Then reduce the compiled code to just a
				6649	set of non-capturing brackets so that it doesn't use much memory if it is
				6650	duplicated by a quantifier.*/
				6651
				6652	if (lengthptr != NULL)
				6653	{
				6654	if (OFLOW_MAX - lengthptr < length_prevgroup - 2 - 2LINK_SIZE)
				6655	{
				6656	*errorcodeptr = ERR20;
				6657	return 0;
				6658	}
				6659	lengthptr += length_prevgroup - 2 - 2LINK_SIZE;
				6660	code++; /* This already contains bravalue */
				6661	PUTINC(code, 0, 1 + LINK_SIZE);
				6662	*code++ = OP_KET;
				6663	PUTINC(code, 0, 1 + LINK_SIZE);
				6664	break; /* No need to waste time with special character handling */
				6665	}
				6666
				6667	/* Otherwise update the main code pointer to the end of the group. */
				6668
				6669	code = tempcode;
				6670
				6671	/* For a DEFINE group, required and first character settings are not
				6672	relevant. */
				6673
				6674	if (bravalue == OP_DEFINE) break;
				6675
				6676	/* Handle updating of the required and first code units for other types of
				6677	group. Update for normal brackets of all kinds, and conditions with two
				6678	branches (see code above). If the bracket is followed by a quantifier with
				6679	zero repeat, we have to back off. Hence the definition of zeroreqcu and
				6680	zerofirstcu outside the main loop so that they can be accessed for the back
				6681	off. */
				6682
				6683	zeroreqcu = reqcu;
				6684	zeroreqcuflags = reqcuflags;
				6685	zerofirstcu = firstcu;
				6686	zerofirstcuflags = firstcuflags;
				6687	groupsetfirstcu = FALSE;
				6688
				6689	if (bravalue >= OP_ONCE) /* Not an assertion */
				6690	{
				6691	/* If we have not yet set a firstcu in this branch, take it from the
				6692	subpattern, remembering that it was set here so that a repeat of more
				6693	than one can replicate it as reqcu if necessary. If the subpattern has
				6694	no firstcu, set "none" for the whole branch. In both cases, a zero
				6695	repeat forces firstcu to "none". */
				6696
				6697	if (firstcuflags == REQ_UNSET && subfirstcuflags != REQ_UNSET)
				6698	{
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	6699	if (subfirstcuflags < REQ_NONE)
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	6700	{
				6701	firstcu = subfirstcu;
				6702	firstcuflags = subfirstcuflags;
				6703	groupsetfirstcu = TRUE;
				6704	}
				6705	else firstcuflags = REQ_NONE;
				6706	zerofirstcuflags = REQ_NONE;
				6707	}
				6708
				6709	/* If firstcu was previously set, convert the subpattern's firstcu
				6710	into reqcu if there wasn't one, using the vary flag that was in
				6711	existence beforehand. */
				6712
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	6713	else if (subfirstcuflags < REQ_NONE && subreqcuflags >= REQ_NONE)
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	6714	{
				6715	subreqcu = subfirstcu;
				6716	subreqcuflags = subfirstcuflags \| tempreqvary;
				6717	}
				6718
				6719	/* If the subpattern set a required code unit (or set a first code unit
				6720	that isn't really the first code unit - see above), set it. */
				6721
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	6722	if (subreqcuflags < REQ_NONE)
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	6723	{
				6724	reqcu = subreqcu;
				6725	reqcuflags = subreqcuflags;
				6726	}
				6727	}
				6728
				6729	/* For a forward assertion, we take the reqcu, if set, provided that the
				6730	group has also set a firstcu. This can be helpful if the pattern that
				6731	follows the assertion doesn't set a different char. For example, it's
				6732	useful for /(?=abcde).+/. We can't set firstcu for an assertion, however,
				6733	because it leads to an incorrect effect for patterns such as /(?=a)a.+/ when
				6734	the "real" "a" would then become a reqcu instead of a firstcu. This is
				6735	overcome by a scan at the end if there's no firstcu, looking for an
				6736	asserted first char. A similar effect for patterns like /(?=.*X)X$/ means
				6737	we must only take the reqcu when the group also set a firstcu. Otherwise,
				6738	in that example, 'X' ends up set for both. */
				6739
				6740	else if ((bravalue == OP_ASSERT \|\| bravalue == OP_ASSERT_NA) &&
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	6741	subreqcuflags < REQ_NONE && subfirstcuflags < REQ_NONE)
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	6742	{
				6743	reqcu = subreqcu;
				6744	reqcuflags = subreqcuflags;
				6745	}
				6746
				6747	break; /* End of nested group handling */
				6748
				6749
				6750	/* ===================================================================*/
				6751	/* Handle named backreferences and recursions. */
				6752
				6753	case META_BACKREF_BYNAME:
				6754	case META_RECURSE_BYNAME:
				6755	{
				6756	int count, index;
				6757	PCRE2_SPTR name;
				6758	BOOL is_dupname = FALSE;
				6759	named_group *ng = cb->named_groups;
				6760	uint32_t length = *(++pptr);
				6761
				6762	GETPLUSOFFSET(offset, pptr);
				6763	name = cb->start_pattern + offset;
				6764
				6765	/* In the first pass, the names generated in the pre-pass are available,
				6766	but the main name table has not yet been created. Scan the list of names
				6767	generated in the pre-pass in order to get a number and whether or not
				6768	this name is duplicated. */
				6769
				6770	groupnumber = 0;
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	6771	for (unsigned int i = 0; i < cb->names_found; i++, ng++)
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	6772	{
				6773	if (length == ng->length &&
				6774	PRIV(strncmp)(name, ng->name, length) == 0)
				6775	{
				6776	is_dupname = ng->isdup;
				6777	groupnumber = ng->number;
				6778
				6779	/* For a recursion, that's all that is needed. We can now go to
				6780	the code that handles numerical recursion, applying it to the first
				6781	group with the given name. */
				6782
				6783	if (meta == META_RECURSE_BYNAME)
				6784	{
				6785	meta_arg = groupnumber;
				6786	goto HANDLE_NUMERICAL_RECURSION;
				6787	}
				6788
				6789	/* For a back reference, update the back reference map and the
				6790	maximum back reference. */
				6791
				6792	cb->backref_map \|= (groupnumber < 32)? (1u << groupnumber) : 1;
				6793	if (groupnumber > cb->top_backref)
				6794	cb->top_backref = groupnumber;
				6795	}
				6796	}
				6797
				6798	/* If the name was not found we have a bad reference. */
				6799
				6800	if (groupnumber == 0)
				6801	{
				6802	*errorcodeptr = ERR15;
				6803	cb->erroroffset = offset;
				6804	return 0;
				6805	}
				6806
				6807	/* If a back reference name is not duplicated, we can handle it as
				6808	a numerical reference. */
				6809
				6810	if (!is_dupname)
				6811	{
				6812	meta_arg = groupnumber;
				6813	goto HANDLE_SINGLE_REFERENCE;
				6814	}
				6815
				6816	/* If a back reference name is duplicated, we generate a different
				6817	opcode to a numerical back reference. In the second pass we must
				6818	search for the index and count in the final name table. */
				6819
				6820	count = 0; /* Values for first pass (avoids compiler warning) */
				6821	index = 0;
				6822	if (lengthptr == NULL && !find_dupname_details(name, length, &index,
				6823	&count, errorcodeptr, cb)) return 0;
				6824
				6825	if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
				6826	*code++ = ((options & PCRE2_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
				6827	PUT2INC(code, 0, index);
				6828	PUT2INC(code, 0, count);
				6829	}
				6830	break;
				6831
				6832
				6833	/* ===================================================================*/
				6834	/* Handle a numerical callout. */
				6835
				6836	case META_CALLOUT_NUMBER:
				6837	code[0] = OP_CALLOUT;
				6838	PUT(code, 1, pptr[1]); /* Offset to next pattern item */
				6839	PUT(code, 1 + LINK_SIZE, pptr[2]); /* Length of next pattern item */
				6840	code[1 + 2*LINK_SIZE] = pptr[3];
				6841	pptr += 3;
				6842	code += PRIV(OP_lengths)[OP_CALLOUT];
				6843	break;
				6844
				6845
				6846	/* ===================================================================*/
				6847	/* Handle a callout with a string argument. In the pre-pass we just compute
				6848	the length without generating anything. The length in pptr[3] includes both
				6849	delimiters; in the actual compile only the first one is copied, but a
				6850	terminating zero is added. Any doubled delimiters within the string make
				6851	this an overestimate, but it is not worth bothering about. */
				6852
				6853	case META_CALLOUT_STRING:
				6854	if (lengthptr != NULL)
				6855	{
				6856	lengthptr += pptr[3] + (1 + 4LINK_SIZE);
				6857	pptr += 3;
				6858	SKIPOFFSET(pptr);
				6859	}
				6860
				6861	/* In the real compile we can copy the string. The starting delimiter is
				6862	included so that the client can discover it if they want. We also pass the
				6863	start offset to help a script language give better error messages. */
				6864
				6865	else
				6866	{
				6867	PCRE2_SPTR pp;
				6868	uint32_t delimiter;
				6869	uint32_t length = pptr[3];
				6870	PCRE2_UCHAR callout_string = code + (1 + 4LINK_SIZE);
				6871
				6872	code[0] = OP_CALLOUT_STR;
				6873	PUT(code, 1, pptr[1]); /* Offset to next pattern item */
				6874	PUT(code, 1 + LINK_SIZE, pptr[2]); /* Length of next pattern item */
				6875
				6876	pptr += 3;
				6877	GETPLUSOFFSET(offset, pptr); /* Offset to string in pattern */
				6878	pp = cb->start_pattern + offset;
				6879	delimiter = callout_string++ = pp++;
				6880	if (delimiter == CHAR_LEFT_CURLY_BRACKET)
				6881	delimiter = CHAR_RIGHT_CURLY_BRACKET;
				6882	PUT(code, 1 + 3LINK_SIZE, (int)(offset + 1)); / One after delimiter */
				6883
				6884	/* The syntax of the pattern was checked in the parsing scan. The length
				6885	includes both delimiters, but we have passed the opening one just above,
				6886	so we reduce length before testing it. The test is for > 1 because we do
				6887	not want to copy the final delimiter. This also ensures that pp[1] is
				6888	accessible. */
				6889
				6890	while (--length > 1)
				6891	{
				6892	if (*pp == delimiter && pp[1] == delimiter)
				6893	{
				6894	*callout_string++ = delimiter;
				6895	pp += 2;
				6896	length--;
				6897	}
				6898	else callout_string++ = pp++;
				6899	}
				6900	*callout_string++ = CHAR_NUL;
				6901
				6902	/* Set the length of the entire item, then advance to its end. */
				6903
				6904	PUT(code, 1 + 2*LINK_SIZE, (int)(callout_string - code));
				6905	code = callout_string;
				6906	}
				6907	break;
				6908
				6909
				6910	/* ===================================================================*/
				6911	/* Handle repetition. The different types are all sorted out in the parsing
				6912	pass. */
				6913
				6914	case META_MINMAX_PLUS:
				6915	case META_MINMAX_QUERY:
				6916	case META_MINMAX:
				6917	repeat_min = *(++pptr);
				6918	repeat_max = *(++pptr);
				6919	goto REPEAT;
				6920
				6921	case META_ASTERISK:
				6922	case META_ASTERISK_PLUS:
				6923	case META_ASTERISK_QUERY:
				6924	repeat_min = 0;
				6925	repeat_max = REPEAT_UNLIMITED;
				6926	goto REPEAT;
				6927
				6928	case META_PLUS:
				6929	case META_PLUS_PLUS:
				6930	case META_PLUS_QUERY:
				6931	repeat_min = 1;
				6932	repeat_max = REPEAT_UNLIMITED;
				6933	goto REPEAT;
				6934
				6935	case META_QUERY:
				6936	case META_QUERY_PLUS:
				6937	case META_QUERY_QUERY:
				6938	repeat_min = 0;
				6939	repeat_max = 1;
				6940
				6941	REPEAT:
				6942	if (previous_matched_char && repeat_min > 0) matched_char = TRUE;
				6943
				6944	/* Remember whether this is a variable length repeat, and default to
				6945	single-char opcodes. */
				6946
				6947	reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
				6948	op_type = 0;
				6949
				6950	/* Adjust first and required code units for a zero repeat. */
				6951
				6952	if (repeat_min == 0)
				6953	{
				6954	firstcu = zerofirstcu;
				6955	firstcuflags = zerofirstcuflags;
				6956	reqcu = zeroreqcu;
				6957	reqcuflags = zeroreqcuflags;
				6958	}
				6959
				6960	/* Note the greediness and possessiveness. */
				6961
				6962	switch (meta)
				6963	{
				6964	case META_MINMAX_PLUS:
				6965	case META_ASTERISK_PLUS:
				6966	case META_PLUS_PLUS:
				6967	case META_QUERY_PLUS:
				6968	repeat_type = 0; /* Force greedy */
				6969	possessive_quantifier = TRUE;
				6970	break;
				6971
				6972	case META_MINMAX_QUERY:
				6973	case META_ASTERISK_QUERY:
				6974	case META_PLUS_QUERY:
				6975	case META_QUERY_QUERY:
				6976	repeat_type = greedy_non_default;
				6977	possessive_quantifier = FALSE;
				6978	break;
				6979
				6980	default:
				6981	repeat_type = greedy_default;
				6982	possessive_quantifier = FALSE;
				6983	break;
				6984	}
				6985
				6986	/* Save start of previous item, in case we have to move it up in order to
				6987	insert something before it, and remember what it was. */
				6988
				6989	tempcode = previous;
				6990	op_previous = *previous;
				6991
				6992	/* Now handle repetition for the different types of item. If the repeat
				6993	minimum and the repeat maximum are both 1, we can ignore the quantifier for
				6994	non-parenthesized items, as they have only one alternative. For anything in
				6995	parentheses, we must not ignore if {1} is possessive. */
				6996
				6997	switch (op_previous)
				6998	{
				6999	/* If previous was a character or negated character match, abolish the
				7000	item and generate a repeat item instead. If a char item has a minimum of
				7001	more than one, ensure that it is set in reqcu - it might not be if a
				7002	sequence such as x{3} is the first thing in a branch because the x will
				7003	have gone into firstcu instead. */
				7004
				7005	case OP_CHAR:
				7006	case OP_CHARI:
				7007	case OP_NOT:
				7008	case OP_NOTI:
				7009	if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
				7010	op_type = chartypeoffset[op_previous - OP_CHAR];
				7011
				7012	/* Deal with UTF characters that take up more than one code unit. */
				7013
				7014	#ifdef MAYBE_UTF_MULTI
				7015	if (utf && NOT_FIRSTCU(code[-1]))
				7016	{
				7017	PCRE2_UCHAR *lastchar = code - 1;
				7018	BACKCHAR(lastchar);
				7019	mclength = (uint32_t)(code - lastchar); /* Length of UTF character */
				7020	memcpy(mcbuffer, lastchar, CU2BYTES(mclength)); /* Save the char */
				7021	}
				7022	else
				7023	#endif /* MAYBE_UTF_MULTI */
				7024
				7025	/* Handle the case of a single code unit - either with no UTF support, or
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	7026	with UTF disabled, or for a single-code-unit UTF character. In the latter
				7027	case, for a repeated positive match, get the caseless flag for the
				7028	required code unit from the previous character, because a class like [Aa]
				7029	sets a caseless A but by now the req_caseopt flag has been reset. */
				7030
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	7031	{
				7032	mcbuffer[0] = code[-1];
				7033	mclength = 1;
				7034	if (op_previous <= OP_CHARI && repeat_min > 1)
				7035	{
				7036	reqcu = mcbuffer[0];
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	7037	reqcuflags = cb->req_varyopt;
				7038	if (op_previous == OP_CHARI) reqcuflags \|= REQ_CASELESS;
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	7039	}
				7040	}
				7041	goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
				7042
				7043	/* If previous was a character class or a back reference, we put the
				7044	repeat stuff after it, but just skip the item if the repeat was {0,0}. */
				7045
				7046	#ifdef SUPPORT_WIDE_CHARS
				7047	case OP_XCLASS:
				7048	#endif
				7049	case OP_CLASS:
				7050	case OP_NCLASS:
				7051	case OP_REF:
				7052	case OP_REFI:
				7053	case OP_DNREF:
				7054	case OP_DNREFI:
				7055
				7056	if (repeat_max == 0)
				7057	{
				7058	code = previous;
				7059	goto END_REPEAT;
				7060	}
				7061	if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
				7062
				7063	if (repeat_min == 0 && repeat_max == REPEAT_UNLIMITED)
				7064	*code++ = OP_CRSTAR + repeat_type;
				7065	else if (repeat_min == 1 && repeat_max == REPEAT_UNLIMITED)
				7066	*code++ = OP_CRPLUS + repeat_type;
				7067	else if (repeat_min == 0 && repeat_max == 1)
				7068	*code++ = OP_CRQUERY + repeat_type;
				7069	else
				7070	{
				7071	*code++ = OP_CRRANGE + repeat_type;
				7072	PUT2INC(code, 0, repeat_min);
				7073	if (repeat_max == REPEAT_UNLIMITED) repeat_max = 0; /* 2-byte encoding for max */
				7074	PUT2INC(code, 0, repeat_max);
				7075	}
				7076	break;
				7077
				7078	/* If previous is OP_FAIL, it was generated by an empty class []
				7079	(PCRE2_ALLOW_EMPTY_CLASS is set). The other ways in which OP_FAIL can be
				7080	generated, that is by (*FAIL) or (?!), disallow a quantifier at parse
				7081	time. We can just ignore this repeat. */
				7082
				7083	case OP_FAIL:
				7084	goto END_REPEAT;
				7085
				7086	/* Prior to 10.30, repeated recursions were wrapped in OP_ONCE brackets
				7087	because pcre2_match() could not handle backtracking into recursively
				7088	called groups. Now that this backtracking is available, we no longer need
				7089	to do this. However, we still need to replicate recursions as we do for
				7090	groups so as to have independent backtracking points. We can replicate
				7091	for the minimum number of repeats directly. For optional repeats we now
				7092	wrap the recursion in OP_BRA brackets and make use of the bracket
				7093	repetition. */
				7094
				7095	case OP_RECURSE:
				7096	if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier)
				7097	goto END_REPEAT;
				7098
				7099	/* Generate unwrapped repeats for a non-zero minimum, except when the
				7100	minimum is 1 and the maximum unlimited, because that can be handled with
				7101	OP_BRA terminated by OP_KETRMAX/MIN. When the maximum is equal to the
				7102	minimum, we just need to generate the appropriate additional copies.
				7103	Otherwise we need to generate one more, to simulate the situation when
				7104	the minimum is zero. */
				7105
				7106	if (repeat_min > 0 && (repeat_min != 1 \|\| repeat_max != REPEAT_UNLIMITED))
				7107	{
				7108	int replicate = repeat_min;
				7109	if (repeat_min == repeat_max) replicate--;
				7110
				7111	/* In the pre-compile phase, we don't actually do the replication. We
				7112	just adjust the length as if we had. Do some paranoid checks for
				7113	potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
				7114	integer type when available, otherwise double. */
				7115
				7116	if (lengthptr != NULL)
				7117	{
				7118	PCRE2_SIZE delta = replicate*(1 + LINK_SIZE);
				7119	if ((INT64_OR_DOUBLE)replicate*
				7120	(INT64_OR_DOUBLE)(1 + LINK_SIZE) >
				7121	(INT64_OR_DOUBLE)INT_MAX \|\|
				7122	OFLOW_MAX - *lengthptr < delta)
				7123	{
				7124	*errorcodeptr = ERR20;
				7125	return 0;
				7126	}
				7127	*lengthptr += delta;
				7128	}
				7129
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	7130	else for (int i = 0; i < replicate; i++)
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	7131	{
				7132	memcpy(code, previous, CU2BYTES(1 + LINK_SIZE));
				7133	previous = code;
				7134	code += 1 + LINK_SIZE;
				7135	}
				7136
				7137	/* If the number of repeats is fixed, we are done. Otherwise, adjust
				7138	the counts and fall through. */
				7139
				7140	if (repeat_min == repeat_max) break;
				7141	if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
				7142	repeat_min = 0;
				7143	}
				7144
				7145	/* Wrap the recursion call in OP_BRA brackets. */
				7146
				7147	(void)memmove(previous + 1 + LINK_SIZE, previous, CU2BYTES(1 + LINK_SIZE));
				7148	op_previous = *previous = OP_BRA;
				7149	PUT(previous, 1, 2 + 2*LINK_SIZE);
				7150	previous[2 + 2*LINK_SIZE] = OP_KET;
				7151	PUT(previous, 3 + 2LINK_SIZE, 2 + 2LINK_SIZE);
				7152	code += 2 + 2 * LINK_SIZE;
				7153	length_prevgroup = 3 + 3*LINK_SIZE;
				7154	group_return = -1; /* Set "may match empty string" */
				7155
				7156	/* Now treat as a repeated OP_BRA. */
				7157	/* Fall through */
				7158
				7159	/* If previous was a bracket group, we may have to replicate it in
				7160	certain cases. Note that at this point we can encounter only the "basic"
				7161	bracket opcodes such as BRA and CBRA, as this is the place where they get
				7162	converted into the more special varieties such as BRAPOS and SBRA.
				7163	Originally, PCRE did not allow repetition of assertions, but now it does,
				7164	for Perl compatibility. */
				7165
				7166	case OP_ASSERT:
				7167	case OP_ASSERT_NOT:
				7168	case OP_ASSERT_NA:
				7169	case OP_ASSERTBACK:
				7170	case OP_ASSERTBACK_NOT:
				7171	case OP_ASSERTBACK_NA:
				7172	case OP_ONCE:
				7173	case OP_SCRIPT_RUN:
				7174	case OP_BRA:
				7175	case OP_CBRA:
				7176	case OP_COND:
				7177	{
				7178	int len = (int)(code - previous);
				7179	PCRE2_UCHAR *bralink = NULL;
				7180	PCRE2_UCHAR *brazeroptr = NULL;
				7181
				7182	if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier)
				7183	goto END_REPEAT;
				7184
				7185	/* Repeating a DEFINE group (or any group where the condition is always
				7186	FALSE and there is only one branch) is pointless, but Perl allows the
				7187	syntax, so we just ignore the repeat. */
				7188
				7189	if (op_previous == OP_COND && previous[LINK_SIZE+1] == OP_FALSE &&
				7190	previous[GET(previous, 1)] != OP_ALT)
				7191	goto END_REPEAT;
				7192
				7193	/* Perl allows all assertions to be quantified, and when they contain
				7194	capturing parentheses and/or are optional there are potential uses for
				7195	this feature. PCRE2 used to force the maximum quantifier to 1 on the
				7196	invalid grounds that further repetition was never useful. This was
				7197	always a bit pointless, since an assertion could be wrapped with a
				7198	repeated group to achieve the effect. General repetition is now
				7199	permitted, but if the maximum is unlimited it is set to one more than
				7200	the minimum. */
				7201
				7202	if (op_previous < OP_ONCE) /* Assertion */
				7203	{
				7204	if (repeat_max == REPEAT_UNLIMITED) repeat_max = repeat_min + 1;
				7205	}
				7206
				7207	/* The case of a zero minimum is special because of the need to stick
				7208	OP_BRAZERO in front of it, and because the group appears once in the
				7209	data, whereas in other cases it appears the minimum number of times. For
				7210	this reason, it is simplest to treat this case separately, as otherwise
				7211	the code gets far too messy. There are several special subcases when the
				7212	minimum is zero. */
				7213
				7214	if (repeat_min == 0)
				7215	{
				7216	/* If the maximum is also zero, we used to just omit the group from
				7217	the output altogether, like this:
				7218
				7219	** if (repeat_max == 0)
				7220	** {
				7221	** code = previous;
				7222	** goto END_REPEAT;
				7223	** }
				7224
				7225	However, that fails when a group or a subgroup within it is
				7226	referenced as a subroutine from elsewhere in the pattern, so now we
				7227	stick in OP_SKIPZERO in front of it so that it is skipped on
				7228	execution. As we don't have a list of which groups are referenced, we
				7229	cannot do this selectively.
				7230
				7231	If the maximum is 1 or unlimited, we just have to stick in the
				7232	BRAZERO and do no more at this point. */
				7233
				7234	if (repeat_max <= 1 \|\| repeat_max == REPEAT_UNLIMITED)
				7235	{
				7236	(void)memmove(previous + 1, previous, CU2BYTES(len));
				7237	code++;
				7238	if (repeat_max == 0)
				7239	{
				7240	*previous++ = OP_SKIPZERO;
				7241	goto END_REPEAT;
				7242	}
				7243	brazeroptr = previous; /* Save for possessive optimizing */
				7244	*previous++ = OP_BRAZERO + repeat_type;
				7245	}
				7246
				7247	/* If the maximum is greater than 1 and limited, we have to replicate
				7248	in a nested fashion, sticking OP_BRAZERO before each set of brackets.
				7249	The first one has to be handled carefully because it's the original
				7250	copy, which has to be moved up. The remainder can be handled by code
				7251	that is common with the non-zero minimum case below. We have to
				7252	adjust the value of repeat_max, since one less copy is required. */
				7253
				7254	else
				7255	{
				7256	int linkoffset;
				7257	(void)memmove(previous + 2 + LINK_SIZE, previous, CU2BYTES(len));
				7258	code += 2 + LINK_SIZE;
				7259	*previous++ = OP_BRAZERO + repeat_type;
				7260	*previous++ = OP_BRA;
				7261
				7262	/* We chain together the bracket link offset fields that have to be
				7263	filled in later when the ends of the brackets are reached. */
				7264
				7265	linkoffset = (bralink == NULL)? 0 : (int)(previous - bralink);
				7266	bralink = previous;
				7267	PUTINC(previous, 0, linkoffset);
				7268	}
				7269
				7270	if (repeat_max != REPEAT_UNLIMITED) repeat_max--;
				7271	}
				7272
				7273	/* If the minimum is greater than zero, replicate the group as many
				7274	times as necessary, and adjust the maximum to the number of subsequent
				7275	copies that we need. */
				7276
				7277	else
				7278	{
				7279	if (repeat_min > 1)
				7280	{
				7281	/* In the pre-compile phase, we don't actually do the replication.
				7282	We just adjust the length as if we had. Do some paranoid checks for
				7283	potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
				7284	integer type when available, otherwise double. */
				7285
				7286	if (lengthptr != NULL)
				7287	{
				7288	PCRE2_SIZE delta = (repeat_min - 1)*length_prevgroup;
				7289	if ((INT64_OR_DOUBLE)(repeat_min - 1)*
				7290	(INT64_OR_DOUBLE)length_prevgroup >
				7291	(INT64_OR_DOUBLE)INT_MAX \|\|
				7292	OFLOW_MAX - *lengthptr < delta)
				7293	{
				7294	*errorcodeptr = ERR20;
				7295	return 0;
				7296	}
				7297	*lengthptr += delta;
				7298	}
				7299
				7300	/* This is compiling for real. If there is a set first code unit
				7301	for the group, and we have not yet set a "required code unit", set
				7302	it. */
				7303
				7304	else
				7305	{
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	7306	if (groupsetfirstcu && reqcuflags >= REQ_NONE)
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	7307	{
				7308	reqcu = firstcu;
				7309	reqcuflags = firstcuflags;
				7310	}
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	7311	for (uint32_t i = 1; i < repeat_min; i++)
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	7312	{
				7313	memcpy(code, previous, CU2BYTES(len));
				7314	code += len;
				7315	}
				7316	}
				7317	}
				7318
				7319	if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
				7320	}
				7321
				7322	/* This code is common to both the zero and non-zero minimum cases. If
				7323	the maximum is limited, it replicates the group in a nested fashion,
				7324	remembering the bracket starts on a stack. In the case of a zero
				7325	minimum, the first one was set up above. In all cases the repeat_max
				7326	now specifies the number of additional copies needed. Again, we must
				7327	remember to replicate entries on the forward reference list. */
				7328
				7329	if (repeat_max != REPEAT_UNLIMITED)
				7330	{
				7331	/* In the pre-compile phase, we don't actually do the replication. We
				7332	just adjust the length as if we had. For each repetition we must add
				7333	1 to the length for BRAZERO and for all but the last repetition we
				7334	must add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
				7335	paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type
				7336	is a 64-bit integer type when available, otherwise double. */
				7337
				7338	if (lengthptr != NULL && repeat_max > 0)
				7339	{
				7340	PCRE2_SIZE delta = repeat_max(length_prevgroup + 1 + 2 + 2LINK_SIZE) -
				7341	2 - 2LINK_SIZE; / Last one doesn't nest */
				7342	if ((INT64_OR_DOUBLE)repeat_max *
				7343	(INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
				7344	> (INT64_OR_DOUBLE)INT_MAX \|\|
				7345	OFLOW_MAX - *lengthptr < delta)
				7346	{
				7347	*errorcodeptr = ERR20;
				7348	return 0;
				7349	}
				7350	*lengthptr += delta;
				7351	}
				7352
				7353	/* This is compiling for real */
				7354
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	7355	else for (uint32_t i = repeat_max; i >= 1; i--)
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	7356	{
				7357	*code++ = OP_BRAZERO + repeat_type;
				7358
				7359	/* All but the final copy start a new nesting, maintaining the
				7360	chain of brackets outstanding. */
				7361
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	7362	if (i != 1)
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	7363	{
				7364	int linkoffset;
				7365	*code++ = OP_BRA;
				7366	linkoffset = (bralink == NULL)? 0 : (int)(code - bralink);
				7367	bralink = code;
				7368	PUTINC(code, 0, linkoffset);
				7369	}
				7370
				7371	memcpy(code, previous, CU2BYTES(len));
				7372	code += len;
				7373	}
				7374
				7375	/* Now chain through the pending brackets, and fill in their length
				7376	fields (which are holding the chain links pro tem). */
				7377
				7378	while (bralink != NULL)
				7379	{
				7380	int oldlinkoffset;
				7381	int linkoffset = (int)(code - bralink + 1);
				7382	PCRE2_UCHAR *bra = code - linkoffset;
				7383	oldlinkoffset = GET(bra, 1);
				7384	bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
				7385	*code++ = OP_KET;
				7386	PUTINC(code, 0, linkoffset);
				7387	PUT(bra, 1, linkoffset);
				7388	}
				7389	}
				7390
				7391	/* If the maximum is unlimited, set a repeater in the final copy. For
				7392	SCRIPT_RUN and ONCE brackets, that's all we need to do. However,
				7393	possessively repeated ONCE brackets can be converted into non-capturing
				7394	brackets, as the behaviour of (?:xx)++ is the same as (?>xx)++ and this
				7395	saves having to deal with possessive ONCEs specially.
				7396
				7397	Otherwise, when we are doing the actual compile phase, check to see
				7398	whether this group is one that could match an empty string. If so,
				7399	convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
				7400	that runtime checking can be done. [This check is also applied to ONCE
				7401	and SCRIPT_RUN groups at runtime, but in a different way.]
				7402
				7403	Then, if the quantifier was possessive and the bracket is not a
				7404	conditional, we convert the BRA code to the POS form, and the KET code
				7405	to KETRPOS. (It turns out to be convenient at runtime to detect this
				7406	kind of subpattern at both the start and at the end.) The use of
				7407	special opcodes makes it possible to reduce greatly the stack usage in
				7408	pcre2_match(). If the group is preceded by OP_BRAZERO, convert this to
				7409	OP_BRAPOSZERO.
				7410
				7411	Then, if the minimum number of matches is 1 or 0, cancel the possessive
				7412	flag so that the default action below, of wrapping everything inside
				7413	atomic brackets, does not happen. When the minimum is greater than 1,
				7414	there will be earlier copies of the group, and so we still have to wrap
				7415	the whole thing. */
				7416
				7417	else
				7418	{
				7419	PCRE2_UCHAR *ketcode = code - 1 - LINK_SIZE;
				7420	PCRE2_UCHAR *bracode = ketcode - GET(ketcode, 1);
				7421
				7422	/* Convert possessive ONCE brackets to non-capturing */
				7423
				7424	if (bracode == OP_ONCE && possessive_quantifier) bracode = OP_BRA;
				7425
				7426	/* For non-possessive ONCE and for SCRIPT_RUN brackets, all we need
				7427	to do is to set the KET. */
				7428
				7429	if (bracode == OP_ONCE \|\| bracode == OP_SCRIPT_RUN)
				7430	*ketcode = OP_KETRMAX + repeat_type;
				7431
				7432	/* Handle non-SCRIPT_RUN and non-ONCE brackets and possessive ONCEs
				7433	(which have been converted to non-capturing above). */
				7434
				7435	else
				7436	{
				7437	/* In the compile phase, adjust the opcode if the group can match
				7438	an empty string. For a conditional group with only one branch, the
				7439	value of group_return will not show "could be empty", so we must
				7440	check that separately. */
				7441
				7442	if (lengthptr == NULL)
				7443	{
				7444	if (group_return < 0) *bracode += OP_SBRA - OP_BRA;
				7445	if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT)
				7446	*bracode = OP_SCOND;
				7447	}
				7448
				7449	/* Handle possessive quantifiers. */
				7450
				7451	if (possessive_quantifier)
				7452	{
				7453	/* For COND brackets, we wrap the whole thing in a possessively
				7454	repeated non-capturing bracket, because we have not invented POS
				7455	versions of the COND opcodes. */
				7456
				7457	if (bracode == OP_COND \|\| bracode == OP_SCOND)
				7458	{
				7459	int nlen = (int)(code - bracode);
				7460	(void)memmove(bracode + 1 + LINK_SIZE, bracode, CU2BYTES(nlen));
				7461	code += 1 + LINK_SIZE;
				7462	nlen += 1 + LINK_SIZE;
				7463	bracode = (bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS;
				7464	*code++ = OP_KETRPOS;
				7465	PUTINC(code, 0, nlen);
				7466	PUT(bracode, 1, nlen);
				7467	}
				7468
				7469	/* For non-COND brackets, we modify the BRA code and use KETRPOS. */
				7470
				7471	else
				7472	{
				7473	bracode += 1; / Switch to xxxPOS opcodes */
				7474	*ketcode = OP_KETRPOS;
				7475	}
				7476
				7477	/* If the minimum is zero, mark it as possessive, then unset the
				7478	possessive flag when the minimum is 0 or 1. */
				7479
				7480	if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
				7481	if (repeat_min < 2) possessive_quantifier = FALSE;
				7482	}
				7483
				7484	/* Non-possessive quantifier */
				7485
				7486	else *ketcode = OP_KETRMAX + repeat_type;
				7487	}
				7488	}
				7489	}
				7490	break;
				7491
				7492	/* If previous was a character type match (\d or similar), abolish it and
				7493	create a suitable repeat item. The code is shared with single-character
				7494	repeats by setting op_type to add a suitable offset into repeat_type.
				7495	Note that the Unicode property types will be present only when
				7496	SUPPORT_UNICODE is defined, but we don't wrap the little bits of code
				7497	here because it just makes it horribly messy. */
				7498
				7499	default:
				7500	if (op_previous >= OP_EODN) /* Not a character type - internal error */
				7501	{
				7502	*errorcodeptr = ERR10;
				7503	return 0;
				7504	}
				7505	else
				7506	{
				7507	int prop_type, prop_value;
				7508	PCRE2_UCHAR *oldcode;
				7509
				7510	if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
				7511
				7512	op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
				7513	mclength = 0; /* Not a character */
				7514
				7515	if (op_previous == OP_PROP \|\| op_previous == OP_NOTPROP)
				7516	{
				7517	prop_type = previous[1];
				7518	prop_value = previous[2];
				7519	}
				7520	else
				7521	{
				7522	/* Come here from just above with a character in mcbuffer/mclength. */
				7523	OUTPUT_SINGLE_REPEAT:
				7524	prop_type = prop_value = -1;
				7525	}
				7526
				7527	/* At this point, if prop_type == prop_value == -1 we either have a
				7528	character in mcbuffer when mclength is greater than zero, or we have
				7529	mclength zero, in which case there is a non-property character type in
				7530	op_previous. If prop_type/value are not negative, we have a property
				7531	character type in op_previous. */
				7532
				7533	oldcode = code; /* Save where we were */
				7534	code = previous; /* Usually overwrite previous item */
				7535
				7536	/* If the maximum is zero then the minimum must also be zero; Perl allows
				7537	this case, so we do too - by simply omitting the item altogether. */
				7538
				7539	if (repeat_max == 0) goto END_REPEAT;
				7540
				7541	/* Combine the op_type with the repeat_type */
				7542
				7543	repeat_type += op_type;
				7544
				7545	/* A minimum of zero is handled either as the special case * or ?, or as
				7546	an UPTO, with the maximum given. */
				7547
				7548	if (repeat_min == 0)
				7549	{
				7550	if (repeat_max == REPEAT_UNLIMITED) *code++ = OP_STAR + repeat_type;
				7551	else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
				7552	else
				7553	{
				7554	*code++ = OP_UPTO + repeat_type;
				7555	PUT2INC(code, 0, repeat_max);
				7556	}
				7557	}
				7558
				7559	/* A repeat minimum of 1 is optimized into some special cases. If the
				7560	maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
				7561	left in place and, if the maximum is greater than 1, we use OP_UPTO with
				7562	one less than the maximum. */
				7563
				7564	else if (repeat_min == 1)
				7565	{
				7566	if (repeat_max == REPEAT_UNLIMITED)
				7567	*code++ = OP_PLUS + repeat_type;
				7568	else
				7569	{
				7570	code = oldcode; /* Leave previous item in place */
				7571	if (repeat_max == 1) goto END_REPEAT;
				7572	*code++ = OP_UPTO + repeat_type;
				7573	PUT2INC(code, 0, repeat_max - 1);
				7574	}
				7575	}
				7576
				7577	/* The case {n,n} is just an EXACT, while the general case {n,m} is
				7578	handled as an EXACT followed by an UPTO or STAR or QUERY. */
				7579
				7580	else
				7581	{
				7582	code++ = OP_EXACT + op_type; / NB EXACT doesn't have repeat_type */
				7583	PUT2INC(code, 0, repeat_min);
				7584
				7585	/* Unless repeat_max equals repeat_min, fill in the data for EXACT,
				7586	and then generate the second opcode. For a repeated Unicode property
				7587	match, there are two extra values that define the required property,
				7588	and mclength is set zero to indicate this. */
				7589
				7590	if (repeat_max != repeat_min)
				7591	{
				7592	if (mclength > 0)
				7593	{
				7594	memcpy(code, mcbuffer, CU2BYTES(mclength));
				7595	code += mclength;
				7596	}
				7597	else
				7598	{
				7599	*code++ = op_previous;
				7600	if (prop_type >= 0)
				7601	{
				7602	*code++ = prop_type;
				7603	*code++ = prop_value;
				7604	}
				7605	}
				7606
				7607	/* Now set up the following opcode */
				7608
				7609	if (repeat_max == REPEAT_UNLIMITED)
				7610	*code++ = OP_STAR + repeat_type;
				7611	else
				7612	{
				7613	repeat_max -= repeat_min;
				7614	if (repeat_max == 1)
				7615	{
				7616	*code++ = OP_QUERY + repeat_type;
				7617	}
				7618	else
				7619	{
				7620	*code++ = OP_UPTO + repeat_type;
				7621	PUT2INC(code, 0, repeat_max);
				7622	}
				7623	}
				7624	}
				7625	}
				7626
				7627	/* Fill in the character or character type for the final opcode. */
				7628
				7629	if (mclength > 0)
				7630	{
				7631	memcpy(code, mcbuffer, CU2BYTES(mclength));
				7632	code += mclength;
				7633	}
				7634	else
				7635	{
				7636	*code++ = op_previous;
				7637	if (prop_type >= 0)
				7638	{
				7639	*code++ = prop_type;
				7640	*code++ = prop_value;
				7641	}
				7642	}
				7643	}
				7644	break;
				7645	} /* End of switch on different op_previous values */
				7646
				7647
				7648	/* If the character following a repeat is '+', possessive_quantifier is
				7649	TRUE. For some opcodes, there are special alternative opcodes for this
				7650	case. For anything else, we wrap the entire repeated item inside OP_ONCE
				7651	brackets. Logically, the '+' notation is just syntactic sugar, taken from
				7652	Sun's Java package, but the special opcodes can optimize it.
				7653
				7654	Some (but not all) possessively repeated subpatterns have already been
				7655	completely handled in the code just above. For them, possessive_quantifier
				7656	is always FALSE at this stage. Note that the repeated item starts at
				7657	tempcode, not at previous, which might be the first part of a string whose
				7658	(former) last char we repeated. */
				7659
				7660	if (possessive_quantifier)
				7661	{
				7662	int len;
				7663
				7664	/* Possessifying an EXACT quantifier has no effect, so we can ignore it.
				7665	However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
				7666	{5,}, or {5,10}). We skip over an EXACT item; if the length of what
				7667	remains is greater than zero, there's a further opcode that can be
				7668	handled. If not, do nothing, leaving the EXACT alone. */
				7669
				7670	switch(*tempcode)
				7671	{
				7672	case OP_TYPEEXACT:
				7673	tempcode += PRIV(OP_lengths)[*tempcode] +
				7674	((tempcode[1 + IMM2_SIZE] == OP_PROP
				7675	\|\| tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
				7676	break;
				7677
				7678	/* CHAR opcodes are used for exacts whose count is 1. */
				7679
				7680	case OP_CHAR:
				7681	case OP_CHARI:
				7682	case OP_NOT:
				7683	case OP_NOTI:
				7684	case OP_EXACT:
				7685	case OP_EXACTI:
				7686	case OP_NOTEXACT:
				7687	case OP_NOTEXACTI:
				7688	tempcode += PRIV(OP_lengths)[*tempcode];
				7689	#ifdef SUPPORT_UNICODE
				7690	if (utf && HAS_EXTRALEN(tempcode[-1]))
				7691	tempcode += GET_EXTRALEN(tempcode[-1]);
				7692	#endif
				7693	break;
				7694
				7695	/* For the class opcodes, the repeat operator appears at the end;
				7696	adjust tempcode to point to it. */
				7697
				7698	case OP_CLASS:
				7699	case OP_NCLASS:
				7700	tempcode += 1 + 32/sizeof(PCRE2_UCHAR);
				7701	break;
				7702
				7703	#ifdef SUPPORT_WIDE_CHARS
				7704	case OP_XCLASS:
				7705	tempcode += GET(tempcode, 1);
				7706	break;
				7707	#endif
				7708	}
				7709
				7710	/* If tempcode is equal to code (which points to the end of the repeated
				7711	item), it means we have skipped an EXACT item but there is no following
				7712	QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
				7713	all other cases, tempcode will be pointing to the repeat opcode, and will
				7714	be less than code, so the value of len will be greater than 0. */
				7715
				7716	len = (int)(code - tempcode);
				7717	if (len > 0)
				7718	{
				7719	unsigned int repcode = *tempcode;
				7720
				7721	/* There is a table for possessifying opcodes, all of which are less
				7722	than OP_CALLOUT. A zero entry means there is no possessified version.
				7723	*/
				7724
				7725	if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
				7726	*tempcode = opcode_possessify[repcode];
				7727
				7728	/* For opcode without a special possessified version, wrap the item in
				7729	ONCE brackets. */
				7730
				7731	else
				7732	{
				7733	(void)memmove(tempcode + 1 + LINK_SIZE, tempcode, CU2BYTES(len));
				7734	code += 1 + LINK_SIZE;
				7735	len += 1 + LINK_SIZE;
				7736	tempcode[0] = OP_ONCE;
				7737	*code++ = OP_KET;
				7738	PUTINC(code, 0, len);
				7739	PUT(tempcode, 1, len);
				7740	}
				7741	}
				7742	}
				7743
				7744	/* We set the "follows varying string" flag for subsequently encountered
				7745	reqcus if it isn't already set and we have just passed a varying length
				7746	item. */
				7747
				7748	END_REPEAT:
				7749	cb->req_varyopt \|= reqvary;
				7750	break;
				7751
				7752
				7753	/* ===================================================================*/
				7754	/* Handle a 32-bit data character with a value greater than META_END. */
				7755
				7756	case META_BIGVALUE:
				7757	pptr++;
				7758	goto NORMAL_CHAR;
				7759
				7760
				7761	/* ===============================================================*/
				7762	/* Handle a back reference by number, which is the meta argument. The
				7763	pattern offsets for back references to group numbers less than 10 are held
				7764	in a special vector, to avoid using more than two parsed pattern elements
				7765	in 64-bit environments. We only need the offset to the first occurrence,
				7766	because if that doesn't fail, subsequent ones will also be OK. */
				7767
				7768	case META_BACKREF:
				7769	if (meta_arg < 10) offset = cb->small_ref_offset[meta_arg];
				7770	else GETPLUSOFFSET(offset, pptr);
				7771
				7772	if (meta_arg > cb->bracount)
				7773	{
				7774	cb->erroroffset = offset;
				7775	errorcodeptr = ERR15; / Non-existent subpattern */
				7776	return 0;
				7777	}
				7778
				7779	/* Come here from named backref handling when the reference is to a
				7780	single group (that is, not to a duplicated name). The back reference
				7781	data will have already been updated. We must disable firstcu if not
				7782	set, to cope with cases like (?=(\w+))\1: which would otherwise set ':'
				7783	later. */
				7784
				7785	HANDLE_SINGLE_REFERENCE:
				7786	if (firstcuflags == REQ_UNSET) zerofirstcuflags = firstcuflags = REQ_NONE;
				7787	*code++ = ((options & PCRE2_CASELESS) != 0)? OP_REFI : OP_REF;
				7788	PUT2INC(code, 0, meta_arg);
				7789
				7790	/* Update the map of back references, and keep the highest one. We
				7791	could do this in parse_regex() for numerical back references, but not
				7792	for named back references, because we don't know the numbers to which
				7793	named back references refer. So we do it all in this function. */
				7794
				7795	cb->backref_map \|= (meta_arg < 32)? (1u << meta_arg) : 1;
				7796	if (meta_arg > cb->top_backref) cb->top_backref = meta_arg;
				7797	break;
				7798
				7799
				7800	/* ===============================================================*/
				7801	/* Handle recursion by inserting the number of the called group (which is
				7802	the meta argument) after OP_RECURSE. At the end of compiling the pattern is
				7803	scanned and these numbers are replaced by offsets within the pattern. It is
				7804	done like this to avoid problems with forward references and adjusting
				7805	offsets when groups are duplicated and moved (as discovered in previous
				7806	implementations). Note that a recursion does not have a set first
				7807	character. */
				7808
				7809	case META_RECURSE:
				7810	GETPLUSOFFSET(offset, pptr);
				7811	if (meta_arg > cb->bracount)
				7812	{
				7813	cb->erroroffset = offset;
				7814	errorcodeptr = ERR15; / Non-existent subpattern */
				7815	return 0;
				7816	}
				7817	HANDLE_NUMERICAL_RECURSION:
				7818	*code = OP_RECURSE;
				7819	PUT(code, 1, meta_arg);
				7820	code += 1 + LINK_SIZE;
				7821	groupsetfirstcu = FALSE;
				7822	cb->had_recurse = TRUE;
				7823	if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
				7824	zerofirstcu = firstcu;
				7825	zerofirstcuflags = firstcuflags;
				7826	break;
				7827
				7828
				7829	/* ===============================================================*/
				7830	/* Handle capturing parentheses; the number is the meta argument. */
				7831
				7832	case META_CAPTURE:
				7833	bravalue = OP_CBRA;
				7834	skipunits = IMM2_SIZE;
				7835	PUT2(code, 1+LINK_SIZE, meta_arg);
				7836	cb->lastcapture = meta_arg;
				7837	goto GROUP_PROCESS_NOTE_EMPTY;
				7838
				7839
				7840	/* ===============================================================*/
				7841	/* Handle escape sequence items. For ones like \d, the ESC_values are
				7842	arranged to be the same as the corresponding OP_values in the default case
				7843	when PCRE2_UCP is not set (which is the only case in which they will appear
				7844	here).
				7845
				7846	Note: \Q and \E are never seen here, as they were dealt with in
				7847	parse_pattern(). Neither are numerical back references or recursions, which
				7848	were turned into META_BACKREF or META_RECURSE items, respectively. \k and
				7849	\g, when followed by names, are turned into META_BACKREF_BYNAME or
				7850	META_RECURSE_BYNAME. */
				7851
				7852	case META_ESCAPE:
				7853
				7854	/* We can test for escape sequences that consume a character because their
				7855	values lie between ESC_b and ESC_Z; this may have to change if any new ones
				7856	are ever created. For these sequences, we disable the setting of a first
				7857	character if it hasn't already been set. */
				7858
				7859	if (meta_arg > ESC_b && meta_arg < ESC_Z)
				7860	{
				7861	matched_char = TRUE;
				7862	if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
				7863	}
				7864
				7865	/* Set values to reset to if this is followed by a zero repeat. */
				7866
				7867	zerofirstcu = firstcu;
				7868	zerofirstcuflags = firstcuflags;
				7869	zeroreqcu = reqcu;
				7870	zeroreqcuflags = reqcuflags;
				7871
				7872	/* If Unicode is not supported, \P and \p are not allowed and are
				7873	faulted at parse time, so will never appear here. */
				7874
				7875	#ifdef SUPPORT_UNICODE
				7876	if (meta_arg == ESC_P \|\| meta_arg == ESC_p)
				7877	{
				7878	uint32_t ptype = *(++pptr) >> 16;
				7879	uint32_t pdata = *pptr & 0xffff;
				7880
				7881	/* The special case of \p{Any} is compiled to OP_ALLANY so as to benefit
				7882	from the auto-anchoring code. */
				7883
				7884	if (meta_arg == ESC_p && ptype == PT_ANY)
				7885	{
				7886	*code++ = OP_ALLANY;
				7887	}
				7888	else
				7889	{
				7890	*code++ = (meta_arg == ESC_p)? OP_PROP : OP_NOTPROP;
				7891	*code++ = ptype;
				7892	*code++ = pdata;
				7893	}
				7894	break; /* End META_ESCAPE */
				7895	}
				7896	#endif
				7897
				7898	/* \K is forbidden in lookarounds since 10.38 because that's what Perl has
				7899	done. However, there's an option, in case anyone was relying on it. */
				7900
				7901	if (cb->assert_depth > 0 && meta_arg == ESC_K &&
				7902	(cb->cx->extra_options & PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK) == 0)
				7903	{
				7904	*errorcodeptr = ERR99;
				7905	return 0;
				7906	}
				7907
				7908	/* For the rest (including \X when Unicode is supported - if not it's
				7909	faulted at parse time), the OP value is the escape value when PCRE2_UCP is
				7910	not set; if it is set, these escapes do not show up here because they are
				7911	converted into Unicode property tests in parse_regex(). Note that \b and \B
				7912	do a one-character lookbehind, and \A also behaves as if it does. */
				7913
				7914	if (meta_arg == ESC_C) cb->external_flags \|= PCRE2_HASBKC; /* Record */
				7915	if ((meta_arg == ESC_b \|\| meta_arg == ESC_B \|\| meta_arg == ESC_A) &&
				7916	cb->max_lookbehind == 0)
				7917	cb->max_lookbehind = 1;
				7918
				7919	/* In non-UTF mode, and for both 32-bit modes, we turn \C into OP_ALLANY
				7920	instead of OP_ANYBYTE so that it works in DFA mode and in lookbehinds. */
				7921
				7922	#if PCRE2_CODE_UNIT_WIDTH == 32
				7923	*code++ = (meta_arg == ESC_C)? OP_ALLANY : meta_arg;
				7924	#else
				7925	*code++ = (!utf && meta_arg == ESC_C)? OP_ALLANY : meta_arg;
				7926	#endif
				7927	break; /* End META_ESCAPE */
				7928
				7929
				7930	/* ===================================================================*/
				7931	/* Handle an unrecognized meta value. A parsed pattern value less than
				7932	META_END is a literal. Otherwise we have a problem. */
				7933
				7934	default:
				7935	if (meta >= META_END)
				7936	{
				7937	#ifdef DEBUG_SHOW_PARSED
				7938	fprintf(stderr, "** Unrecognized parsed pattern item 0x%.8x\n", *pptr);
				7939	#endif
				7940	errorcodeptr = ERR89; / Internal error - unrecognized. */
				7941	return 0;
				7942	}
				7943
				7944	/* Handle a literal character. We come here by goto in the case of a
				7945	32-bit, non-UTF character whose value is greater than META_END. */
				7946
				7947	NORMAL_CHAR:
				7948	meta = pptr; / Get the full 32 bits */
				7949	NORMAL_CHAR_SET: /* Character is already in meta */
				7950	matched_char = TRUE;
				7951
				7952	/* For caseless UTF or UCP mode, check whether this character has more than
				7953	one other case. If so, generate a special OP_PROP item instead of OP_CHARI.
				7954	*/
				7955
				7956	#ifdef SUPPORT_UNICODE
				7957	if ((utf\|\|ucp) && (options & PCRE2_CASELESS) != 0)
				7958	{
				7959	uint32_t caseset = UCD_CASESET(meta);
				7960	if (caseset != 0)
				7961	{
				7962	*code++ = OP_PROP;
				7963	*code++ = PT_CLIST;
				7964	*code++ = caseset;
				7965	if (firstcuflags == REQ_UNSET)
				7966	firstcuflags = zerofirstcuflags = REQ_NONE;
				7967	break; /* End handling this meta item */
				7968	}
				7969	}
				7970	#endif
				7971
				7972	/* Caseful matches, or caseless and not one of the multicase characters. We
				7973	come here by goto in the case of a positive class that contains only
				7974	case-partners of a character with just two cases; matched_char has already
				7975	been set TRUE and options fudged if necessary. */
				7976
				7977	CLASS_CASELESS_CHAR:
				7978
				7979	/* Get the character's code units into mcbuffer, with the length in
				7980	mclength. When not in UTF mode, the length is always 1. */
				7981
				7982	#ifdef SUPPORT_UNICODE
				7983	if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
				7984	#endif
				7985	{
				7986	mclength = 1;
				7987	mcbuffer[0] = meta;
				7988	}
				7989
				7990	/* Generate the appropriate code */
				7991
				7992	*code++ = ((options & PCRE2_CASELESS) != 0)? OP_CHARI : OP_CHAR;
				7993	memcpy(code, mcbuffer, CU2BYTES(mclength));
				7994	code += mclength;
				7995
				7996	/* Remember if \r or \n were seen */
				7997
				7998	if (mcbuffer[0] == CHAR_CR \|\| mcbuffer[0] == CHAR_NL)
				7999	cb->external_flags \|= PCRE2_HASCRORLF;
				8000
				8001	/* Set the first and required code units appropriately. If no previous
				8002	first code unit, set it from this character, but revert to none on a zero
				8003	repeat. Otherwise, leave the firstcu value alone, and don't change it on
				8004	a zero repeat. */
				8005
				8006	if (firstcuflags == REQ_UNSET)
				8007	{
				8008	zerofirstcuflags = REQ_NONE;
				8009	zeroreqcu = reqcu;
				8010	zeroreqcuflags = reqcuflags;
				8011
				8012	/* If the character is more than one code unit long, we can set a single
				8013	firstcu only if it is not to be matched caselessly. Multiple possible
				8014	starting code units may be picked up later in the studying code. */
				8015
				8016	if (mclength == 1 \|\| req_caseopt == 0)
				8017	{
				8018	firstcu = mcbuffer[0];
				8019	firstcuflags = req_caseopt;
				8020	if (mclength != 1)
				8021	{
				8022	reqcu = code[-1];
				8023	reqcuflags = cb->req_varyopt;
				8024	}
				8025	}
				8026	else firstcuflags = reqcuflags = REQ_NONE;
				8027	}
				8028
				8029	/* firstcu was previously set; we can set reqcu only if the length is
				8030	1 or the matching is caseful. */
				8031
				8032	else
				8033	{
				8034	zerofirstcu = firstcu;
				8035	zerofirstcuflags = firstcuflags;
				8036	zeroreqcu = reqcu;
				8037	zeroreqcuflags = reqcuflags;
				8038	if (mclength == 1 \|\| req_caseopt == 0)
				8039	{
				8040	reqcu = code[-1];
				8041	reqcuflags = req_caseopt \| cb->req_varyopt;
				8042	}
				8043	}
				8044
				8045	/* If caselessness was temporarily instated, reset it. */
				8046
				8047	if (reset_caseful)
				8048	{
				8049	options &= ~PCRE2_CASELESS;
				8050	req_caseopt = 0;
				8051	reset_caseful = FALSE;
				8052	}
				8053
				8054	break; /* End literal character handling */
				8055	} /* End of big switch */
				8056	} /* End of big loop */
				8057
				8058	/* Control never reaches here. */
				8059	}
				8060
				8061
				8062
				8063	/*************************************************
				8064	* Compile regex: a sequence of alternatives *
				8065	*************************************************/
				8066
				8067	/* On entry, pptr is pointing past the bracket meta, but on return it points to
				8068	the closing bracket or META_END. The code variable is pointing at the code unit
				8069	into which the BRA operator has been stored. This function is used during the
				8070	pre-compile phase when we are trying to find out the amount of memory needed,
				8071	as well as during the real compile phase. The value of lengthptr distinguishes
				8072	the two phases.
				8073
				8074	Arguments:
				8075	options option bits, including any changes for this subpattern
				8076	codeptr -> the address of the current code pointer
				8077	pptrptr -> the address of the current parsed pattern pointer
				8078	errorcodeptr -> pointer to error code variable
				8079	skipunits skip this many code units at start (for brackets and OP_COND)
				8080	firstcuptr place to put the first required code unit
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	8081	firstcuflagsptr place to put the first code unit flags
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	8082	reqcuptr place to put the last required code unit
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	8083	reqcuflagsptr place to put the last required code unit flags
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	8084	bcptr pointer to the chain of currently open branches
				8085	cb points to the data block with tables pointers etc.
				8086	lengthptr NULL during the real compile phase
				8087	points to length accumulator during pre-compile phase
				8088
				8089	Returns: 0 There has been an error
				8090	+1 Success, this group must match at least one character
				8091	-1 Success, this group may match an empty string
				8092	*/
				8093
				8094	static int
				8095	compile_regex(uint32_t options, PCRE2_UCHAR codeptr, uint32_t pptrptr,
				8096	int errorcodeptr, uint32_t skipunits, uint32_t firstcuptr,
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	8097	uint32_t firstcuflagsptr, uint32_t reqcuptr, uint32_t *reqcuflagsptr,
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	8098	branch_chain bcptr, compile_block cb, PCRE2_SIZE *lengthptr)
				8099	{
				8100	PCRE2_UCHAR code = codeptr;
				8101	PCRE2_UCHAR *last_branch = code;
				8102	PCRE2_UCHAR *start_bracket = code;
				8103	BOOL lookbehind;
				8104	open_capitem capitem;
				8105	int capnumber = 0;
				8106	int okreturn = 1;
				8107	uint32_t pptr = pptrptr;
				8108	uint32_t firstcu, reqcu;
				8109	uint32_t lookbehindlength;
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	8110	uint32_t firstcuflags, reqcuflags;
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	8111	uint32_t branchfirstcu, branchreqcu;
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	8112	uint32_t branchfirstcuflags, branchreqcuflags;
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	8113	PCRE2_SIZE length;
				8114	branch_chain bc;
				8115
				8116	/* If set, call the external function that checks for stack availability. */
				8117
				8118	if (cb->cx->stack_guard != NULL &&
				8119	cb->cx->stack_guard(cb->parens_depth, cb->cx->stack_guard_data))
				8120	{
				8121	*errorcodeptr= ERR33;
				8122	return 0;
				8123	}
				8124
				8125	/* Miscellaneous initialization */
				8126
				8127	bc.outer = bcptr;
				8128	bc.current_branch = code;
				8129
				8130	firstcu = reqcu = 0;
				8131	firstcuflags = reqcuflags = REQ_UNSET;
				8132
				8133	/* Accumulate the length for use in the pre-compile phase. Start with the
				8134	length of the BRA and KET and any extra code units that are required at the
				8135	beginning. We accumulate in a local variable to save frequent testing of
				8136	lengthptr for NULL. We cannot do this by looking at the value of 'code' at the
				8137	start and end of each alternative, because compiled items are discarded during
				8138	the pre-compile phase so that the workspace is not exceeded. */
				8139
				8140	length = 2 + 2*LINK_SIZE + skipunits;
				8141
				8142	/* Remember if this is a lookbehind assertion, and if it is, save its length
				8143	and skip over the pattern offset. */
				8144
				8145	lookbehind = *code == OP_ASSERTBACK \|\|
				8146	*code == OP_ASSERTBACK_NOT \|\|
				8147	*code == OP_ASSERTBACK_NA;
				8148
				8149	if (lookbehind)
				8150	{
				8151	lookbehindlength = META_DATA(pptr[-1]);
				8152	pptr += SIZEOFFSET;
				8153	}
				8154	else lookbehindlength = 0;
				8155
				8156	/* If this is a capturing subpattern, add to the chain of open capturing items
				8157	so that we can detect them if (*ACCEPT) is encountered. Note that only OP_CBRA
				8158	need be tested here; changing this opcode to one of its variants, e.g.
				8159	OP_SCBRAPOS, happens later, after the group has been compiled. */
				8160
				8161	if (*code == OP_CBRA)
				8162	{
				8163	capnumber = GET2(code, 1 + LINK_SIZE);
				8164	capitem.number = capnumber;
				8165	capitem.next = cb->open_caps;
				8166	capitem.assert_depth = cb->assert_depth;
				8167	cb->open_caps = &capitem;
				8168	}
				8169
				8170	/* Offset is set zero to mark that this bracket is still open */
				8171
				8172	PUT(code, 1, 0);
				8173	code += 1 + LINK_SIZE + skipunits;
				8174
				8175	/* Loop for each alternative branch */
				8176
				8177	for (;;)
				8178	{
				8179	int branch_return;
				8180
				8181	/* Insert OP_REVERSE if this is as lookbehind assertion. */
				8182
				8183	if (lookbehind && lookbehindlength > 0)
				8184	{
				8185	*code++ = OP_REVERSE;
				8186	PUTINC(code, 0, lookbehindlength);
				8187	length += 1 + LINK_SIZE;
				8188	}
				8189
				8190	/* Now compile the branch; in the pre-compile phase its length gets added
				8191	into the length. */
				8192
				8193	if ((branch_return =
				8194	compile_branch(&options, &code, &pptr, errorcodeptr, &branchfirstcu,
				8195	&branchfirstcuflags, &branchreqcu, &branchreqcuflags, &bc,
				8196	cb, (lengthptr == NULL)? NULL : &length)) == 0)
				8197	return 0;
				8198
				8199	/* If a branch can match an empty string, so can the whole group. */
				8200
				8201	if (branch_return < 0) okreturn = -1;
				8202
				8203	/* In the real compile phase, there is some post-processing to be done. */
				8204
				8205	if (lengthptr == NULL)
				8206	{
				8207	/* If this is the first branch, the firstcu and reqcu values for the
				8208	branch become the values for the regex. */
				8209
				8210	if (*last_branch != OP_ALT)
				8211	{
				8212	firstcu = branchfirstcu;
				8213	firstcuflags = branchfirstcuflags;
				8214	reqcu = branchreqcu;
				8215	reqcuflags = branchreqcuflags;
				8216	}
				8217
				8218	/* If this is not the first branch, the first char and reqcu have to
				8219	match the values from all the previous branches, except that if the
				8220	previous value for reqcu didn't have REQ_VARY set, it can still match,
				8221	and we set REQ_VARY for the group from this branch's value. */
				8222
				8223	else
				8224	{
				8225	/* If we previously had a firstcu, but it doesn't match the new branch,
				8226	we have to abandon the firstcu for the regex, but if there was
				8227	previously no reqcu, it takes on the value of the old firstcu. */
				8228
				8229	if (firstcuflags != branchfirstcuflags \|\| firstcu != branchfirstcu)
				8230	{
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	8231	if (firstcuflags < REQ_NONE)
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	8232	{
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	8233	if (reqcuflags >= REQ_NONE)
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	8234	{
				8235	reqcu = firstcu;
				8236	reqcuflags = firstcuflags;
				8237	}
				8238	}
				8239	firstcuflags = REQ_NONE;
				8240	}
				8241
				8242	/* If we (now or from before) have no firstcu, a firstcu from the
				8243	branch becomes a reqcu if there isn't a branch reqcu. */
				8244
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	8245	if (firstcuflags >= REQ_NONE && branchfirstcuflags < REQ_NONE &&
				8246	branchreqcuflags >= REQ_NONE)
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	8247	{
				8248	branchreqcu = branchfirstcu;
				8249	branchreqcuflags = branchfirstcuflags;
				8250	}
				8251
				8252	/* Now ensure that the reqcus match */
				8253
				8254	if (((reqcuflags & ~REQ_VARY) != (branchreqcuflags & ~REQ_VARY)) \|\|
				8255	reqcu != branchreqcu)
				8256	reqcuflags = REQ_NONE;
				8257	else
				8258	{
				8259	reqcu = branchreqcu;
				8260	reqcuflags \|= branchreqcuflags; /* To "or" REQ_VARY if present */
				8261	}
				8262	}
				8263	}
				8264
				8265	/* Handle reaching the end of the expression, either ')' or end of pattern.
				8266	In the real compile phase, go back through the alternative branches and
				8267	reverse the chain of offsets, with the field in the BRA item now becoming an
				8268	offset to the first alternative. If there are no alternatives, it points to
				8269	the end of the group. The length in the terminating ket is always the length
				8270	of the whole bracketed item. Return leaving the pointer at the terminating
				8271	char. */
				8272
				8273	if (META_CODE(*pptr) != META_ALT)
				8274	{
				8275	if (lengthptr == NULL)
				8276	{
				8277	PCRE2_SIZE branch_length = code - last_branch;
				8278	do
				8279	{
				8280	PCRE2_SIZE prev_length = GET(last_branch, 1);
				8281	PUT(last_branch, 1, branch_length);
				8282	branch_length = prev_length;
				8283	last_branch -= branch_length;
				8284	}
				8285	while (branch_length > 0);
				8286	}
				8287
				8288	/* Fill in the ket */
				8289
				8290	*code = OP_KET;
				8291	PUT(code, 1, (int)(code - start_bracket));
				8292	code += 1 + LINK_SIZE;
				8293
				8294	/* If it was a capturing subpattern, remove the block from the chain. */
				8295
				8296	if (capnumber > 0) cb->open_caps = cb->open_caps->next;
				8297
				8298	/* Set values to pass back */
				8299
				8300	*codeptr = code;
				8301	*pptrptr = pptr;
				8302	*firstcuptr = firstcu;
				8303	*firstcuflagsptr = firstcuflags;
				8304	*reqcuptr = reqcu;
				8305	*reqcuflagsptr = reqcuflags;
				8306	if (lengthptr != NULL)
				8307	{
				8308	if (OFLOW_MAX - *lengthptr < length)
				8309	{
				8310	*errorcodeptr = ERR20;
				8311	return 0;
				8312	}
				8313	*lengthptr += length;
				8314	}
				8315	return okreturn;
				8316	}
				8317
				8318	/* Another branch follows. In the pre-compile phase, we can move the code
				8319	pointer back to where it was for the start of the first branch. (That is,
				8320	pretend that each branch is the only one.)
				8321
				8322	In the real compile phase, insert an ALT node. Its length field points back
				8323	to the previous branch while the bracket remains open. At the end the chain
				8324	is reversed. It's done like this so that the start of the bracket has a
				8325	zero offset until it is closed, making it possible to detect recursion. */
				8326
				8327	if (lengthptr != NULL)
				8328	{
				8329	code = *codeptr + 1 + LINK_SIZE + skipunits;
				8330	length += 1 + LINK_SIZE;
				8331	}
				8332	else
				8333	{
				8334	*code = OP_ALT;
				8335	PUT(code, 1, (int)(code - last_branch));
				8336	bc.current_branch = last_branch = code;
				8337	code += 1 + LINK_SIZE;
				8338	}
				8339
				8340	/* Set the lookbehind length (if not in a lookbehind the value will be zero)
				8341	and then advance past the vertical bar. */
				8342
				8343	lookbehindlength = META_DATA(*pptr);
				8344	pptr++;
				8345	}
				8346	/* Control never reaches here */
				8347	}
				8348
				8349
				8350
				8351	/*************************************************
				8352	* Check for anchored pattern *
				8353	*************************************************/
				8354
				8355	/* Try to find out if this is an anchored regular expression. Consider each
				8356	alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
				8357	all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
				8358	it's anchored. However, if this is a multiline pattern, then only OP_SOD will
				8359	be found, because ^ generates OP_CIRCM in that mode.
				8360
				8361	We can also consider a regex to be anchored if OP_SOM starts all its branches.
				8362	This is the code for \G, which means "match at start of match position, taking
				8363	into account the match offset".
				8364
				8365	A branch is also implicitly anchored if it starts with .* and DOTALL is set,
				8366	because that will try the rest of the pattern at all possible matching points,
				8367	so there is no point trying again.... er ....
				8368
				8369	.... except when the .* appears inside capturing parentheses, and there is a
				8370	subsequent back reference to those parentheses. We haven't enough information
				8371	to catch that case precisely.
				8372
				8373	At first, the best we could do was to detect when .* was in capturing brackets
				8374	and the highest back reference was greater than or equal to that level.
				8375	However, by keeping a bitmap of the first 31 back references, we can catch some
				8376	of the more common cases more precisely.
				8377
				8378	... A second exception is when the .* appears inside an atomic group, because
				8379	this prevents the number of characters it matches from being adjusted.
				8380
				8381	Arguments:
				8382	code points to start of the compiled pattern
				8383	bracket_map a bitmap of which brackets we are inside while testing; this
				8384	handles up to substring 31; after that we just have to take
				8385	the less precise approach
				8386	cb points to the compile data block
				8387	atomcount atomic group level
				8388	inassert TRUE if in an assertion
				8389
				8390	Returns: TRUE or FALSE
				8391	*/
				8392
				8393	static BOOL
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	8394	is_anchored(PCRE2_SPTR code, uint32_t bracket_map, compile_block *cb,
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	8395	int atomcount, BOOL inassert)
				8396	{
				8397	do {
				8398	PCRE2_SPTR scode = first_significant_code(
				8399	code + PRIV(OP_lengths)[*code], FALSE);
				8400	int op = *scode;
				8401
				8402	/* Non-capturing brackets */
				8403
				8404	if (op == OP_BRA \|\| op == OP_BRAPOS \|\|
				8405	op == OP_SBRA \|\| op == OP_SBRAPOS)
				8406	{
				8407	if (!is_anchored(scode, bracket_map, cb, atomcount, inassert))
				8408	return FALSE;
				8409	}
				8410
				8411	/* Capturing brackets */
				8412
				8413	else if (op == OP_CBRA \|\| op == OP_CBRAPOS \|\|
				8414	op == OP_SCBRA \|\| op == OP_SCBRAPOS)
				8415	{
				8416	int n = GET2(scode, 1+LINK_SIZE);
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	8417	uint32_t new_map = bracket_map \| ((n < 32)? (1u << n) : 1);
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	8418	if (!is_anchored(scode, new_map, cb, atomcount, inassert)) return FALSE;
				8419	}
				8420
				8421	/* Positive forward assertion */
				8422
				8423	else if (op == OP_ASSERT \|\| op == OP_ASSERT_NA)
				8424	{
				8425	if (!is_anchored(scode, bracket_map, cb, atomcount, TRUE)) return FALSE;
				8426	}
				8427
				8428	/* Condition. If there is no second branch, it can't be anchored. */
				8429
				8430	else if (op == OP_COND \|\| op == OP_SCOND)
				8431	{
				8432	if (scode[GET(scode,1)] != OP_ALT) return FALSE;
				8433	if (!is_anchored(scode, bracket_map, cb, atomcount, inassert))
				8434	return FALSE;
				8435	}
				8436
				8437	/* Atomic groups */
				8438
				8439	else if (op == OP_ONCE)
				8440	{
				8441	if (!is_anchored(scode, bracket_map, cb, atomcount + 1, inassert))
				8442	return FALSE;
				8443	}
				8444
				8445	/* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
				8446	it isn't in brackets that are or may be referenced or inside an atomic
				8447	group or an assertion. Also the pattern must not contain PRUNE or SKIP,
				8448	because these break the feature. Consider, for example, /(?s).?(PRUNE)b/
				8449	with the subject "aab", which matches "b", i.e. not at the start of a line.
				8450	There is also an option that disables auto-anchoring. */
				8451
				8452	else if ((op == OP_TYPESTAR \|\| op == OP_TYPEMINSTAR \|\|
				8453	op == OP_TYPEPOSSTAR))
				8454	{
				8455	if (scode[1] != OP_ALLANY \|\| (bracket_map & cb->backref_map) != 0 \|\|
				8456	atomcount > 0 \|\| cb->had_pruneorskip \|\| inassert \|\|
				8457	(cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
				8458	return FALSE;
				8459	}
				8460
				8461	/* Check for explicit anchoring */
				8462
				8463	else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
				8464
				8465	code += GET(code, 1);
				8466	}
				8467	while (code == OP_ALT); / Loop for each alternative */
				8468	return TRUE;
				8469	}
				8470
				8471
				8472
				8473	/*************************************************
				8474	* Check for starting with ^ or .* *
				8475	*************************************************/
				8476
				8477	/* This is called to find out if every branch starts with ^ or .* so that
				8478	"first char" processing can be done to speed things up in multiline
				8479	matching and for non-DOTALL patterns that start with .* (which must start at
				8480	the beginning or after \n). As in the case of is_anchored() (see above), we
				8481	have to take account of back references to capturing brackets that contain .*
				8482	because in that case we can't make the assumption. Also, the appearance of .*
				8483	inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE
				8484	or *SKIP does not count, because once again the assumption no longer holds.
				8485
				8486	Arguments:
				8487	code points to start of the compiled pattern or a group
				8488	bracket_map a bitmap of which brackets we are inside while testing; this
				8489	handles up to substring 31; after that we just have to take
				8490	the less precise approach
				8491	cb points to the compile data
				8492	atomcount atomic group level
				8493	inassert TRUE if in an assertion
				8494
				8495	Returns: TRUE or FALSE
				8496	*/
				8497
				8498	static BOOL
				8499	is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
				8500	int atomcount, BOOL inassert)
				8501	{
				8502	do {
				8503	PCRE2_SPTR scode = first_significant_code(
				8504	code + PRIV(OP_lengths)[*code], FALSE);
				8505	int op = *scode;
				8506
				8507	/* If we are at the start of a conditional assertion group, both the
				8508	conditional assertion and what follows the condition must satisfy the test
				8509	for start of line. Other kinds of condition fail. Note that there may be an
				8510	auto-callout at the start of a condition. */
				8511
				8512	if (op == OP_COND)
				8513	{
				8514	scode += 1 + LINK_SIZE;
				8515
				8516	if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
				8517	else if (scode == OP_CALLOUT_STR) scode += GET(scode, 1 + 2LINK_SIZE);
				8518
				8519	switch (*scode)
				8520	{
				8521	case OP_CREF:
				8522	case OP_DNCREF:
				8523	case OP_RREF:
				8524	case OP_DNRREF:
				8525	case OP_FAIL:
				8526	case OP_FALSE:
				8527	case OP_TRUE:
				8528	return FALSE;
				8529
				8530	default: /* Assertion */
				8531	if (!is_startline(scode, bracket_map, cb, atomcount, TRUE)) return FALSE;
				8532	do scode += GET(scode, 1); while (*scode == OP_ALT);
				8533	scode += 1 + LINK_SIZE;
				8534	break;
				8535	}
				8536	scode = first_significant_code(scode, FALSE);
				8537	op = *scode;
				8538	}
				8539
				8540	/* Non-capturing brackets */
				8541
				8542	if (op == OP_BRA \|\| op == OP_BRAPOS \|\|
				8543	op == OP_SBRA \|\| op == OP_SBRAPOS)
				8544	{
				8545	if (!is_startline(scode, bracket_map, cb, atomcount, inassert))
				8546	return FALSE;
				8547	}
				8548
				8549	/* Capturing brackets */
				8550
				8551	else if (op == OP_CBRA \|\| op == OP_CBRAPOS \|\|
				8552	op == OP_SCBRA \|\| op == OP_SCBRAPOS)
				8553	{
				8554	int n = GET2(scode, 1+LINK_SIZE);
				8555	int new_map = bracket_map \| ((n < 32)? (1u << n) : 1);
				8556	if (!is_startline(scode, new_map, cb, atomcount, inassert)) return FALSE;
				8557	}
				8558
				8559	/* Positive forward assertions */
				8560
				8561	else if (op == OP_ASSERT \|\| op == OP_ASSERT_NA)
				8562	{
				8563	if (!is_startline(scode, bracket_map, cb, atomcount, TRUE))
				8564	return FALSE;
				8565	}
				8566
				8567	/* Atomic brackets */
				8568
				8569	else if (op == OP_ONCE)
				8570	{
				8571	if (!is_startline(scode, bracket_map, cb, atomcount + 1, inassert))
				8572	return FALSE;
				8573	}
				8574
				8575	/* .* means "start at start or after \n" if it isn't in atomic brackets or
				8576	brackets that may be referenced or an assertion, and as long as the pattern
				8577	does not contain PRUNE or SKIP, because these break the feature. Consider,
				8578	for example, /.?a(PRUNE)b/ with the subject "aab", which matches "ab",
				8579	i.e. not at the start of a line. There is also an option that disables this
				8580	optimization. */
				8581
				8582	else if (op == OP_TYPESTAR \|\| op == OP_TYPEMINSTAR \|\| op == OP_TYPEPOSSTAR)
				8583	{
				8584	if (scode[1] != OP_ANY \|\| (bracket_map & cb->backref_map) != 0 \|\|
				8585	atomcount > 0 \|\| cb->had_pruneorskip \|\| inassert \|\|
				8586	(cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
				8587	return FALSE;
				8588	}
				8589
				8590	/* Check for explicit circumflex; anything else gives a FALSE result. Note
				8591	in particular that this includes atomic brackets OP_ONCE because the number
				8592	of characters matched by .* cannot be adjusted inside them. */
				8593
				8594	else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
				8595
				8596	/* Move on to the next alternative */
				8597
				8598	code += GET(code, 1);
				8599	}
				8600	while (code == OP_ALT); / Loop for each alternative */
				8601	return TRUE;
				8602	}
				8603
				8604
				8605
				8606	/*************************************************
				8607	* Scan compiled regex for recursion reference *
				8608	*************************************************/
				8609
				8610	/* This function scans through a compiled pattern until it finds an instance of
				8611	OP_RECURSE.
				8612
				8613	Arguments:
				8614	code points to start of expression
				8615	utf TRUE in UTF mode
				8616
				8617	Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
				8618	*/
				8619
				8620	static PCRE2_SPTR
				8621	find_recurse(PCRE2_SPTR code, BOOL utf)
				8622	{
				8623	for (;;)
				8624	{
				8625	PCRE2_UCHAR c = *code;
				8626	if (c == OP_END) return NULL;
				8627	if (c == OP_RECURSE) return code;
				8628
				8629	/* XCLASS is used for classes that cannot be represented just by a bit map.
				8630	This includes negated single high-valued characters. CALLOUT_STR is used for
				8631	callouts with string arguments. In both cases the length in the table is
				8632	zero; the actual length is stored in the compiled code. */
				8633
				8634	if (c == OP_XCLASS) code += GET(code, 1);
				8635	else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE);
				8636
				8637	/* Otherwise, we can get the item's length from the table, except that for
				8638	repeated character types, we have to test for \p and \P, which have an extra
				8639	two code units of parameters, and for MARK/PRUNE/SKIP/THEN with an argument,
				8640	we must add in its length. */
				8641
				8642	else
				8643	{
				8644	switch(c)
				8645	{
				8646	case OP_TYPESTAR:
				8647	case OP_TYPEMINSTAR:
				8648	case OP_TYPEPLUS:
				8649	case OP_TYPEMINPLUS:
				8650	case OP_TYPEQUERY:
				8651	case OP_TYPEMINQUERY:
				8652	case OP_TYPEPOSSTAR:
				8653	case OP_TYPEPOSPLUS:
				8654	case OP_TYPEPOSQUERY:
				8655	if (code[1] == OP_PROP \|\| code[1] == OP_NOTPROP) code += 2;
				8656	break;
				8657
				8658	case OP_TYPEPOSUPTO:
				8659	case OP_TYPEUPTO:
				8660	case OP_TYPEMINUPTO:
				8661	case OP_TYPEEXACT:
				8662	if (code[1 + IMM2_SIZE] == OP_PROP \|\| code[1 + IMM2_SIZE] == OP_NOTPROP)
				8663	code += 2;
				8664	break;
				8665
				8666	case OP_MARK:
				8667	case OP_COMMIT_ARG:
				8668	case OP_PRUNE_ARG:
				8669	case OP_SKIP_ARG:
				8670	case OP_THEN_ARG:
				8671	code += code[1];
				8672	break;
				8673	}
				8674
				8675	/* Add in the fixed length from the table */
				8676
				8677	code += PRIV(OP_lengths)[c];
				8678
				8679	/* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may
				8680	be followed by a multi-unit character. The length in the table is a
				8681	minimum, so we have to arrange to skip the extra units. */
				8682
				8683	#ifdef MAYBE_UTF_MULTI
				8684	if (utf) switch(c)
				8685	{
				8686	case OP_CHAR:
				8687	case OP_CHARI:
				8688	case OP_NOT:
				8689	case OP_NOTI:
				8690	case OP_EXACT:
				8691	case OP_EXACTI:
				8692	case OP_NOTEXACT:
				8693	case OP_NOTEXACTI:
				8694	case OP_UPTO:
				8695	case OP_UPTOI:
				8696	case OP_NOTUPTO:
				8697	case OP_NOTUPTOI:
				8698	case OP_MINUPTO:
				8699	case OP_MINUPTOI:
				8700	case OP_NOTMINUPTO:
				8701	case OP_NOTMINUPTOI:
				8702	case OP_POSUPTO:
				8703	case OP_POSUPTOI:
				8704	case OP_NOTPOSUPTO:
				8705	case OP_NOTPOSUPTOI:
				8706	case OP_STAR:
				8707	case OP_STARI:
				8708	case OP_NOTSTAR:
				8709	case OP_NOTSTARI:
				8710	case OP_MINSTAR:
				8711	case OP_MINSTARI:
				8712	case OP_NOTMINSTAR:
				8713	case OP_NOTMINSTARI:
				8714	case OP_POSSTAR:
				8715	case OP_POSSTARI:
				8716	case OP_NOTPOSSTAR:
				8717	case OP_NOTPOSSTARI:
				8718	case OP_PLUS:
				8719	case OP_PLUSI:
				8720	case OP_NOTPLUS:
				8721	case OP_NOTPLUSI:
				8722	case OP_MINPLUS:
				8723	case OP_MINPLUSI:
				8724	case OP_NOTMINPLUS:
				8725	case OP_NOTMINPLUSI:
				8726	case OP_POSPLUS:
				8727	case OP_POSPLUSI:
				8728	case OP_NOTPOSPLUS:
				8729	case OP_NOTPOSPLUSI:
				8730	case OP_QUERY:
				8731	case OP_QUERYI:
				8732	case OP_NOTQUERY:
				8733	case OP_NOTQUERYI:
				8734	case OP_MINQUERY:
				8735	case OP_MINQUERYI:
				8736	case OP_NOTMINQUERY:
				8737	case OP_NOTMINQUERYI:
				8738	case OP_POSQUERY:
				8739	case OP_POSQUERYI:
				8740	case OP_NOTPOSQUERY:
				8741	case OP_NOTPOSQUERYI:
				8742	if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
				8743	break;
				8744	}
				8745	#else
				8746	(void)(utf); /* Keep compiler happy by referencing function argument */
				8747	#endif /* MAYBE_UTF_MULTI */
				8748	}
				8749	}
				8750	}
				8751
				8752
				8753
				8754	/*************************************************
				8755	* Check for asserted fixed first code unit *
				8756	*************************************************/
				8757
				8758	/* During compilation, the "first code unit" settings from forward assertions
				8759	are discarded, because they can cause conflicts with actual literals that
				8760	follow. However, if we end up without a first code unit setting for an
				8761	unanchored pattern, it is worth scanning the regex to see if there is an
				8762	initial asserted first code unit. If all branches start with the same asserted
				8763	code unit, or with a non-conditional bracket all of whose alternatives start
				8764	with the same asserted code unit (recurse ad lib), then we return that code
				8765	unit, with the flags set to zero or REQ_CASELESS; otherwise return zero with
				8766	REQ_NONE in the flags.
				8767
				8768	Arguments:
				8769	code points to start of compiled pattern
				8770	flags points to the first code unit flags
				8771	inassert non-zero if in an assertion
				8772
				8773	Returns: the fixed first code unit, or 0 with REQ_NONE in flags
				8774	*/
				8775
				8776	static uint32_t
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	8777	find_firstassertedcu(PCRE2_SPTR code, uint32_t *flags, uint32_t inassert)
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	8778	{
				8779	uint32_t c = 0;
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	8780	uint32_t cflags = REQ_NONE;
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	8781
				8782	*flags = REQ_NONE;
				8783	do {
				8784	uint32_t d;
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	8785	uint32_t dflags;
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	8786	int xl = (code == OP_CBRA \|\| code == OP_SCBRA \|\|
				8787	code == OP_CBRAPOS \|\| code == OP_SCBRAPOS)? IMM2_SIZE:0;
				8788	PCRE2_SPTR scode = first_significant_code(code + 1+LINK_SIZE + xl, TRUE);
				8789	PCRE2_UCHAR op = *scode;
				8790
				8791	switch(op)
				8792	{
				8793	default:
				8794	return 0;
				8795
				8796	case OP_BRA:
				8797	case OP_BRAPOS:
				8798	case OP_CBRA:
				8799	case OP_SCBRA:
				8800	case OP_CBRAPOS:
				8801	case OP_SCBRAPOS:
				8802	case OP_ASSERT:
				8803	case OP_ASSERT_NA:
				8804	case OP_ONCE:
				8805	case OP_SCRIPT_RUN:
				8806	d = find_firstassertedcu(scode, &dflags, inassert +
				8807	((op == OP_ASSERT \|\| op == OP_ASSERT_NA)?1:0));
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	8808	if (dflags >= REQ_NONE) return 0;
				8809	if (cflags >= REQ_NONE) { c = d; cflags = dflags; }
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	8810	else if (c != d \|\| cflags != dflags) return 0;
				8811	break;
				8812
				8813	case OP_EXACT:
				8814	scode += IMM2_SIZE;
				8815	/* Fall through */
				8816
				8817	case OP_CHAR:
				8818	case OP_PLUS:
				8819	case OP_MINPLUS:
				8820	case OP_POSPLUS:
				8821	if (inassert == 0) return 0;
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	8822	if (cflags >= REQ_NONE) { c = scode[1]; cflags = 0; }
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	8823	else if (c != scode[1]) return 0;
				8824	break;
				8825
				8826	case OP_EXACTI:
				8827	scode += IMM2_SIZE;
				8828	/* Fall through */
				8829
				8830	case OP_CHARI:
				8831	case OP_PLUSI:
				8832	case OP_MINPLUSI:
				8833	case OP_POSPLUSI:
				8834	if (inassert == 0) return 0;
				8835
				8836	/* If the character is more than one code unit long, we cannot set its
				8837	first code unit when matching caselessly. Later scanning may pick up
				8838	multiple code units. */
				8839
				8840	#ifdef SUPPORT_UNICODE
				8841	#if PCRE2_CODE_UNIT_WIDTH == 8
				8842	if (scode[1] >= 0x80) return 0;
				8843	#elif PCRE2_CODE_UNIT_WIDTH == 16
				8844	if (scode[1] >= 0xd800 && scode[1] <= 0xdfff) return 0;
				8845	#endif
				8846	#endif
				8847
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	8848	if (cflags >= REQ_NONE) { c = scode[1]; cflags = REQ_CASELESS; }
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	8849	else if (c != scode[1]) return 0;
				8850	break;
				8851	}
				8852
				8853	code += GET(code, 1);
				8854	}
				8855	while (*code == OP_ALT);
				8856
				8857	*flags = cflags;
				8858	return c;
				8859	}
				8860
				8861
				8862
				8863	/*************************************************
				8864	* Add an entry to the name/number table *
				8865	*************************************************/
				8866
				8867	/* This function is called between compiling passes to add an entry to the
				8868	name/number table, maintaining alphabetical order. Checking for permitted
				8869	and forbidden duplicates has already been done.
				8870
				8871	Arguments:
				8872	cb the compile data block
				8873	name the name to add
				8874	length the length of the name
				8875	groupno the group number
				8876	tablecount the count of names in the table so far
				8877
				8878	Returns: nothing
				8879	*/
				8880
				8881	static void
				8882	add_name_to_table(compile_block *cb, PCRE2_SPTR name, int length,
				8883	unsigned int groupno, uint32_t tablecount)
				8884	{
				8885	uint32_t i;
				8886	PCRE2_UCHAR *slot = cb->name_table;
				8887
				8888	for (i = 0; i < tablecount; i++)
				8889	{
				8890	int crc = memcmp(name, slot+IMM2_SIZE, CU2BYTES(length));
				8891	if (crc == 0 && slot[IMM2_SIZE+length] != 0)
				8892	crc = -1; /* Current name is a substring */
				8893
				8894	/* Make space in the table and break the loop for an earlier name. For a
				8895	duplicate or later name, carry on. We do this for duplicates so that in the
				8896	simple case (when ?(\| is not used) they are in order of their numbers. In all
				8897	cases they are in the order in which they appear in the pattern. */
				8898
				8899	if (crc < 0)
				8900	{
				8901	(void)memmove(slot + cb->name_entry_size, slot,
				8902	CU2BYTES((tablecount - i) * cb->name_entry_size));
				8903	break;
				8904	}
				8905
				8906	/* Continue the loop for a later or duplicate name */
				8907
				8908	slot += cb->name_entry_size;
				8909	}
				8910
				8911	PUT2(slot, 0, groupno);
				8912	memcpy(slot + IMM2_SIZE, name, CU2BYTES(length));
				8913
				8914	/* Add a terminating zero and fill the rest of the slot with zeroes so that
				8915	the memory is all initialized. Otherwise valgrind moans about uninitialized
				8916	memory when saving serialized compiled patterns. */
				8917
				8918	memset(slot + IMM2_SIZE + length, 0,
				8919	CU2BYTES(cb->name_entry_size - length - IMM2_SIZE));
				8920	}
				8921
				8922
				8923
				8924	/*************************************************
				8925	* Skip in parsed pattern *
				8926	*************************************************/
				8927
				8928	/* This function is called to skip parts of the parsed pattern when finding the
				8929	length of a lookbehind branch. It is called after (ACCEPT) and (FAIL) to find
				8930	the end of the branch, it is called to skip over an internal lookaround or
				8931	(DEFINE) group, and it is also called to skip to the end of a class, during
				8932	which it will never encounter nested groups (but there's no need to have
				8933	special code for that).
				8934
				8935	When called to find the end of a branch or group, pptr must point to the first
				8936	meta code inside the branch, not the branch-starting code. In other cases it
				8937	can point to the item that causes the function to be called.
				8938
				8939	Arguments:
				8940	pptr current pointer to skip from
				8941	skiptype PSKIP_CLASS when skipping to end of class
				8942	PSKIP_ALT when META_ALT ends the skip
				8943	PSKIP_KET when only META_KET ends the skip
				8944
				8945	Returns: new value of pptr
				8946	NULL if META_END is reached - should never occur
				8947	or for an unknown meta value - likewise
				8948	*/
				8949
				8950	static uint32_t *
				8951	parsed_skip(uint32_t *pptr, uint32_t skiptype)
				8952	{
				8953	uint32_t nestlevel = 0;
				8954
				8955	for (;; pptr++)
				8956	{
				8957	uint32_t meta = META_CODE(*pptr);
				8958
				8959	switch(meta)
				8960	{
				8961	default: /* Just skip over most items */
				8962	if (meta < META_END) continue; /* Literal */
				8963	break;
				8964
				8965	/* This should never occur. */
				8966
				8967	case META_END:
				8968	return NULL;
				8969
				8970	/* The data for these items is variable in length. */
				8971
				8972	case META_BACKREF: /* Offset is present only if group >= 10 */
				8973	if (META_DATA(*pptr) >= 10) pptr += SIZEOFFSET;
				8974	break;
				8975
				8976	case META_ESCAPE: /* A few escapes are followed by data items. */
				8977	switch (META_DATA(*pptr))
				8978	{
				8979	case ESC_P:
				8980	case ESC_p:
				8981	pptr += 1;
				8982	break;
				8983
				8984	case ESC_g:
				8985	case ESC_k:
				8986	pptr += 1 + SIZEOFFSET;
				8987	break;
				8988	}
				8989	break;
				8990
				8991	case META_MARK: /* Add the length of the name. */
				8992	case META_COMMIT_ARG:
				8993	case META_PRUNE_ARG:
				8994	case META_SKIP_ARG:
				8995	case META_THEN_ARG:
				8996	pptr += pptr[1];
				8997	break;
				8998
				8999	/* These are the "active" items in this loop. */
				9000
				9001	case META_CLASS_END:
				9002	if (skiptype == PSKIP_CLASS) return pptr;
				9003	break;
				9004
				9005	case META_ATOMIC:
				9006	case META_CAPTURE:
				9007	case META_COND_ASSERT:
				9008	case META_COND_DEFINE:
				9009	case META_COND_NAME:
				9010	case META_COND_NUMBER:
				9011	case META_COND_RNAME:
				9012	case META_COND_RNUMBER:
				9013	case META_COND_VERSION:
				9014	case META_LOOKAHEAD:
				9015	case META_LOOKAHEADNOT:
				9016	case META_LOOKAHEAD_NA:
				9017	case META_LOOKBEHIND:
				9018	case META_LOOKBEHINDNOT:
				9019	case META_LOOKBEHIND_NA:
				9020	case META_NOCAPTURE:
				9021	case META_SCRIPT_RUN:
				9022	nestlevel++;
				9023	break;
				9024
				9025	case META_ALT:
				9026	if (nestlevel == 0 && skiptype == PSKIP_ALT) return pptr;
				9027	break;
				9028
				9029	case META_KET:
				9030	if (nestlevel == 0) return pptr;
				9031	nestlevel--;
				9032	break;
				9033	}
				9034
				9035	/* The extra data item length for each meta is in a table. */
				9036
				9037	meta = (meta >> 16) & 0x7fff;
				9038	if (meta >= sizeof(meta_extra_lengths)) return NULL;
				9039	pptr += meta_extra_lengths[meta];
				9040	}
				9041	/* Control never reaches here */
				9042	return pptr;
				9043	}
				9044
				9045
				9046
				9047	/*************************************************
				9048	* Find length of a parsed group *
				9049	*************************************************/
				9050
				9051	/* This is called for nested groups within a branch of a lookbehind whose
				9052	length is being computed. If all the branches in the nested group have the same
				9053	length, that is OK. On entry, the pointer must be at the first element after
				9054	the group initializing code. On exit it points to OP_KET. Caching is used to
				9055	improve processing speed when the same capturing group occurs many times.
				9056
				9057	Arguments:
				9058	pptrptr pointer to pointer in the parsed pattern
				9059	isinline FALSE if a reference or recursion; TRUE for inline group
				9060	errcodeptr pointer to the errorcode
				9061	lcptr pointer to the loop counter
				9062	group number of captured group or -1 for a non-capturing group
				9063	recurses chain of recurse_check to catch mutual recursion
				9064	cb pointer to the compile data
				9065
				9066	Returns: the group length or a negative number
				9067	*/
				9068
				9069	static int
				9070	get_grouplength(uint32_t *pptrptr, BOOL isinline, int errcodeptr, int *lcptr,
				9071	int group, parsed_recurse_check recurses, compile_block cb)
				9072	{
				9073	int branchlength;
				9074	int grouplength = -1;
				9075
				9076	/* The cache can be used only if there is no possibility of there being two
				9077	groups with the same number. We do not need to set the end pointer for a group
				9078	that is being processed as a back reference or recursion, but we must do so for
				9079	an inline group. */
				9080
				9081	if (group > 0 && (cb->external_flags & PCRE2_DUPCAPUSED) == 0)
				9082	{
				9083	uint32_t groupinfo = cb->groupinfo[group];
				9084	if ((groupinfo & GI_NOT_FIXED_LENGTH) != 0) return -1;
				9085	if ((groupinfo & GI_SET_FIXED_LENGTH) != 0)
				9086	{
				9087	if (isinline) pptrptr = parsed_skip(pptrptr, PSKIP_KET);
				9088	return groupinfo & GI_FIXED_LENGTH_MASK;
				9089	}
				9090	}
				9091
				9092	/* Scan the group. In this case we find the end pointer of necessity. */
				9093
				9094	for(;;)
				9095	{
				9096	branchlength = get_branchlength(pptrptr, errcodeptr, lcptr, recurses, cb);
				9097	if (branchlength < 0) goto ISNOTFIXED;
				9098	if (grouplength == -1) grouplength = branchlength;
				9099	else if (grouplength != branchlength) goto ISNOTFIXED;
				9100	if (**pptrptr == META_KET) break;
				9101	pptrptr += 1; / Skip META_ALT */
				9102	}
				9103
				9104	if (group > 0)
				9105	cb->groupinfo[group] \|= (uint32_t)(GI_SET_FIXED_LENGTH \| grouplength);
				9106	return grouplength;
				9107
				9108	ISNOTFIXED:
				9109	if (group > 0) cb->groupinfo[group] \|= GI_NOT_FIXED_LENGTH;
				9110	return -1;
				9111	}
				9112
				9113
				9114
				9115	/*************************************************
				9116	* Find length of a parsed branch *
				9117	*************************************************/
				9118
				9119	/* Return a fixed length for a branch in a lookbehind, giving an error if the
				9120	length is not fixed. On entry, *pptrptr points to the first element inside the
				9121	branch. On exit it is set to point to the ALT or KET.
				9122
				9123	Arguments:
				9124	pptrptr pointer to pointer in the parsed pattern
				9125	errcodeptr pointer to error code
				9126	lcptr pointer to loop counter
				9127	recurses chain of recurse_check to catch mutual recursion
				9128	cb pointer to compile block
				9129
				9130	Returns: the length, or a negative value on error
				9131	*/
				9132
				9133	static int
				9134	get_branchlength(uint32_t *pptrptr, int errcodeptr, int *lcptr,
				9135	parsed_recurse_check recurses, compile_block cb)
				9136	{
				9137	int branchlength = 0;
				9138	int grouplength;
				9139	uint32_t lastitemlength = 0;
				9140	uint32_t pptr = pptrptr;
				9141	PCRE2_SIZE offset;
				9142	parsed_recurse_check this_recurse;
				9143
				9144	/* A large and/or complex regex can take too long to process. This can happen
				9145	more often when (?\| groups are present in the pattern because their length
				9146	cannot be cached. */
				9147
				9148	if ((*lcptr)++ > 2000)
				9149	{
				9150	errcodeptr = ERR35; / Lookbehind is too complicated */
				9151	return -1;
				9152	}
				9153
				9154	/* Scan the branch, accumulating the length. */
				9155
				9156	for (;; pptr++)
				9157	{
				9158	parsed_recurse_check *r;
				9159	uint32_t gptr, gptrend;
				9160	uint32_t escape;
				9161	uint32_t group = 0;
				9162	uint32_t itemlength = 0;
				9163
				9164	if (*pptr < META_END)
				9165	{
				9166	itemlength = 1;
				9167	}
				9168
				9169	else switch (META_CODE(*pptr))
				9170	{
				9171	case META_KET:
				9172	case META_ALT:
				9173	goto EXIT;
				9174
				9175	/* (ACCEPT) and (FAIL) terminate the branch, but we must skip to the
				9176	actual termination. */
				9177
				9178	case META_ACCEPT:
				9179	case META_FAIL:
				9180	pptr = parsed_skip(pptr, PSKIP_ALT);
				9181	if (pptr == NULL) goto PARSED_SKIP_FAILED;
				9182	goto EXIT;
				9183
				9184	case META_MARK:
				9185	case META_COMMIT_ARG:
				9186	case META_PRUNE_ARG:
				9187	case META_SKIP_ARG:
				9188	case META_THEN_ARG:
				9189	pptr += pptr[1] + 1;
				9190	break;
				9191
				9192	case META_CIRCUMFLEX:
				9193	case META_COMMIT:
				9194	case META_DOLLAR:
				9195	case META_PRUNE:
				9196	case META_SKIP:
				9197	case META_THEN:
				9198	break;
				9199
				9200	case META_OPTIONS:
				9201	pptr += 1;
				9202	break;
				9203
				9204	case META_BIGVALUE:
				9205	itemlength = 1;
				9206	pptr += 1;
				9207	break;
				9208
				9209	case META_CLASS:
				9210	case META_CLASS_NOT:
				9211	itemlength = 1;
				9212	pptr = parsed_skip(pptr, PSKIP_CLASS);
				9213	if (pptr == NULL) goto PARSED_SKIP_FAILED;
				9214	break;
				9215
				9216	case META_CLASS_EMPTY_NOT:
				9217	case META_DOT:
				9218	itemlength = 1;
				9219	break;
				9220
				9221	case META_CALLOUT_NUMBER:
				9222	pptr += 3;
				9223	break;
				9224
				9225	case META_CALLOUT_STRING:
				9226	pptr += 3 + SIZEOFFSET;
				9227	break;
				9228
				9229	/* Only some escapes consume a character. Of those, \R and \X are never
				9230	allowed because they might match more than character. \C is allowed only in
				9231	32-bit and non-UTF 8/16-bit modes. */
				9232
				9233	case META_ESCAPE:
				9234	escape = META_DATA(*pptr);
				9235	if (escape == ESC_R \|\| escape == ESC_X) return -1;
				9236	if (escape > ESC_b && escape < ESC_Z)
				9237	{
				9238	#if PCRE2_CODE_UNIT_WIDTH != 32
				9239	if ((cb->external_options & PCRE2_UTF) != 0 && escape == ESC_C)
				9240	{
				9241	*errcodeptr = ERR36;
				9242	return -1;
				9243	}
				9244	#endif
				9245	itemlength = 1;
				9246	if (escape == ESC_p \|\| escape == ESC_P) pptr++; /* Skip prop data */
				9247	}
				9248	break;
				9249
				9250	/* Lookaheads do not contribute to the length of this branch, but they may
				9251	contain lookbehinds within them whose lengths need to be set. */
				9252
				9253	case META_LOOKAHEAD:
				9254	case META_LOOKAHEADNOT:
				9255	case META_LOOKAHEAD_NA:
				9256	*errcodeptr = check_lookbehinds(pptr + 1, &pptr, recurses, cb, lcptr);
				9257	if (*errcodeptr != 0) return -1;
				9258
				9259	/* Ignore any qualifiers that follow a lookahead assertion. */
				9260
				9261	switch (pptr[1])
				9262	{
				9263	case META_ASTERISK:
				9264	case META_ASTERISK_PLUS:
				9265	case META_ASTERISK_QUERY:
				9266	case META_PLUS:
				9267	case META_PLUS_PLUS:
				9268	case META_PLUS_QUERY:
				9269	case META_QUERY:
				9270	case META_QUERY_PLUS:
				9271	case META_QUERY_QUERY:
				9272	pptr++;
				9273	break;
				9274
				9275	case META_MINMAX:
				9276	case META_MINMAX_PLUS:
				9277	case META_MINMAX_QUERY:
				9278	pptr += 3;
				9279	break;
				9280
				9281	default:
				9282	break;
				9283	}
				9284	break;
				9285
				9286	/* A nested lookbehind does not contribute any length to this lookbehind,
				9287	but must itself be checked and have its lengths set. */
				9288
				9289	case META_LOOKBEHIND:
				9290	case META_LOOKBEHINDNOT:
				9291	case META_LOOKBEHIND_NA:
				9292	if (!set_lookbehind_lengths(&pptr, errcodeptr, lcptr, recurses, cb))
				9293	return -1;
				9294	break;
				9295
				9296	/* Back references and recursions are handled by very similar code. At this
				9297	stage, the names generated in the parsing pass are available, but the main
				9298	name table has not yet been created. So for the named varieties, scan the
				9299	list of names in order to get the number of the first one in the pattern,
				9300	and whether or not this name is duplicated. */
				9301
				9302	case META_BACKREF_BYNAME:
				9303	if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0)
				9304	goto ISNOTFIXED;
				9305	/* Fall through */
				9306
				9307	case META_RECURSE_BYNAME:
				9308	{
				9309	int i;
				9310	PCRE2_SPTR name;
				9311	BOOL is_dupname = FALSE;
				9312	named_group *ng = cb->named_groups;
				9313	uint32_t meta_code = META_CODE(*pptr);
				9314	uint32_t length = *(++pptr);
				9315
				9316	GETPLUSOFFSET(offset, pptr);
				9317	name = cb->start_pattern + offset;
				9318	for (i = 0; i < cb->names_found; i++, ng++)
				9319	{
				9320	if (length == ng->length && PRIV(strncmp)(name, ng->name, length) == 0)
				9321	{
				9322	group = ng->number;
				9323	is_dupname = ng->isdup;
				9324	break;
				9325	}
				9326	}
				9327
				9328	if (group == 0)
				9329	{
				9330	errcodeptr = ERR15; / Non-existent subpattern */
				9331	cb->erroroffset = offset;
				9332	return -1;
				9333	}
				9334
				9335	/* A numerical back reference can be fixed length if duplicate capturing
				9336	groups are not being used. A non-duplicate named back reference can also
				9337	be handled. */
				9338
				9339	if (meta_code == META_RECURSE_BYNAME \|\|
				9340	(!is_dupname && (cb->external_flags & PCRE2_DUPCAPUSED) == 0))
				9341	goto RECURSE_OR_BACKREF_LENGTH; /* Handle as a numbered version. */
				9342	}
				9343	goto ISNOTFIXED; /* Duplicate name or number */
				9344
				9345	/* The offset values for back references < 10 are in a separate vector
				9346	because otherwise they would use more than two parsed pattern elements on
				9347	64-bit systems. */
				9348
				9349	case META_BACKREF:
				9350	if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0 \|\|
				9351	(cb->external_flags & PCRE2_DUPCAPUSED) != 0)
				9352	goto ISNOTFIXED;
				9353	group = META_DATA(*pptr);
				9354	if (group < 10)
				9355	{
				9356	offset = cb->small_ref_offset[group];
				9357	goto RECURSE_OR_BACKREF_LENGTH;
				9358	}
				9359
				9360	/* Fall through */
				9361	/* For groups >= 10 - picking up group twice does no harm. */
				9362
				9363	/* A true recursion implies not fixed length, but a subroutine call may
				9364	be OK. Back reference "recursions" are also failed. */
				9365
				9366	case META_RECURSE:
				9367	group = META_DATA(*pptr);
				9368	GETPLUSOFFSET(offset, pptr);
				9369
				9370	RECURSE_OR_BACKREF_LENGTH:
				9371	if (group > cb->bracount)
				9372	{
				9373	cb->erroroffset = offset;
				9374	errcodeptr = ERR15; / Non-existent subpattern */
				9375	return -1;
				9376	}
				9377	if (group == 0) goto ISNOTFIXED; /* Local recursion */
				9378	for (gptr = cb->parsed_pattern; *gptr != META_END; gptr++)
				9379	{
				9380	if (META_CODE(*gptr) == META_BIGVALUE) gptr++;
				9381	else if (*gptr == (META_CAPTURE \| group)) break;
				9382	}
				9383
				9384	/* We must start the search for the end of the group at the first meta code
				9385	inside the group. Otherwise it will be treated as an enclosed group. */
				9386
				9387	gptrend = parsed_skip(gptr + 1, PSKIP_KET);
				9388	if (gptrend == NULL) goto PARSED_SKIP_FAILED;
				9389	if (pptr > gptr && pptr < gptrend) goto ISNOTFIXED; /* Local recursion */
				9390	for (r = recurses; r != NULL; r = r->prev) if (r->groupptr == gptr) break;
				9391	if (r != NULL) goto ISNOTFIXED; /* Mutual recursion */
				9392	this_recurse.prev = recurses;
				9393	this_recurse.groupptr = gptr;
				9394
				9395	/* We do not need to know the position of the end of the group, that is,
				9396	gptr is not used after the call to get_grouplength(). Setting the second
				9397	argument FALSE stops it scanning for the end when the length can be found
				9398	in the cache. */
				9399
				9400	gptr++;
				9401	grouplength = get_grouplength(&gptr, FALSE, errcodeptr, lcptr, group,
				9402	&this_recurse, cb);
				9403	if (grouplength < 0)
				9404	{
				9405	if (*errcodeptr == 0) goto ISNOTFIXED;
				9406	return -1; /* Error already set */
				9407	}
				9408	itemlength = grouplength;
				9409	break;
				9410
				9411	/* A (DEFINE) group is never obeyed inline and so it does not contribute to
				9412	the length of this branch. Skip from the following item to the next
				9413	unpaired ket. */
				9414
				9415	case META_COND_DEFINE:
				9416	pptr = parsed_skip(pptr + 1, PSKIP_KET);
				9417	break;
				9418
				9419	/* Check other nested groups - advance past the initial data for each type
				9420	and then seek a fixed length with get_grouplength(). */
				9421
				9422	case META_COND_NAME:
				9423	case META_COND_NUMBER:
				9424	case META_COND_RNAME:
				9425	case META_COND_RNUMBER:
				9426	pptr += 2 + SIZEOFFSET;
				9427	goto CHECK_GROUP;
				9428
				9429	case META_COND_ASSERT:
				9430	pptr += 1;
				9431	goto CHECK_GROUP;
				9432
				9433	case META_COND_VERSION:
				9434	pptr += 4;
				9435	goto CHECK_GROUP;
				9436
				9437	case META_CAPTURE:
				9438	group = META_DATA(*pptr);
				9439	/* Fall through */
				9440
				9441	case META_ATOMIC:
				9442	case META_NOCAPTURE:
				9443	case META_SCRIPT_RUN:
				9444	pptr++;
				9445	CHECK_GROUP:
				9446	grouplength = get_grouplength(&pptr, TRUE, errcodeptr, lcptr, group,
				9447	recurses, cb);
				9448	if (grouplength < 0) return -1;
				9449	itemlength = grouplength;
				9450	break;
				9451
				9452	/* Exact repetition is OK; variable repetition is not. A repetition of zero
				9453	must subtract the length that has already been added. */
				9454
				9455	case META_MINMAX:
				9456	case META_MINMAX_PLUS:
				9457	case META_MINMAX_QUERY:
				9458	if (pptr[1] == pptr[2])
				9459	{
				9460	switch(pptr[1])
				9461	{
				9462	case 0:
				9463	branchlength -= lastitemlength;
				9464	break;
				9465
				9466	case 1:
				9467	itemlength = 0;
				9468	break;
				9469
				9470	default: /* Check for integer overflow */
				9471	if (lastitemlength != 0 && /* Should not occur, but just in case */
				9472	INT_MAX/lastitemlength < pptr[1] - 1)
				9473	{
				9474	errcodeptr = ERR87; / Integer overflow; lookbehind too big */
				9475	return -1;
				9476	}
				9477	itemlength = (pptr[1] - 1) * lastitemlength;
				9478	break;
				9479	}
				9480	pptr += 2;
				9481	break;
				9482	}
				9483	/* Fall through */
				9484
				9485	/* Any other item means this branch does not have a fixed length. */
				9486
				9487	default:
				9488	ISNOTFIXED:
				9489	errcodeptr = ERR25; / Not fixed length */
				9490	return -1;
				9491	}
				9492
				9493	/* Add the item length to the branchlength, checking for integer overflow and
				9494	for the branch length exceeding the limit. */
				9495
				9496	if (INT_MAX - branchlength < (int)itemlength \|\|
				9497	(branchlength += itemlength) > LOOKBEHIND_MAX)
				9498	{
				9499	*errcodeptr = ERR87;
				9500	return -1;
				9501	}
				9502
				9503	/* Save this item length for use if the next item is a quantifier. */
				9504
				9505	lastitemlength = itemlength;
				9506	}
				9507
				9508	EXIT:
				9509	*pptrptr = pptr;
				9510	return branchlength;
				9511
				9512	PARSED_SKIP_FAILED:
				9513	*errcodeptr = ERR90;
				9514	return -1;
				9515	}
				9516
				9517
				9518
				9519	/*************************************************
				9520	* Set lengths in a lookbehind *
				9521	*************************************************/
				9522
				9523	/* This function is called for each lookbehind, to set the lengths in its
				9524	branches. An error occurs if any branch does not have a fixed length that is
				9525	less than the maximum (65535). On exit, the pointer must be left on the final
				9526	ket.
				9527
				9528	The function also maintains the max_lookbehind value. Any lookbehind branch
				9529	that contains a nested lookbehind may actually look further back than the
				9530	length of the branch. The additional amount is passed back from
				9531	get_branchlength() as an "extra" value.
				9532
				9533	Arguments:
				9534	pptrptr pointer to pointer in the parsed pattern
				9535	errcodeptr pointer to error code
				9536	lcptr pointer to loop counter
				9537	recurses chain of recurse_check to catch mutual recursion
				9538	cb pointer to compile block
				9539
				9540	Returns: TRUE if all is well
				9541	FALSE otherwise, with error code and offset set
				9542	*/
				9543
				9544	static BOOL
				9545	set_lookbehind_lengths(uint32_t *pptrptr, int errcodeptr, int *lcptr,
				9546	parsed_recurse_check recurses, compile_block cb)
				9547	{
				9548	PCRE2_SIZE offset;
				9549	int branchlength;
				9550	uint32_t bptr = pptrptr;
				9551
				9552	READPLUSOFFSET(offset, bptr); /* Offset for error messages */
				9553	*pptrptr += SIZEOFFSET;
				9554
				9555	do
				9556	{
				9557	*pptrptr += 1;
				9558	branchlength = get_branchlength(pptrptr, errcodeptr, lcptr, recurses, cb);
				9559	if (branchlength < 0)
				9560	{
				9561	/* The errorcode and offset may already be set from a nested lookbehind. */
				9562	if (errcodeptr == 0) errcodeptr = ERR25;
				9563	if (cb->erroroffset == PCRE2_UNSET) cb->erroroffset = offset;
				9564	return FALSE;
				9565	}
				9566	if (branchlength > cb->max_lookbehind) cb->max_lookbehind = branchlength;
				9567	bptr \|= branchlength; / branchlength never more than 65535 */
				9568	bptr = *pptrptr;
				9569	}
				9570	while (*bptr == META_ALT);
				9571
				9572	return TRUE;
				9573	}
				9574
				9575
				9576
				9577	/*************************************************
				9578	* Check parsed pattern lookbehinds *
				9579	*************************************************/
				9580
				9581	/* This function is called at the end of parsing a pattern if any lookbehinds
				9582	were encountered. It scans the parsed pattern for them, calling
				9583	set_lookbehind_lengths() for each one. At the start, the errorcode is zero and
				9584	the error offset is marked unset. The enables the functions above not to
				9585	override settings from deeper nestings.
				9586
				9587	This function is called recursively from get_branchlength() for lookaheads in
				9588	order to process any lookbehinds that they may contain. It stops when it hits a
				9589	non-nested closing parenthesis in this case, returning a pointer to it.
				9590
				9591	Arguments
				9592	pptr points to where to start (start of pattern or start of lookahead)
				9593	retptr if not NULL, return the ket pointer here
				9594	recurses chain of recurse_check to catch mutual recursion
				9595	cb points to the compile block
				9596	lcptr points to loop counter
				9597
				9598	Returns: 0 on success, or an errorcode (cb->erroroffset will be set)
				9599	*/
				9600
				9601	static int
				9602	check_lookbehinds(uint32_t pptr, uint32_t *retptr,
				9603	parsed_recurse_check recurses, compile_block cb, int *lcptr)
				9604	{
				9605	int errorcode = 0;
				9606	int nestlevel = 0;
				9607
				9608	cb->erroroffset = PCRE2_UNSET;
				9609
				9610	for (; *pptr != META_END; pptr++)
				9611	{
				9612	if (pptr < META_END) continue; / Literal */
				9613
				9614	switch (META_CODE(*pptr))
				9615	{
				9616	default:
				9617	return ERR70; /* Unrecognized meta code */
				9618
				9619	case META_ESCAPE:
				9620	if (pptr - META_ESCAPE == ESC_P \|\| pptr - META_ESCAPE == ESC_p)
				9621	pptr += 1;
				9622	break;
				9623
				9624	case META_KET:
				9625	if (--nestlevel < 0)
				9626	{
				9627	if (retptr != NULL) *retptr = pptr;
				9628	return 0;
				9629	}
				9630	break;
				9631
				9632	case META_ATOMIC:
				9633	case META_CAPTURE:
				9634	case META_COND_ASSERT:
				9635	case META_LOOKAHEAD:
				9636	case META_LOOKAHEADNOT:
				9637	case META_LOOKAHEAD_NA:
				9638	case META_NOCAPTURE:
				9639	case META_SCRIPT_RUN:
				9640	nestlevel++;
				9641	break;
				9642
				9643	case META_ACCEPT:
				9644	case META_ALT:
				9645	case META_ASTERISK:
				9646	case META_ASTERISK_PLUS:
				9647	case META_ASTERISK_QUERY:
				9648	case META_BACKREF:
				9649	case META_CIRCUMFLEX:
				9650	case META_CLASS:
				9651	case META_CLASS_EMPTY:
				9652	case META_CLASS_EMPTY_NOT:
				9653	case META_CLASS_END:
				9654	case META_CLASS_NOT:
				9655	case META_COMMIT:
				9656	case META_DOLLAR:
				9657	case META_DOT:
				9658	case META_FAIL:
				9659	case META_PLUS:
				9660	case META_PLUS_PLUS:
				9661	case META_PLUS_QUERY:
				9662	case META_PRUNE:
				9663	case META_QUERY:
				9664	case META_QUERY_PLUS:
				9665	case META_QUERY_QUERY:
				9666	case META_RANGE_ESCAPED:
				9667	case META_RANGE_LITERAL:
				9668	case META_SKIP:
				9669	case META_THEN:
				9670	break;
				9671
				9672	case META_RECURSE:
				9673	pptr += SIZEOFFSET;
				9674	break;
				9675
				9676	case META_BACKREF_BYNAME:
				9677	case META_RECURSE_BYNAME:
				9678	pptr += 1 + SIZEOFFSET;
				9679	break;
				9680
				9681	case META_COND_DEFINE:
				9682	pptr += SIZEOFFSET;
				9683	nestlevel++;
				9684	break;
				9685
				9686	case META_COND_NAME:
				9687	case META_COND_NUMBER:
				9688	case META_COND_RNAME:
				9689	case META_COND_RNUMBER:
				9690	pptr += 1 + SIZEOFFSET;
				9691	nestlevel++;
				9692	break;
				9693
				9694	case META_COND_VERSION:
				9695	pptr += 3;
				9696	nestlevel++;
				9697	break;
				9698
				9699	case META_CALLOUT_STRING:
				9700	pptr += 3 + SIZEOFFSET;
				9701	break;
				9702
				9703	case META_BIGVALUE:
				9704	case META_OPTIONS:
				9705	case META_POSIX:
				9706	case META_POSIX_NEG:
				9707	pptr += 1;
				9708	break;
				9709
				9710	case META_MINMAX:
				9711	case META_MINMAX_QUERY:
				9712	case META_MINMAX_PLUS:
				9713	pptr += 2;
				9714	break;
				9715
				9716	case META_CALLOUT_NUMBER:
				9717	pptr += 3;
				9718	break;
				9719
				9720	case META_MARK:
				9721	case META_COMMIT_ARG:
				9722	case META_PRUNE_ARG:
				9723	case META_SKIP_ARG:
				9724	case META_THEN_ARG:
				9725	pptr += 1 + pptr[1];
				9726	break;
				9727
				9728	case META_LOOKBEHIND:
				9729	case META_LOOKBEHINDNOT:
				9730	case META_LOOKBEHIND_NA:
				9731	if (!set_lookbehind_lengths(&pptr, &errorcode, lcptr, recurses, cb))
				9732	return errorcode;
				9733	break;
				9734	}
				9735	}
				9736
				9737	return 0;
				9738	}
				9739
				9740
				9741
				9742	/*************************************************
				9743	* External function to compile a pattern *
				9744	*************************************************/
				9745
				9746	/* This function reads a regular expression in the form of a string and returns
				9747	a pointer to a block of store holding a compiled version of the expression.
				9748
				9749	Arguments:
				9750	pattern the regular expression
				9751	patlen the length of the pattern, or PCRE2_ZERO_TERMINATED
				9752	options option bits
				9753	errorptr pointer to errorcode
				9754	erroroffset pointer to error offset
				9755	ccontext points to a compile context or is NULL
				9756
				9757	Returns: pointer to compiled data block, or NULL on error,
				9758	with errorcode and erroroffset set
				9759	*/
				9760
				9761	PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
				9762	pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE patlen, uint32_t options,
				9763	int errorptr, PCRE2_SIZE erroroffset, pcre2_compile_context *ccontext)
				9764	{
				9765	BOOL utf; /* Set TRUE for UTF mode */
				9766	BOOL ucp; /* Set TRUE for UCP mode */
				9767	BOOL has_lookbehind = FALSE; /* Set TRUE if a lookbehind is found */
				9768	BOOL zero_terminated; /* Set TRUE for zero-terminated pattern */
				9769	pcre2_real_code re = NULL; / What we will return */
				9770	compile_block cb; /* "Static" compile-time data */
				9771	const uint8_t tables; / Char tables base pointer */
				9772
				9773	PCRE2_UCHAR code; / Current pointer in compiled code */
				9774	PCRE2_SPTR codestart; /* Start of compiled code */
				9775	PCRE2_SPTR ptr; /* Current pointer in pattern */
				9776	uint32_t pptr; / Current pointer in parsed pattern */
				9777
				9778	PCRE2_SIZE length = 1; /* Allow for final END opcode */
				9779	PCRE2_SIZE usedlength; /* Actual length used */
				9780	PCRE2_SIZE re_blocksize; /* Size of memory block */
				9781	PCRE2_SIZE big32count = 0; /* 32-bit literals >= 0x80000000 */
				9782	PCRE2_SIZE parsed_size_needed; /* Needed for parsed pattern */
				9783
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	9784	uint32_t firstcuflags, reqcuflags; /* Type of first/req code unit */
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	9785	uint32_t firstcu, reqcu; /* Value of first/req code unit */
				9786	uint32_t setflags = 0; /* NL and BSR set flags */
				9787
				9788	uint32_t skipatstart; /* When checking (UTF) etc /
				9789	uint32_t limit_heap = UINT32_MAX;
				9790	uint32_t limit_match = UINT32_MAX; /* Unset match limits */
				9791	uint32_t limit_depth = UINT32_MAX;
				9792
				9793	int newline = 0; /* Unset; can be set by the pattern */
				9794	int bsr = 0; /* Unset; can be set by the pattern */
				9795	int errorcode = 0; /* Initialize to avoid compiler warn */
				9796	int regexrc; /* Return from compile */
				9797
				9798	uint32_t i; /* Local loop counter */
				9799
				9800	/* Comments at the head of this file explain about these variables. */
				9801
				9802	uint32_t stack_groupinfo[GROUPINFO_DEFAULT_SIZE];
				9803	uint32_t stack_parsed_pattern[PARSED_PATTERN_DEFAULT_SIZE];
				9804	named_group named_groups[NAMED_GROUP_LIST_SIZE];
				9805
				9806	/* The workspace is used in different ways in the different compiling phases.
				9807	It needs to be 16-bit aligned for the preliminary parsing scan. */
				9808
				9809	uint32_t c16workspace[C16_WORK_SIZE];
				9810	PCRE2_UCHAR cworkspace = (PCRE2_UCHAR )c16workspace;
				9811
				9812
				9813	/* -------------- Check arguments and set up the pattern ----------------- */
				9814
				9815	/* There must be error code and offset pointers. */
				9816
				9817	if (errorptr == NULL \|\| erroroffset == NULL) return NULL;
				9818	*errorptr = ERR0;
				9819	*erroroffset = 0;
				9820
				9821	/* There must be a pattern! */
				9822
				9823	if (pattern == NULL)
				9824	{
				9825	*errorptr = ERR16;
				9826	return NULL;
				9827	}
				9828
				9829	/* A NULL compile context means "use a default context" */
				9830
				9831	if (ccontext == NULL)
				9832	ccontext = (pcre2_compile_context *)(&PRIV(default_compile_context));
				9833
				9834	/* PCRE2_MATCH_INVALID_UTF implies UTF */
				9835
				9836	if ((options & PCRE2_MATCH_INVALID_UTF) != 0) options \|= PCRE2_UTF;
				9837
				9838	/* Check that all undefined public option bits are zero. */
				9839
				9840	if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0 \|\|
				9841	(ccontext->extra_options & ~PUBLIC_COMPILE_EXTRA_OPTIONS) != 0)
				9842	{
				9843	*errorptr = ERR17;
				9844	return NULL;
				9845	}
				9846
				9847	if ((options & PCRE2_LITERAL) != 0 &&
				9848	((options & ~PUBLIC_LITERAL_COMPILE_OPTIONS) != 0 \|\|
				9849	(ccontext->extra_options & ~PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS) != 0))
				9850	{
				9851	*errorptr = ERR92;
				9852	return NULL;
				9853	}
				9854
				9855	/* A zero-terminated pattern is indicated by the special length value
				9856	PCRE2_ZERO_TERMINATED. Check for an overlong pattern. */
				9857
				9858	if ((zero_terminated = (patlen == PCRE2_ZERO_TERMINATED)))
				9859	patlen = PRIV(strlen)(pattern);
				9860
				9861	if (patlen > ccontext->max_pattern_length)
				9862	{
				9863	*errorptr = ERR88;
				9864	return NULL;
				9865	}
				9866
				9867	/* From here on, all returns from this function should end up going via the
				9868	EXIT label. */
				9869
				9870
				9871	/* ------------ Initialize the "static" compile data -------------- */
				9872
				9873	tables = (ccontext->tables != NULL)? ccontext->tables : PRIV(default_tables);
				9874
				9875	cb.lcc = tables + lcc_offset; /* Individual */
				9876	cb.fcc = tables + fcc_offset; /* character */
				9877	cb.cbits = tables + cbits_offset; /* tables */
				9878	cb.ctypes = tables + ctypes_offset;
				9879
				9880	cb.assert_depth = 0;
				9881	cb.bracount = 0;
				9882	cb.cx = ccontext;
				9883	cb.dupnames = FALSE;
				9884	cb.end_pattern = pattern + patlen;
				9885	cb.erroroffset = 0;
				9886	cb.external_flags = 0;
				9887	cb.external_options = options;
				9888	cb.groupinfo = stack_groupinfo;
				9889	cb.had_recurse = FALSE;
				9890	cb.lastcapture = 0;
				9891	cb.max_lookbehind = 0;
				9892	cb.name_entry_size = 0;
				9893	cb.name_table = NULL;
				9894	cb.named_groups = named_groups;
				9895	cb.named_group_list_size = NAMED_GROUP_LIST_SIZE;
				9896	cb.names_found = 0;
				9897	cb.open_caps = NULL;
				9898	cb.parens_depth = 0;
				9899	cb.parsed_pattern = stack_parsed_pattern;
				9900	cb.req_varyopt = 0;
				9901	cb.start_code = cworkspace;
				9902	cb.start_pattern = pattern;
				9903	cb.start_workspace = cworkspace;
				9904	cb.workspace_size = COMPILE_WORK_SIZE;
				9905
				9906	/* Maximum back reference and backref bitmap. The bitmap records up to 31 back
				9907	references to help in deciding whether (.*) can be treated as anchored or not.
				9908	*/
				9909
				9910	cb.top_backref = 0;
				9911	cb.backref_map = 0;
				9912
				9913	/* Escape sequences \1 to \9 are always back references, but as they are only
				9914	two characters long, only two elements can be used in the parsed_pattern
				9915	vector. The first contains the reference, and we'd like to use the second to
				9916	record the offset in the pattern, so that forward references to non-existent
				9917	groups can be diagnosed later with an offset. However, on 64-bit systems,
				9918	PCRE2_SIZE won't fit. Instead, we have a vector of offsets for the first
				9919	occurrence of \1 to \9, indexed by the second parsed_pattern value. All other
				9920	references have enough space for the offset to be put into the parsed pattern.
				9921	*/
				9922
				9923	for (i = 0; i < 10; i++) cb.small_ref_offset[i] = PCRE2_UNSET;
				9924
				9925
				9926	/* --------------- Start looking at the pattern --------------- */
				9927
				9928	/* Unless PCRE2_LITERAL is set, check for global one-time option settings at
				9929	the start of the pattern, and remember the offset to the actual regex. With
				9930	valgrind support, make the terminator of a zero-terminated pattern
				9931	inaccessible. This catches bugs that would otherwise only show up for
				9932	non-zero-terminated patterns. */
				9933
				9934	#ifdef SUPPORT_VALGRIND
				9935	if (zero_terminated) VALGRIND_MAKE_MEM_NOACCESS(pattern + patlen, CU2BYTES(1));
				9936	#endif
				9937
				9938	ptr = pattern;
				9939	skipatstart = 0;
				9940
				9941	if ((options & PCRE2_LITERAL) == 0)
				9942	{
				9943	while (patlen - skipatstart >= 2 &&
				9944	ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
				9945	ptr[skipatstart+1] == CHAR_ASTERISK)
				9946	{
				9947	for (i = 0; i < sizeof(pso_list)/sizeof(pso); i++)
				9948	{
				9949	uint32_t c, pp;
				9950	pso *p = pso_list + i;
				9951
				9952	if (patlen - skipatstart - 2 >= p->length &&
				9953	PRIV(strncmp_c8)(ptr + skipatstart + 2, (char *)(p->name),
				9954	p->length) == 0)
				9955	{
				9956	skipatstart += p->length + 2;
				9957	switch(p->type)
				9958	{
				9959	case PSO_OPT:
				9960	cb.external_options \|= p->value;
				9961	break;
				9962
				9963	case PSO_FLG:
				9964	setflags \|= p->value;
				9965	break;
				9966
				9967	case PSO_NL:
				9968	newline = p->value;
				9969	setflags \|= PCRE2_NL_SET;
				9970	break;
				9971
				9972	case PSO_BSR:
				9973	bsr = p->value;
				9974	setflags \|= PCRE2_BSR_SET;
				9975	break;
				9976
				9977	case PSO_LIMM:
				9978	case PSO_LIMD:
				9979	case PSO_LIMH:
				9980	c = 0;
				9981	pp = skipatstart;
				9982	if (!IS_DIGIT(ptr[pp]))
				9983	{
				9984	errorcode = ERR60;
				9985	ptr += pp;
				9986	goto HAD_EARLY_ERROR;
				9987	}
				9988	while (IS_DIGIT(ptr[pp]))
				9989	{
				9990	if (c > UINT32_MAX / 10 - 1) break; /* Integer overflow */
				9991	c = c*10 + (ptr[pp++] - CHAR_0);
				9992	}
				9993	if (ptr[pp++] != CHAR_RIGHT_PARENTHESIS)
				9994	{
				9995	errorcode = ERR60;
				9996	ptr += pp;
				9997	goto HAD_EARLY_ERROR;
				9998	}
				9999	if (p->type == PSO_LIMH) limit_heap = c;
				10000	else if (p->type == PSO_LIMM) limit_match = c;
				10001	else limit_depth = c;
				10002	skipatstart += pp - skipatstart;
				10003	break;
				10004	}
				10005	break; /* Out of the table scan loop */
				10006	}
				10007	}
				10008	if (i >= sizeof(pso_list)/sizeof(pso)) break; /* Out of pso loop */
				10009	}
				10010	}
				10011
				10012	/* End of pattern-start options; advance to start of real regex. */
				10013
				10014	ptr += skipatstart;
				10015
				10016	/* Can't support UTF or UCP if PCRE2 was built without Unicode support. */
				10017
				10018	#ifndef SUPPORT_UNICODE
				10019	if ((cb.external_options & (PCRE2_UTF\|PCRE2_UCP)) != 0)
				10020	{
				10021	errorcode = ERR32;
				10022	goto HAD_EARLY_ERROR;
				10023	}
				10024	#endif
				10025
				10026	/* Check UTF. We have the original options in 'options', with that value as
				10027	modified by (*UTF) etc in cb->external_options. The extra option
				10028	PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not permitted in UTF-16 mode because the
				10029	surrogate code points cannot be represented in UTF-16. */
				10030
				10031	utf = (cb.external_options & PCRE2_UTF) != 0;
				10032	if (utf)
				10033	{
				10034	if ((options & PCRE2_NEVER_UTF) != 0)
				10035	{
				10036	errorcode = ERR74;
				10037	goto HAD_EARLY_ERROR;
				10038	}
				10039	if ((options & PCRE2_NO_UTF_CHECK) == 0 &&
				10040	(errorcode = PRIV(valid_utf)(pattern, patlen, erroroffset)) != 0)
				10041	goto HAD_ERROR; /* Offset was set by valid_utf() */
				10042
				10043	#if PCRE2_CODE_UNIT_WIDTH == 16
				10044	if ((ccontext->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) != 0)
				10045	{
				10046	errorcode = ERR91;
				10047	goto HAD_EARLY_ERROR;
				10048	}
				10049	#endif
				10050	}
				10051
				10052	/* Check UCP lockout. */
				10053
				10054	ucp = (cb.external_options & PCRE2_UCP) != 0;
				10055	if (ucp && (cb.external_options & PCRE2_NEVER_UCP) != 0)
				10056	{
				10057	errorcode = ERR75;
				10058	goto HAD_EARLY_ERROR;
				10059	}
				10060
				10061	/* Process the BSR setting. */
				10062
				10063	if (bsr == 0) bsr = ccontext->bsr_convention;
				10064
				10065	/* Process the newline setting. */
				10066
				10067	if (newline == 0) newline = ccontext->newline_convention;
				10068	cb.nltype = NLTYPE_FIXED;
				10069	switch(newline)
				10070	{
				10071	case PCRE2_NEWLINE_CR:
				10072	cb.nllen = 1;
				10073	cb.nl[0] = CHAR_CR;
				10074	break;
				10075
				10076	case PCRE2_NEWLINE_LF:
				10077	cb.nllen = 1;
				10078	cb.nl[0] = CHAR_NL;
				10079	break;
				10080
				10081	case PCRE2_NEWLINE_NUL:
				10082	cb.nllen = 1;
				10083	cb.nl[0] = CHAR_NUL;
				10084	break;
				10085
				10086	case PCRE2_NEWLINE_CRLF:
				10087	cb.nllen = 2;
				10088	cb.nl[0] = CHAR_CR;
				10089	cb.nl[1] = CHAR_NL;
				10090	break;
				10091
				10092	case PCRE2_NEWLINE_ANY:
				10093	cb.nltype = NLTYPE_ANY;
				10094	break;
				10095
				10096	case PCRE2_NEWLINE_ANYCRLF:
				10097	cb.nltype = NLTYPE_ANYCRLF;
				10098	break;
				10099
				10100	default:
				10101	errorcode = ERR56;
				10102	goto HAD_EARLY_ERROR;
				10103	}
				10104
				10105	/* Pre-scan the pattern to do two things: (1) Discover the named groups and
				10106	their numerical equivalents, so that this information is always available for
				10107	the remaining processing. (2) At the same time, parse the pattern and put a
				10108	processed version into the parsed_pattern vector. This has escapes interpreted
				10109	and comments removed (amongst other things).
				10110
				10111	In all but one case, when PCRE2_AUTO_CALLOUT is not set, the number of unsigned
				10112	32-bit ints in the parsed pattern is bounded by the length of the pattern plus
				10113	one (for the terminator) plus four if PCRE2_EXTRA_WORD or PCRE2_EXTRA_LINE is
				10114	set. The exceptional case is when running in 32-bit, non-UTF mode, when literal
				10115	characters greater than META_END (0x80000000) have to be coded as two units. In
				10116	this case, therefore, we scan the pattern to check for such values. */
				10117
				10118	#if PCRE2_CODE_UNIT_WIDTH == 32
				10119	if (!utf)
				10120	{
				10121	PCRE2_SPTR p;
				10122	for (p = ptr; p < cb.end_pattern; p++) if (*p >= META_END) big32count++;
				10123	}
				10124	#endif
				10125
				10126	/* Ensure that the parsed pattern buffer is big enough. When PCRE2_AUTO_CALLOUT
				10127	is set we have to assume a numerical callout (4 elements) for each character
				10128	plus one at the end. This is overkill, but memory is plentiful these days. For
				10129	many smaller patterns the vector on the stack (which was set up above) can be
				10130	used. */
				10131
				10132	parsed_size_needed = patlen - skipatstart + big32count;
				10133
				10134	if ((ccontext->extra_options &
				10135	(PCRE2_EXTRA_MATCH_WORD\|PCRE2_EXTRA_MATCH_LINE)) != 0)
				10136	parsed_size_needed += 4;
				10137
				10138	if ((options & PCRE2_AUTO_CALLOUT) != 0)
				10139	parsed_size_needed = (parsed_size_needed + 1) * 5;
				10140
				10141	if (parsed_size_needed >= PARSED_PATTERN_DEFAULT_SIZE)
				10142	{
				10143	uint32_t *heap_parsed_pattern = ccontext->memctl.malloc(
				10144	(parsed_size_needed + 1) * sizeof(uint32_t), ccontext->memctl.memory_data);
				10145	if (heap_parsed_pattern == NULL)
				10146	{
				10147	*errorptr = ERR21;
				10148	goto EXIT;
				10149	}
				10150	cb.parsed_pattern = heap_parsed_pattern;
				10151	}
				10152	cb.parsed_pattern_end = cb.parsed_pattern + parsed_size_needed + 1;
				10153
				10154	/* Do the parsing scan. */
				10155
				10156	errorcode = parse_regex(ptr, cb.external_options, &has_lookbehind, &cb);
				10157	if (errorcode != 0) goto HAD_CB_ERROR;
				10158
				10159	/* Workspace is needed to remember information about numbered groups: whether a
				10160	group can match an empty string and what its fixed length is. This is done to
				10161	avoid the possibility of recursive references causing very long compile times
				10162	when checking these features. Unnumbered groups do not have this exposure since
				10163	they cannot be referenced. We use an indexed vector for this purpose. If there
				10164	are sufficiently few groups, the default vector on the stack, as set up above,
				10165	can be used. Otherwise we have to get/free a special vector. The vector must be
				10166	initialized to zero. */
				10167
				10168	if (cb.bracount >= GROUPINFO_DEFAULT_SIZE)
				10169	{
				10170	cb.groupinfo = ccontext->memctl.malloc(
				10171	(cb.bracount + 1)*sizeof(uint32_t), ccontext->memctl.memory_data);
				10172	if (cb.groupinfo == NULL)
				10173	{
				10174	errorcode = ERR21;
				10175	cb.erroroffset = 0;
				10176	goto HAD_CB_ERROR;
				10177	}
				10178	}
				10179	memset(cb.groupinfo, 0, (cb.bracount + 1) * sizeof(uint32_t));
				10180
				10181	/* If there were any lookbehinds, scan the parsed pattern to figure out their
				10182	lengths. */
				10183
				10184	if (has_lookbehind)
				10185	{
				10186	int loopcount = 0;
				10187	errorcode = check_lookbehinds(cb.parsed_pattern, NULL, NULL, &cb, &loopcount);
				10188	if (errorcode != 0) goto HAD_CB_ERROR;
				10189	}
				10190
				10191	/* For debugging, there is a function that shows the parsed data vector. */
				10192
				10193	#ifdef DEBUG_SHOW_PARSED
				10194	fprintf(stderr, "+++ Pre-scan complete:\n");
				10195	show_parsed(&cb);
				10196	#endif
				10197
				10198	/* For debugging capturing information this code can be enabled. */
				10199
				10200	#ifdef DEBUG_SHOW_CAPTURES
				10201	{
				10202	named_group *ng = cb.named_groups;
				10203	fprintf(stderr, "+++Captures: %d\n", cb.bracount);
				10204	for (i = 0; i < cb.names_found; i++, ng++)
				10205	{
				10206	fprintf(stderr, "+++%3d %.*s\n", ng->number, ng->length, ng->name);
				10207	}
				10208	}
				10209	#endif
				10210
				10211	/* Pretend to compile the pattern while actually just accumulating the amount
				10212	of memory required in the 'length' variable. This behaviour is triggered by
				10213	passing a non-NULL final argument to compile_regex(). We pass a block of
				10214	workspace (cworkspace) for it to compile parts of the pattern into; the
				10215	compiled code is discarded when it is no longer needed, so hopefully this
				10216	workspace will never overflow, though there is a test for its doing so.
				10217
				10218	On error, errorcode will be set non-zero, so we don't need to look at the
				10219	result of the function. The initial options have been put into the cb block,
				10220	but we still have to pass a separate options variable (the first argument)
				10221	because the options may change as the pattern is processed. */
				10222
				10223	cb.erroroffset = patlen; /* For any subsequent errors that do not set it */
				10224	pptr = cb.parsed_pattern;
				10225	code = cworkspace;
				10226	*code = OP_BRA;
				10227
				10228	(void)compile_regex(cb.external_options, &code, &pptr, &errorcode, 0, &firstcu,
				10229	&firstcuflags, &reqcu, &reqcuflags, NULL, &cb, &length);
				10230
				10231	if (errorcode != 0) goto HAD_CB_ERROR; /* Offset is in cb.erroroffset */
				10232
				10233	/* This should be caught in compile_regex(), but just in case... */
				10234
				10235	if (length > MAX_PATTERN_SIZE)
				10236	{
				10237	errorcode = ERR20;
				10238	goto HAD_CB_ERROR;
				10239	}
				10240
				10241	/* Compute the size of, and then get and initialize, the data block for storing
				10242	the compiled pattern and names table. Integer overflow should no longer be
				10243	possible because nowadays we limit the maximum value of cb.names_found and
				10244	cb.name_entry_size. */
				10245
				10246	re_blocksize = sizeof(pcre2_real_code) +
				10247	CU2BYTES(length +
				10248	(PCRE2_SIZE)cb.names_found * (PCRE2_SIZE)cb.name_entry_size);
				10249	re = (pcre2_real_code *)
				10250	ccontext->memctl.malloc(re_blocksize, ccontext->memctl.memory_data);
				10251	if (re == NULL)
				10252	{
				10253	errorcode = ERR21;
				10254	goto HAD_CB_ERROR;
				10255	}
				10256
				10257	/* The compiler may put padding at the end of the pcre2_real_code structure in
				10258	order to round it up to a multiple of 4 or 8 bytes. This means that when a
				10259	compiled pattern is copied (for example, when serialized) undefined bytes are
				10260	read, and this annoys debuggers such as valgrind. To avoid this, we explicitly
				10261	write to the last 8 bytes of the structure before setting the fields. */
				10262
				10263	memset((char *)re + sizeof(pcre2_real_code) - 8, 0, 8);
				10264	re->memctl = ccontext->memctl;
				10265	re->tables = tables;
				10266	re->executable_jit = NULL;
				10267	memset(re->start_bitmap, 0, 32 * sizeof(uint8_t));
				10268	re->blocksize = re_blocksize;
				10269	re->magic_number = MAGIC_NUMBER;
				10270	re->compile_options = options;
				10271	re->overall_options = cb.external_options;
				10272	re->extra_options = ccontext->extra_options;
				10273	re->flags = PCRE2_CODE_UNIT_WIDTH/8 \| cb.external_flags \| setflags;
				10274	re->limit_heap = limit_heap;
				10275	re->limit_match = limit_match;
				10276	re->limit_depth = limit_depth;
				10277	re->first_codeunit = 0;
				10278	re->last_codeunit = 0;
				10279	re->bsr_convention = bsr;
				10280	re->newline_convention = newline;
				10281	re->max_lookbehind = 0;
				10282	re->minlength = 0;
				10283	re->top_bracket = 0;
				10284	re->top_backref = 0;
				10285	re->name_entry_size = cb.name_entry_size;
				10286	re->name_count = cb.names_found;
				10287
				10288	/* The basic block is immediately followed by the name table, and the compiled
				10289	code follows after that. */
				10290
				10291	codestart = (PCRE2_SPTR)((uint8_t *)re + sizeof(pcre2_real_code)) +
				10292	re->name_entry_size * re->name_count;
				10293
				10294	/* Update the compile data block for the actual compile. The starting points of
				10295	the name/number translation table and of the code are passed around in the
				10296	compile data block. The start/end pattern and initial options are already set
				10297	from the pre-compile phase, as is the name_entry_size field. */
				10298
				10299	cb.parens_depth = 0;
				10300	cb.assert_depth = 0;
				10301	cb.lastcapture = 0;
				10302	cb.name_table = (PCRE2_UCHAR )((uint8_t )re + sizeof(pcre2_real_code));
				10303	cb.start_code = codestart;
				10304	cb.req_varyopt = 0;
				10305	cb.had_accept = FALSE;
				10306	cb.had_pruneorskip = FALSE;
				10307	cb.open_caps = NULL;
				10308
				10309	/* If any named groups were found, create the name/number table from the list
				10310	created in the pre-pass. */
				10311
				10312	if (cb.names_found > 0)
				10313	{
				10314	named_group *ng = cb.named_groups;
				10315	for (i = 0; i < cb.names_found; i++, ng++)
				10316	add_name_to_table(&cb, ng->name, ng->length, ng->number, i);
				10317	}
				10318
				10319	/* Set up a starting, non-extracting bracket, then compile the expression. On
				10320	error, errorcode will be set non-zero, so we don't need to look at the result
				10321	of the function here. */
				10322
				10323	pptr = cb.parsed_pattern;
				10324	code = (PCRE2_UCHAR *)codestart;
				10325	*code = OP_BRA;
				10326	regexrc = compile_regex(re->overall_options, &code, &pptr, &errorcode, 0,
				10327	&firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL, &cb, NULL);
				10328	if (regexrc < 0) re->flags \|= PCRE2_MATCH_EMPTY;
				10329	re->top_bracket = cb.bracount;
				10330	re->top_backref = cb.top_backref;
				10331	re->max_lookbehind = cb.max_lookbehind;
				10332
				10333	if (cb.had_accept)
				10334	{
				10335	reqcu = 0; /* Must disable after (*ACCEPT) */
				10336	reqcuflags = REQ_NONE;
				10337	re->flags \|= PCRE2_HASACCEPT; /* Disables minimum length */
				10338	}
				10339
				10340	/* Fill in the final opcode and check for disastrous overflow. If no overflow,
				10341	but the estimated length exceeds the really used length, adjust the value of
				10342	re->blocksize, and if valgrind support is configured, mark the extra allocated
				10343	memory as unaddressable, so that any out-of-bound reads can be detected. */
				10344
				10345	*code++ = OP_END;
				10346	usedlength = code - codestart;
				10347	if (usedlength > length) errorcode = ERR23; else
				10348	{
				10349	re->blocksize -= CU2BYTES(length - usedlength);
				10350	#ifdef SUPPORT_VALGRIND
				10351	VALGRIND_MAKE_MEM_NOACCESS(code, CU2BYTES(length - usedlength));
				10352	#endif
				10353	}
				10354
				10355	/* Scan the pattern for recursion/subroutine calls and convert the group
				10356	numbers into offsets. Maintain a small cache so that repeated groups containing
				10357	recursions are efficiently handled. */
				10358
				10359	#define RSCAN_CACHE_SIZE 8
				10360
				10361	if (errorcode == 0 && cb.had_recurse)
				10362	{
				10363	PCRE2_UCHAR *rcode;
				10364	PCRE2_SPTR rgroup;
				10365	unsigned int ccount = 0;
				10366	int start = RSCAN_CACHE_SIZE;
				10367	recurse_cache rc[RSCAN_CACHE_SIZE];
				10368
				10369	for (rcode = (PCRE2_UCHAR *)find_recurse(codestart, utf);
				10370	rcode != NULL;
				10371	rcode = (PCRE2_UCHAR *)find_recurse(rcode + 1 + LINK_SIZE, utf))
				10372	{
				10373	int p, groupnumber;
				10374
				10375	groupnumber = (int)GET(rcode, 1);
				10376	if (groupnumber == 0) rgroup = codestart; else
				10377	{
				10378	PCRE2_SPTR search_from = codestart;
				10379	rgroup = NULL;
				10380	for (i = 0, p = start; i < ccount; i++, p = (p + 1) & 7)
				10381	{
				10382	if (groupnumber == rc[p].groupnumber)
				10383	{
				10384	rgroup = rc[p].group;
				10385	break;
				10386	}
				10387
				10388	/* Group n+1 must always start to the right of group n, so we can save
				10389	search time below when the new group number is greater than any of the
				10390	previously found groups. */
				10391
				10392	if (groupnumber > rc[p].groupnumber) search_from = rc[p].group;
				10393	}
				10394
				10395	if (rgroup == NULL)
				10396	{
				10397	rgroup = PRIV(find_bracket)(search_from, utf, groupnumber);
				10398	if (rgroup == NULL)
				10399	{
				10400	errorcode = ERR53;
				10401	break;
				10402	}
				10403	if (--start < 0) start = RSCAN_CACHE_SIZE - 1;
				10404	rc[start].groupnumber = groupnumber;
				10405	rc[start].group = rgroup;
				10406	if (ccount < RSCAN_CACHE_SIZE) ccount++;
				10407	}
				10408	}
				10409
				10410	PUT(rcode, 1, rgroup - codestart);
				10411	}
				10412	}
				10413
				10414	/* In rare debugging situations we sometimes need to look at the compiled code
				10415	at this stage. */
				10416
				10417	#ifdef DEBUG_CALL_PRINTINT
				10418	pcre2_printint(re, stderr, TRUE);
				10419	fprintf(stderr, "Length=%lu Used=%lu\n", length, usedlength);
				10420	#endif
				10421
				10422	/* Unless disabled, check whether any single character iterators can be
				10423	auto-possessified. The function overwrites the appropriate opcode values, so
				10424	the type of the pointer must be cast. NOTE: the intermediate variable "temp" is
				10425	used in this code because at least one compiler gives a warning about loss of
				10426	"const" attribute if the cast (PCRE2_UCHAR *)codestart is used directly in the
				10427	function call. */
				10428
				10429	if (errorcode == 0 && (re->overall_options & PCRE2_NO_AUTO_POSSESS) == 0)
				10430	{
				10431	PCRE2_UCHAR temp = (PCRE2_UCHAR )codestart;
				10432	if (PRIV(auto_possessify)(temp, &cb) != 0) errorcode = ERR80;
				10433	}
				10434
				10435	/* Failed to compile, or error while post-processing. */
				10436
				10437	if (errorcode != 0) goto HAD_CB_ERROR;
				10438
				10439	/* Successful compile. If the anchored option was not passed, set it if
				10440	we can determine that the pattern is anchored by virtue of ^ characters or \A
				10441	or anything else, such as starting with non-atomic .* when DOTALL is set and
				10442	there are no occurrences of *PRUNE or *SKIP (though there is an option to
				10443	disable this case). */
				10444
				10445	if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
				10446	is_anchored(codestart, 0, &cb, 0, FALSE))
				10447	re->overall_options \|= PCRE2_ANCHORED;
				10448
				10449	/* Set up the first code unit or startline flag, the required code unit, and
				10450	then study the pattern. This code need not be obeyed if PCRE2_NO_START_OPTIMIZE
				10451	is set, as the data it would create will not be used. Note that a first code
				10452	unit (but not the startline flag) is useful for anchored patterns because it
				10453	can still give a quick "no match" and also avoid searching for a last code
				10454	unit. */
				10455
				10456	if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
				10457	{
				10458	int minminlength = 0; /* For minimal minlength from first/required CU */
				10459
				10460	/* If we do not have a first code unit, see if there is one that is asserted
				10461	(these are not saved during the compile because they can cause conflicts with
				10462	actual literals that follow). */
				10463
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	10464	if (firstcuflags >= REQ_NONE)
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	10465	firstcu = find_firstassertedcu(codestart, &firstcuflags, 0);
				10466
				10467	/* Save the data for a first code unit. The existence of one means the
				10468	minimum length must be at least 1. */
				10469
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	10470	if (firstcuflags < REQ_NONE)
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	10471	{
				10472	re->first_codeunit = firstcu;
				10473	re->flags \|= PCRE2_FIRSTSET;
				10474	minminlength++;
				10475
				10476	/* Handle caseless first code units. */
				10477
				10478	if ((firstcuflags & REQ_CASELESS) != 0)
				10479	{
				10480	if (firstcu < 128 \|\| (!utf && !ucp && firstcu < 255))
				10481	{
				10482	if (cb.fcc[firstcu] != firstcu) re->flags \|= PCRE2_FIRSTCASELESS;
				10483	}
				10484
				10485	/* The first code unit is > 128 in UTF or UCP mode, or > 255 otherwise.
				10486	In 8-bit UTF mode, codepoints in the range 128-255 are introductory code
				10487	points and cannot have another case, but if UCP is set they may do. */
				10488
				10489	#ifdef SUPPORT_UNICODE
				10490	#if PCRE2_CODE_UNIT_WIDTH == 8
				10491	else if (ucp && !utf && UCD_OTHERCASE(firstcu) != firstcu)
				10492	re->flags \|= PCRE2_FIRSTCASELESS;
				10493	#else
				10494	else if ((utf \|\| ucp) && firstcu <= MAX_UTF_CODE_POINT &&
				10495	UCD_OTHERCASE(firstcu) != firstcu)
				10496	re->flags \|= PCRE2_FIRSTCASELESS;
				10497	#endif
				10498	#endif /* SUPPORT_UNICODE */
				10499	}
				10500	}
				10501
				10502	/* When there is no first code unit, for non-anchored patterns, see if we can
				10503	set the PCRE2_STARTLINE flag. This is helpful for multiline matches when all
				10504	branches start with ^ and also when all branches start with non-atomic .* for
				10505	non-DOTALL matches when *PRUNE and SKIP are not present. (There is an option
				10506	that disables this case.) */
				10507
				10508	else if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
				10509	is_startline(codestart, 0, &cb, 0, FALSE))
				10510	re->flags \|= PCRE2_STARTLINE;
				10511
				10512	/* Handle the "required code unit", if one is set. In the UTF case we can
				10513	increment the minimum minimum length only if we are sure this really is a
				10514	different character and not a non-starting code unit of the first character,
				10515	because the minimum length count is in characters, not code units. */
				10516
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	10517	if (reqcuflags < REQ_NONE)
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	10518	{
				10519	#if PCRE2_CODE_UNIT_WIDTH == 16
				10520	if ((re->overall_options & PCRE2_UTF) == 0 \|\| /* Not UTF */
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	10521	firstcuflags >= REQ_NONE \|\| /* First not set */
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	10522	(firstcu & 0xf800) != 0xd800 \|\| /* First not surrogate */
				10523	(reqcu & 0xfc00) != 0xdc00) /* Req not low surrogate */
				10524	#elif PCRE2_CODE_UNIT_WIDTH == 8
				10525	if ((re->overall_options & PCRE2_UTF) == 0 \|\| /* Not UTF */
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	10526	firstcuflags >= REQ_NONE \|\| /* First not set */
Elliott Hughes	5b80804	2021-10-01 10:56:10 -0700	[diff] [blame]	10527	(firstcu & 0x80) == 0 \|\| /* First is ASCII */
				10528	(reqcu & 0x80) == 0) /* Req is ASCII */
				10529	#endif
				10530	{
				10531	minminlength++;
				10532	}
				10533
				10534	/* In the case of an anchored pattern, set up the value only if it follows
				10535	a variable length item in the pattern. */
				10536
				10537	if ((re->overall_options & PCRE2_ANCHORED) == 0 \|\|
				10538	(reqcuflags & REQ_VARY) != 0)
				10539	{
				10540	re->last_codeunit = reqcu;
				10541	re->flags \|= PCRE2_LASTSET;
				10542
				10543	/* Handle caseless required code units as for first code units (above). */
				10544
				10545	if ((reqcuflags & REQ_CASELESS) != 0)
				10546	{
				10547	if (reqcu < 128 \|\| (!utf && !ucp && reqcu < 255))
				10548	{
				10549	if (cb.fcc[reqcu] != reqcu) re->flags \|= PCRE2_LASTCASELESS;
				10550	}
				10551	#ifdef SUPPORT_UNICODE
				10552	#if PCRE2_CODE_UNIT_WIDTH == 8
				10553	else if (ucp && !utf && UCD_OTHERCASE(reqcu) != reqcu)
				10554	re->flags \|= PCRE2_LASTCASELESS;
				10555	#else
				10556	else if ((utf \|\| ucp) && reqcu <= MAX_UTF_CODE_POINT &&
				10557	UCD_OTHERCASE(reqcu) != reqcu)
				10558	re->flags \|= PCRE2_LASTCASELESS;
				10559	#endif
				10560	#endif /* SUPPORT_UNICODE */
				10561	}
				10562	}
				10563	}
				10564
				10565	/* Study the compiled pattern to set up information such as a bitmap of
				10566	starting code units and a minimum matching length. */
				10567
				10568	if (PRIV(study)(re) != 0)
				10569	{
				10570	errorcode = ERR31;
				10571	goto HAD_CB_ERROR;
				10572	}
				10573
				10574	/* If study() set a bitmap of starting code units, it implies a minimum
				10575	length of at least one. */
				10576
				10577	if ((re->flags & PCRE2_FIRSTMAPSET) != 0 && minminlength == 0)
				10578	minminlength = 1;
				10579
				10580	/* If the minimum length set (or not set) by study() is less than the minimum
				10581	implied by required code units, override it. */
				10582
				10583	if (re->minlength < minminlength) re->minlength = minminlength;
				10584	} /* End of start-of-match optimizations. */
				10585
				10586	/* Control ends up here in all cases. When running under valgrind, make a
				10587	pattern's terminating zero defined again. If memory was obtained for the parsed
				10588	version of the pattern, free it before returning. Also free the list of named
				10589	groups if a larger one had to be obtained, and likewise the group information
				10590	vector. */
				10591
				10592	EXIT:
				10593	#ifdef SUPPORT_VALGRIND
				10594	if (zero_terminated) VALGRIND_MAKE_MEM_DEFINED(pattern + patlen, CU2BYTES(1));
				10595	#endif
				10596	if (cb.parsed_pattern != stack_parsed_pattern)
				10597	ccontext->memctl.free(cb.parsed_pattern, ccontext->memctl.memory_data);
				10598	if (cb.named_group_list_size > NAMED_GROUP_LIST_SIZE)
				10599	ccontext->memctl.free((void *)cb.named_groups, ccontext->memctl.memory_data);
				10600	if (cb.groupinfo != stack_groupinfo)
				10601	ccontext->memctl.free((void *)cb.groupinfo, ccontext->memctl.memory_data);
				10602	return re; /* Will be NULL after an error */
				10603
				10604	/* Errors discovered in parse_regex() set the offset value in the compile
				10605	block. Errors discovered before it is called must compute it from the ptr
				10606	value. After parse_regex() is called, the offset in the compile block is set to
				10607	the end of the pattern, but certain errors in compile_regex() may reset it if
				10608	an offset is available in the parsed pattern. */
				10609
				10610	HAD_CB_ERROR:
				10611	ptr = pattern + cb.erroroffset;
				10612
				10613	HAD_EARLY_ERROR:
				10614	*erroroffset = ptr - pattern;
				10615
				10616	HAD_ERROR:
				10617	*errorptr = errorcode;
				10618	pcre2_code_free(re);
				10619	re = NULL;
				10620	goto EXIT;
				10621	}
				10622
				10623	/* End of pcre2_compile.c */