Blame - src/pcre2_study.c - platform/external/pcre

blob: 4db3ad11842347e76755eee465d7e5f832307c87 [file] [log] [blame]

Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	1	/*************************************************
				2	* Perl-Compatible Regular Expressions *
				3	*************************************************/
				4
				5	/* PCRE is a library of functions to support regular expressions whose syntax
				6	and semantics are as close as possible to those of the Perl 5 language.
				7
				8	Written by Philip Hazel
				9	Original API code Copyright (c) 1997-2012 University of Cambridge
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	10	New API code Copyright (c) 2016-2021 University of Cambridge
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	11
				12	-----------------------------------------------------------------------------
				13	Redistribution and use in source and binary forms, with or without
				14	modification, are permitted provided that the following conditions are met:
				15
				16	* Redistributions of source code must retain the above copyright notice,
				17	this list of conditions and the following disclaimer.
				18
				19	* Redistributions in binary form must reproduce the above copyright
				20	notice, this list of conditions and the following disclaimer in the
				21	documentation and/or other materials provided with the distribution.
				22
				23	* Neither the name of the University of Cambridge nor the names of its
				24	contributors may be used to endorse or promote products derived from
				25	this software without specific prior written permission.
				26
				27	THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
				28	AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
				29	IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
				30	ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
				31	LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
				32	CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
				33	SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
				34	INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
				35	CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
				36	ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
				37	POSSIBILITY OF SUCH DAMAGE.
				38	-----------------------------------------------------------------------------
				39	*/
				40
				41	/* This module contains functions for scanning a compiled pattern and
				42	collecting data (e.g. minimum matching length). */
				43
				44
				45	#ifdef HAVE_CONFIG_H
				46	#include "config.h"
				47	#endif
				48
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	49	#include "pcre2_internal.h"
				50
Elliott Hughes	9bc971b	2018-07-27 13:23:14 -0700	[diff] [blame]	51	/* The maximum remembered capturing brackets minimum. */
				52
				53	#define MAX_CACHE_BACKREF 128
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	54
				55	/* Set a bit in the starting code unit bit map. */
				56
Elliott Hughes	0c26e19	2019-08-07 12:24:46 -0700	[diff] [blame]	57	#define SET_BIT(c) re->start_bitmap[(c)/8] |= (1u << ((c)&7))
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	58
				59	/* Returns from set_start_bits() */
				60
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	61	enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN, SSB_TOODEEP };
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	62
				63
				64	/*************************************************
				65	* Find the minimum subject length for a group *
				66	*************************************************/
				67
				68	/* Scan a parenthesized group and compute the minimum length of subject that
				69	is needed to match it. This is a lower bound; it does not mean there is a
				70	string of that length that matches. In UTF mode, the result is in characters
				71	rather than code units. The field in a compiled pattern for storing the minimum
				72	length is 16-bits long (on the grounds that anything longer than that is
				73	pathological), so we give up when we reach that amount. This also means that
				74	integer overflow for really crazy patterns cannot happen.
				75
Elliott Hughes	9bc971b	2018-07-27 13:23:14 -0700	[diff] [blame]	76	Backreference minimum lengths are cached to speed up multiple references. This
				77	function is called only when the highest back reference in the pattern is less
				78	than or equal to MAX_CACHE_BACKREF, which is one less than the size of the
				79	caching vector. The zeroth element contains the number of the highest set
				80	value.
				81
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	82	Arguments:
				83	re compiled pattern block
				84	code pointer to start of group (the bracket)
				85	startcode pointer to start of the whole pattern's code
				86	utf UTF flag
				87	recurses chain of recurse_check to catch mutual recursion
				88	countptr pointer to call count (to catch over complexity)
Elliott Hughes	9bc971b	2018-07-27 13:23:14 -0700	[diff] [blame]	89	backref_cache vector for caching back references.
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	90
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	91	This function is no longer called when the pattern contains (*ACCEPT); however,
				92	the old code for returning -1 is retained, just in case.
				93
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	94	Returns: the minimum length
				95	-1 \C in UTF-8 mode
				96	or (*ACCEPT)
				97	or pattern too complicated
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	98	-2 internal error (missing capturing bracket)
				99	-3 internal error (opcode not listed)
				100	*/
				101
				102	static int
				103	find_minlength(const pcre2_real_code *re, PCRE2_SPTR code,
Elliott Hughes	9bc971b	2018-07-27 13:23:14 -0700	[diff] [blame]	104	PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses, int *countptr,
				105	int *backref_cache)
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	106	{
				107	int length = -1;
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	108	int branchlength = 0;
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	109	int prev_cap_recno = -1;
				110	int prev_cap_d = 0;
				111	int prev_recurse_recno = -1;
				112	int prev_recurse_d = 0;
				113	uint32_t once_fudge = 0;
				114	BOOL had_recurse = FALSE;
				115	BOOL dupcapused = (re->flags & PCRE2_DUPCAPUSED) != 0;
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	116	PCRE2_SPTR nextbranch = code + GET(code, 1);
Elliott Hughes	9bc971b	2018-07-27 13:23:14 -0700	[diff] [blame]	117	PCRE2_UCHAR *cc = (PCRE2_UCHAR *)code + 1 + LINK_SIZE;
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	118	recurse_check this_recurse;
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	119
				120	/* If this is a "could be empty" group, its minimum length is 0. */
				121
				122	if (*code >= OP_SBRA && *code <= OP_SCOND) return 0;
				123
				124	/* Skip over capturing bracket number */
				125
				126	if (*code == OP_CBRA || *code == OP_CBRAPOS) cc += IMM2_SIZE;
				127
				128	/* A large and/or complex regex can take too long to process. */
				129
				130	if ((*countptr)++ > 1000) return -1;
				131
				132	/* Scan along the opcodes for this branch. If we get to the end of the branch,
				133	check the length against that of the other branches. If the accumulated length
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	134	passes 16-bits, reset to that value and skip the rest of the branch. */
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	135
				136	for (;;)
				137	{
				138	int d, min, recno;
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	139	PCRE2_UCHAR op, *cs, *ce;
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	140
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	141	if (branchlength >= UINT16_MAX)
				142	{
				143	branchlength = UINT16_MAX;
				144	cc = (PCRE2_UCHAR *)nextbranch;
				145	}
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	146
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	147	op = *cc;
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	148	switch (op)
				149	{
				150	case OP_COND:
				151	case OP_SCOND:
				152
				153	/* If there is only one branch in a condition, the implied branch has zero
				154	length, so we don't add anything. This covers the DEFINE "condition"
				155	automatically. If there are two branches we can treat it the same as any
				156	other non-capturing subpattern. */
				157
				158	cs = cc + GET(cc, 1);
				159	if (*cs != OP_ALT)
				160	{
				161	cc = cs + 1 + LINK_SIZE;
				162	break;
				163	}
				164	goto PROCESS_NON_CAPTURE;
				165
Elliott Hughes	9bc971b	2018-07-27 13:23:14 -0700	[diff] [blame]	166	case OP_BRA:
				167	/* There's a special case of OP_BRA, when it is wrapped round a repeated
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	168	OP_RECURSE. We'd like to process the latter at this level so that
				169	remembering the value works for repeated cases. So we do nothing, but
				170	set a fudge value to skip over the OP_KET after the recurse. */
				171
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	172	if (cc[1+LINK_SIZE] == OP_RECURSE && cc[2*(1+LINK_SIZE)] == OP_KET)
				173	{
				174	once_fudge = 1 + LINK_SIZE;
				175	cc += 1 + LINK_SIZE;
				176	break;
				177	}
				178	/* Fall through */
				179
Elliott Hughes	9bc971b	2018-07-27 13:23:14 -0700	[diff] [blame]	180	case OP_ONCE:
Elliott Hughes	0c26e19	2019-08-07 12:24:46 -0700	[diff] [blame]	181	case OP_SCRIPT_RUN:
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	182	case OP_SBRA:
				183	case OP_BRAPOS:
				184	case OP_SBRAPOS:
				185	PROCESS_NON_CAPTURE:
Elliott Hughes	9bc971b	2018-07-27 13:23:14 -0700	[diff] [blame]	186	d = find_minlength(re, cc, startcode, utf, recurses, countptr,
				187	backref_cache);
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	188	if (d < 0) return d;
				189	branchlength += d;
				190	do cc += GET(cc, 1); while (*cc == OP_ALT);
				191	cc += 1 + LINK_SIZE;
				192	break;
				193
				194	/* To save time for repeated capturing subpatterns, we remember the
				195	length of the previous one. Unfortunately we can't do the same for
				196	the unnumbered ones above. Nor can we do this if (?| is present in the
				197	pattern because captures with the same number are not then identical. */
				198
				199	case OP_CBRA:
				200	case OP_SCBRA:
				201	case OP_CBRAPOS:
				202	case OP_SCBRAPOS:
Elliott Hughes	9bc971b	2018-07-27 13:23:14 -0700	[diff] [blame]	203	recno = (int)GET2(cc, 1+LINK_SIZE);
				204	if (dupcapused || recno != prev_cap_recno)
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	205	{
				206	prev_cap_recno = recno;
Elliott Hughes	9bc971b	2018-07-27 13:23:14 -0700	[diff] [blame]	207	prev_cap_d = find_minlength(re, cc, startcode, utf, recurses, countptr,
				208	backref_cache);
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	209	if (prev_cap_d < 0) return prev_cap_d;
				210	}
				211	branchlength += prev_cap_d;
				212	do cc += GET(cc, 1); while (*cc == OP_ALT);
				213	cc += 1 + LINK_SIZE;
				214	break;
				215
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	216	/* ACCEPT makes things far too complicated; we have to give up. In fact,
				217	from 10.34 onwards, if a pattern contains (*ACCEPT), this function is not
				218	used. However, leave the code in place, just in case. */
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	219
				220	case OP_ACCEPT:
				221	case OP_ASSERT_ACCEPT:
				222	return -1;
				223
				224	/* Reached end of a branch; if it's a ket it is the end of a nested
				225	call. If it's ALT it is an alternation in a nested call. If it is END it's
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	226	the end of the outer call. All can be handled by the same code. If the
				227	length of any branch is zero, there is no need to scan any subsequent
				228	branches. */
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	229
				230	case OP_ALT:
				231	case OP_KET:
				232	case OP_KETRMAX:
				233	case OP_KETRMIN:
				234	case OP_KETRPOS:
				235	case OP_END:
				236	if (length < 0 || (!had_recurse && branchlength < length))
				237	length = branchlength;
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	238	if (op != OP_ALT || length == 0) return length;
				239	nextbranch = cc + GET(cc, 1);
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	240	cc += 1 + LINK_SIZE;
				241	branchlength = 0;
				242	had_recurse = FALSE;
				243	break;
				244
				245	/* Skip over assertive subpatterns */
				246
				247	case OP_ASSERT:
				248	case OP_ASSERT_NOT:
				249	case OP_ASSERTBACK:
				250	case OP_ASSERTBACK_NOT:
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	251	case OP_ASSERT_NA:
				252	case OP_ASSERTBACK_NA:
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	253	do cc += GET(cc, 1); while (*cc == OP_ALT);
				254	/* Fall through */
				255
				256	/* Skip over things that don't match chars */
				257
				258	case OP_REVERSE:
				259	case OP_CREF:
				260	case OP_DNCREF:
				261	case OP_RREF:
				262	case OP_DNRREF:
				263	case OP_FALSE:
				264	case OP_TRUE:
				265	case OP_CALLOUT:
				266	case OP_SOD:
				267	case OP_SOM:
				268	case OP_EOD:
				269	case OP_EODN:
				270	case OP_CIRC:
				271	case OP_CIRCM:
				272	case OP_DOLL:
				273	case OP_DOLLM:
				274	case OP_NOT_WORD_BOUNDARY:
				275	case OP_WORD_BOUNDARY:
				276	cc += PRIV(OP_lengths)[*cc];
				277	break;
				278
				279	case OP_CALLOUT_STR:
				280	cc += GET(cc, 1 + 2*LINK_SIZE);
				281	break;
				282
				283	/* Skip over a subpattern that has a {0} or {0,x} quantifier */
				284
				285	case OP_BRAZERO:
				286	case OP_BRAMINZERO:
				287	case OP_BRAPOSZERO:
				288	case OP_SKIPZERO:
				289	cc += PRIV(OP_lengths)[*cc];
				290	do cc += GET(cc, 1); while (*cc == OP_ALT);
				291	cc += 1 + LINK_SIZE;
				292	break;
				293
				294	/* Handle literal characters and + repetitions */
				295
				296	case OP_CHAR:
				297	case OP_CHARI:
				298	case OP_NOT:
				299	case OP_NOTI:
				300	case OP_PLUS:
				301	case OP_PLUSI:
				302	case OP_MINPLUS:
				303	case OP_MINPLUSI:
				304	case OP_POSPLUS:
				305	case OP_POSPLUSI:
				306	case OP_NOTPLUS:
				307	case OP_NOTPLUSI:
				308	case OP_NOTMINPLUS:
				309	case OP_NOTMINPLUSI:
				310	case OP_NOTPOSPLUS:
				311	case OP_NOTPOSPLUSI:
				312	branchlength++;
				313	cc += 2;
				314	#ifdef SUPPORT_UNICODE
				315	if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
				316	#endif
				317	break;
				318
				319	case OP_TYPEPLUS:
				320	case OP_TYPEMINPLUS:
				321	case OP_TYPEPOSPLUS:
				322	branchlength++;
				323	cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
				324	break;
				325
				326	/* Handle exact repetitions. The count is already in characters, but we
				327	may need to skip over a multibyte character in UTF mode. */
				328
				329	case OP_EXACT:
				330	case OP_EXACTI:
				331	case OP_NOTEXACT:
				332	case OP_NOTEXACTI:
				333	branchlength += GET2(cc,1);
				334	cc += 2 + IMM2_SIZE;
				335	#ifdef SUPPORT_UNICODE
				336	if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
				337	#endif
				338	break;
				339
				340	case OP_TYPEEXACT:
				341	branchlength += GET2(cc,1);
				342	cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP
				343	|| cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
				344	break;
				345
				346	/* Handle single-char non-literal matchers */
				347
				348	case OP_PROP:
				349	case OP_NOTPROP:
				350	cc += 2;
				351	/* Fall through */
				352
				353	case OP_NOT_DIGIT:
				354	case OP_DIGIT:
				355	case OP_NOT_WHITESPACE:
				356	case OP_WHITESPACE:
				357	case OP_NOT_WORDCHAR:
				358	case OP_WORDCHAR:
				359	case OP_ANY:
				360	case OP_ALLANY:
				361	case OP_EXTUNI:
				362	case OP_HSPACE:
				363	case OP_NOT_HSPACE:
				364	case OP_VSPACE:
				365	case OP_NOT_VSPACE:
				366	branchlength++;
				367	cc++;
				368	break;
				369
				370	/* "Any newline" might match two characters, but it also might match just
				371	one. */
				372
				373	case OP_ANYNL:
				374	branchlength += 1;
				375	cc++;
				376	break;
				377
				378	/* The single-byte matcher means we can't proceed in UTF mode. (In
				379	non-UTF mode \C will actually be turned into OP_ALLANY, so won't ever
				380	appear, but leave the code, just in case.) */
				381
				382	case OP_ANYBYTE:
				383	#ifdef SUPPORT_UNICODE
				384	if (utf) return -1;
				385	#endif
				386	branchlength++;
				387	cc++;
				388	break;
				389
				390	/* For repeated character types, we have to test for \p and \P, which have
				391	an extra two bytes of parameters. */
				392
				393	case OP_TYPESTAR:
				394	case OP_TYPEMINSTAR:
				395	case OP_TYPEQUERY:
				396	case OP_TYPEMINQUERY:
				397	case OP_TYPEPOSSTAR:
				398	case OP_TYPEPOSQUERY:
				399	if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
				400	cc += PRIV(OP_lengths)[op];
				401	break;
				402
				403	case OP_TYPEUPTO:
				404	case OP_TYPEMINUPTO:
				405	case OP_TYPEPOSUPTO:
				406	if (cc[1 + IMM2_SIZE] == OP_PROP
				407	|| cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
				408	cc += PRIV(OP_lengths)[op];
				409	break;
				410
				411	/* Check a class for variable quantification */
				412
				413	case OP_CLASS:
				414	case OP_NCLASS:
				415	#ifdef SUPPORT_WIDE_CHARS
				416	case OP_XCLASS:
				417	/* The original code caused an unsigned overflow in 64 bit systems,
				418	so now we use a conditional statement. */
				419	if (op == OP_XCLASS)
				420	cc += GET(cc, 1);
				421	else
				422	cc += PRIV(OP_lengths)[OP_CLASS];
				423	#else
				424	cc += PRIV(OP_lengths)[OP_CLASS];
				425	#endif
				426
				427	switch (*cc)
				428	{
				429	case OP_CRPLUS:
				430	case OP_CRMINPLUS:
				431	case OP_CRPOSPLUS:
				432	branchlength++;
				433	/* Fall through */
				434
				435	case OP_CRSTAR:
				436	case OP_CRMINSTAR:
				437	case OP_CRQUERY:
				438	case OP_CRMINQUERY:
				439	case OP_CRPOSSTAR:
				440	case OP_CRPOSQUERY:
				441	cc++;
				442	break;
				443
				444	case OP_CRRANGE:
				445	case OP_CRMINRANGE:
				446	case OP_CRPOSRANGE:
				447	branchlength += GET2(cc,1);
				448	cc += 1 + 2 * IMM2_SIZE;
				449	break;
				450
				451	default:
				452	branchlength++;
				453	break;
				454	}
				455	break;
				456
				457	/* Backreferences and subroutine calls (OP_RECURSE) are treated in the same
				458	way: we find the minimum length for the subpattern. A recursion
				459	(backreference or subroutine) causes a flag to be set that causes the
				460	length of this branch to be ignored. The logic is that a recursion can only
				461	make sense if there is another alternative that stops the recursing. That
				462	will provide the minimum length (when no recursion happens).
				463
				464	If PCRE2_MATCH_UNSET_BACKREF is set, a backreference to an unset bracket
				465	matches an empty string (by default it causes a matching failure), so in
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	466	that case we must set the minimum length to zero.
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	467
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	468	For backreferences, if duplicate numbers are present in the pattern we check
				469	for a reference to a duplicate. If it is, we don't know which version will
				470	be referenced, so we have to set the minimum length to zero. */
				471
				472	/* Duplicate named pattern back reference. */
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	473
				474	case OP_DNREF:
				475	case OP_DNREFI:
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	476	if (!dupcapused && (re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	477	{
				478	int count = GET2(cc, 1+IMM2_SIZE);
				479	PCRE2_UCHAR *slot =
				480	(PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
				481	GET2(cc, 1) * re->name_entry_size;
				482
				483	d = INT_MAX;
				484
Elliott Hughes	9bc971b	2018-07-27 13:23:14 -0700	[diff] [blame]	485	/* Scan all groups with the same name; find the shortest. */
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	486
				487	while (count-- > 0)
				488	{
Elliott Hughes	9bc971b	2018-07-27 13:23:14 -0700	[diff] [blame]	489	int dd, i;
				490	recno = GET2(slot, 0);
				491
				492	if (recno <= backref_cache[0] && backref_cache[recno] >= 0)
				493	dd = backref_cache[recno];
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	494	else
				495	{
Elliott Hughes	9bc971b	2018-07-27 13:23:14 -0700	[diff] [blame]	496	ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, recno);
				497	if (cs == NULL) return -2;
				498	do ce += GET(ce, 1); while (*ce == OP_ALT);
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	499
				500	dd = 0;
				501	if (!dupcapused ||
				502	(PCRE2_UCHAR *)PRIV(find_bracket)(ce, utf, recno) == NULL)
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	503	{
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	504	if (cc > cs && cc < ce) /* Simple recursion */
Elliott Hughes	9bc971b	2018-07-27 13:23:14 -0700	[diff] [blame]	505	{
Elliott Hughes	9bc971b	2018-07-27 13:23:14 -0700	[diff] [blame]	506	had_recurse = TRUE;
				507	}
				508	else
				509	{
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	510	recurse_check *r = recurses;
				511	for (r = recurses; r != NULL; r = r->prev)
				512	if (r->group == cs) break;
				513	if (r != NULL) /* Mutual recursion */
				514	{
				515	had_recurse = TRUE;
				516	}
				517	else
				518	{
				519	this_recurse.prev = recurses; /* No recursion */
				520	this_recurse.group = cs;
				521	dd = find_minlength(re, cs, startcode, utf, &this_recurse,
				522	countptr, backref_cache);
				523	if (dd < 0) return dd;
				524	}
Elliott Hughes	9bc971b	2018-07-27 13:23:14 -0700	[diff] [blame]	525	}
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	526	}
Elliott Hughes	9bc971b	2018-07-27 13:23:14 -0700	[diff] [blame]	527
				528	backref_cache[recno] = dd;
				529	for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1;
				530	backref_cache[0] = recno;
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	531	}
Elliott Hughes	9bc971b	2018-07-27 13:23:14 -0700	[diff] [blame]	532
				533	if (dd < d) d = dd;
				534	if (d <= 0) break; /* No point looking at any more */
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	535	slot += re->name_entry_size;
				536	}
				537	}
				538	else d = 0;
				539	cc += 1 + 2*IMM2_SIZE;
				540	goto REPEAT_BACK_REFERENCE;
				541
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	542	/* Single back reference by number. References by name are converted to by
				543	number when there is no duplication. */
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	544
				545	case OP_REF:
				546	case OP_REFI:
Elliott Hughes	9bc971b	2018-07-27 13:23:14 -0700	[diff] [blame]	547	recno = GET2(cc, 1);
				548	if (recno <= backref_cache[0] && backref_cache[recno] >= 0)
				549	d = backref_cache[recno];
				550	else
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	551	{
Elliott Hughes	9bc971b	2018-07-27 13:23:14 -0700	[diff] [blame]	552	int i;
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	553	d = 0;
				554
Elliott Hughes	9bc971b	2018-07-27 13:23:14 -0700	[diff] [blame]	555	if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	556	{
Elliott Hughes	9bc971b	2018-07-27 13:23:14 -0700	[diff] [blame]	557	ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, recno);
				558	if (cs == NULL) return -2;
				559	do ce += GET(ce, 1); while (*ce == OP_ALT);
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	560
				561	if (!dupcapused ||
				562	(PCRE2_UCHAR *)PRIV(find_bracket)(ce, utf, recno) == NULL)
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	563	{
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	564	if (cc > cs && cc < ce) /* Simple recursion */
Elliott Hughes	9bc971b	2018-07-27 13:23:14 -0700	[diff] [blame]	565	{
Elliott Hughes	9bc971b	2018-07-27 13:23:14 -0700	[diff] [blame]	566	had_recurse = TRUE;
				567	}
				568	else
				569	{
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	570	recurse_check *r = recurses;
				571	for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
				572	if (r != NULL) /* Mutual recursion */
				573	{
				574	had_recurse = TRUE;
				575	}
				576	else /* No recursion */
				577	{
				578	this_recurse.prev = recurses;
				579	this_recurse.group = cs;
				580	d = find_minlength(re, cs, startcode, utf, &this_recurse, countptr,
				581	backref_cache);
				582	if (d < 0) return d;
				583	}
Elliott Hughes	9bc971b	2018-07-27 13:23:14 -0700	[diff] [blame]	584	}
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	585	}
				586	}
Elliott Hughes	9bc971b	2018-07-27 13:23:14 -0700	[diff] [blame]	587
				588	backref_cache[recno] = d;
				589	for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1;
				590	backref_cache[0] = recno;
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	591	}
Elliott Hughes	9bc971b	2018-07-27 13:23:14 -0700	[diff] [blame]	592
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	593	cc += 1 + IMM2_SIZE;
				594
				595	/* Handle repeated back references */
				596
				597	REPEAT_BACK_REFERENCE:
				598	switch (*cc)
				599	{
				600	case OP_CRSTAR:
				601	case OP_CRMINSTAR:
				602	case OP_CRQUERY:
				603	case OP_CRMINQUERY:
				604	case OP_CRPOSSTAR:
				605	case OP_CRPOSQUERY:
				606	min = 0;
				607	cc++;
				608	break;
				609
				610	case OP_CRPLUS:
				611	case OP_CRMINPLUS:
				612	case OP_CRPOSPLUS:
				613	min = 1;
				614	cc++;
				615	break;
				616
				617	case OP_CRRANGE:
				618	case OP_CRMINRANGE:
				619	case OP_CRPOSRANGE:
				620	min = GET2(cc, 1);
				621	cc += 1 + 2 * IMM2_SIZE;
				622	break;
				623
				624	default:
				625	min = 1;
				626	break;
				627	}
				628
				629	/* Take care not to overflow: (1) min and d are ints, so check that their
				630	product is not greater than INT_MAX. (2) branchlength is limited to
				631	UINT16_MAX (checked at the top of the loop). */
				632
				633	if ((d > 0 && (INT_MAX/d) < min) || UINT16_MAX - branchlength < min*d)
				634	branchlength = UINT16_MAX;
				635	else branchlength += min * d;
				636	break;
				637
				638	/* Recursion always refers to the first occurrence of a subpattern with a
				639	given number. Therefore, we can always make use of caching, even when the
				640	pattern contains multiple subpatterns with the same number. */
				641
				642	case OP_RECURSE:
				643	cs = ce = (PCRE2_UCHAR *)startcode + GET(cc, 1);
				644	recno = GET2(cs, 1+LINK_SIZE);
				645	if (recno == prev_recurse_recno)
				646	{
				647	branchlength += prev_recurse_d;
				648	}
				649	else
				650	{
				651	do ce += GET(ce, 1); while (*ce == OP_ALT);
				652	if (cc > cs && cc < ce) /* Simple recursion */
				653	had_recurse = TRUE;
				654	else
				655	{
				656	recurse_check *r = recurses;
				657	for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
				658	if (r != NULL) /* Mutual recursion */
				659	had_recurse = TRUE;
				660	else
				661	{
				662	this_recurse.prev = recurses;
				663	this_recurse.group = cs;
				664	prev_recurse_d = find_minlength(re, cs, startcode, utf, &this_recurse,
Elliott Hughes	9bc971b	2018-07-27 13:23:14 -0700	[diff] [blame]	665	countptr, backref_cache);
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	666	if (prev_recurse_d < 0) return prev_recurse_d;
				667	prev_recurse_recno = recno;
				668	branchlength += prev_recurse_d;
				669	}
				670	}
				671	}
				672	cc += 1 + LINK_SIZE + once_fudge;
				673	once_fudge = 0;
				674	break;
				675
				676	/* Anything else does not or need not match a character. We can get the
				677	item's length from the table, but for those that can match zero occurrences
				678	of a character, we must take special action for UTF-8 characters. As it
				679	happens, the "NOT" versions of these opcodes are used at present only for
				680	ASCII characters, so they could be omitted from this list. However, in
				681	future that may change, so we include them here so as not to leave a
				682	gotcha for a future maintainer. */
				683
				684	case OP_UPTO:
				685	case OP_UPTOI:
				686	case OP_NOTUPTO:
				687	case OP_NOTUPTOI:
				688	case OP_MINUPTO:
				689	case OP_MINUPTOI:
				690	case OP_NOTMINUPTO:
				691	case OP_NOTMINUPTOI:
				692	case OP_POSUPTO:
				693	case OP_POSUPTOI:
				694	case OP_NOTPOSUPTO:
				695	case OP_NOTPOSUPTOI:
				696
				697	case OP_STAR:
				698	case OP_STARI:
				699	case OP_NOTSTAR:
				700	case OP_NOTSTARI:
				701	case OP_MINSTAR:
				702	case OP_MINSTARI:
				703	case OP_NOTMINSTAR:
				704	case OP_NOTMINSTARI:
				705	case OP_POSSTAR:
				706	case OP_POSSTARI:
				707	case OP_NOTPOSSTAR:
				708	case OP_NOTPOSSTARI:
				709
				710	case OP_QUERY:
				711	case OP_QUERYI:
				712	case OP_NOTQUERY:
				713	case OP_NOTQUERYI:
				714	case OP_MINQUERY:
				715	case OP_MINQUERYI:
				716	case OP_NOTMINQUERY:
				717	case OP_NOTMINQUERYI:
				718	case OP_POSQUERY:
				719	case OP_POSQUERYI:
				720	case OP_NOTPOSQUERY:
				721	case OP_NOTPOSQUERYI:
				722
				723	cc += PRIV(OP_lengths)[op];
				724	#ifdef SUPPORT_UNICODE
				725	if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
				726	#endif
				727	break;
				728
				729	/* Skip these, but we need to add in the name length. */
				730
				731	case OP_MARK:
Elliott Hughes	653c210	2019-01-09 15:41:36 -0800	[diff] [blame]	732	case OP_COMMIT_ARG:
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	733	case OP_PRUNE_ARG:
				734	case OP_SKIP_ARG:
				735	case OP_THEN_ARG:
				736	cc += PRIV(OP_lengths)[op] + cc[1];
				737	break;
				738
				739	/* The remaining opcodes are just skipped over. */
				740
				741	case OP_CLOSE:
				742	case OP_COMMIT:
				743	case OP_FAIL:
				744	case OP_PRUNE:
				745	case OP_SET_SOM:
				746	case OP_SKIP:
				747	case OP_THEN:
				748	cc += PRIV(OP_lengths)[op];
				749	break;
				750
				751	/* This should not occur: we list all opcodes explicitly so that when
				752	new ones get added they are properly considered. */
				753
				754	default:
				755	return -3;
				756	}
				757	}
				758	/* Control never gets here */
				759	}
				760
				761
				762
				763	/*************************************************
				764	* Set a bit and maybe its alternate case *
				765	*************************************************/
				766
				767	/* Given a character, set its first code unit's bit in the table, and also the
				768	corresponding bit for the other version of a letter if we are caseless.
				769
				770	Arguments:
				771	re points to the regex block
				772	p points to the first code unit of the character
				773	caseless TRUE if caseless
				774	utf TRUE for UTF mode
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	775	ucp TRUE for UCP mode
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	776
				777	Returns: pointer after the character
				778	*/
				779
				780	static PCRE2_SPTR
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	781	set_table_bit(pcre2_real_code *re, PCRE2_SPTR p, BOOL caseless, BOOL utf,
				782	BOOL ucp)
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	783	{
				784	uint32_t c = p++; / First code unit */
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	785
				786	(void)utf; /* Stop compiler warnings when UTF not supported */
				787	(void)ucp;
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	788
				789	/* In 16-bit and 32-bit modes, code units greater than 0xff set the bit for
				790	0xff. */
				791
				792	#if PCRE2_CODE_UNIT_WIDTH != 8
				793	if (c > 0xff) SET_BIT(0xff); else
				794	#endif
				795
				796	SET_BIT(c);
				797
				798	/* In UTF-8 or UTF-16 mode, pick up the remaining code units in order to find
				799	the end of the character, even when caseless. */
				800
				801	#ifdef SUPPORT_UNICODE
				802	if (utf)
				803	{
				804	#if PCRE2_CODE_UNIT_WIDTH == 8
				805	if (c >= 0xc0) GETUTF8INC(c, p);
				806	#elif PCRE2_CODE_UNIT_WIDTH == 16
				807	if ((c & 0xfc00) == 0xd800) GETUTF16INC(c, p);
				808	#endif
				809	}
				810	#endif /* SUPPORT_UNICODE */
				811
				812	/* If caseless, handle the other case of the character. */
				813
				814	if (caseless)
				815	{
Elliott Hughes	9bc971b	2018-07-27 13:23:14 -0700	[diff] [blame]	816	#ifdef SUPPORT_UNICODE
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	817	if (utf \|\| ucp)
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	818	{
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	819	c = UCD_OTHERCASE(c);
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	820	#if PCRE2_CODE_UNIT_WIDTH == 8
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	821	if (utf)
				822	{
				823	PCRE2_UCHAR buff[6];
				824	(void)PRIV(ord2utf)(c, buff);
				825	SET_BIT(buff[0]);
				826	}
				827	else if (c < 256) SET_BIT(c);
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	828	#else /* 16-bit or 32-bit mode */
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	829	if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
				830	#endif
				831	}
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	832
Elliott Hughes	9bc971b	2018-07-27 13:23:14 -0700	[diff] [blame]	833	else
				834	#endif /* SUPPORT_UNICODE */
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	835
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	836	/* Not UTF or UCP */
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	837
Elliott Hughes	9bc971b	2018-07-27 13:23:14 -0700	[diff] [blame]	838	if (MAX_255(c)) SET_BIT(re->tables[fcc_offset + c]);
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	839	}
				840
				841	return p;
				842	}
				843
				844
				845
				846	/*************************************************
				847	* Set bits for a positive character type *
				848	*************************************************/
				849
				850	/* This function sets starting bits for a character type. In UTF-8 mode, we can
				851	only do a direct setting for bytes less than 128, as otherwise there can be
				852	confusion with bytes in the middle of UTF-8 characters. In a "traditional"
				853	environment, the tables will only recognize ASCII characters anyway, but in at
				854	least one Windows environment, some higher bytes bits were set in the tables.
				855	So we deal with that case by considering the UTF-8 encoding.
				856
				857	Arguments:
				858	re the regex block
				859	cbit type the type of character wanted
				860	table_limit 32 for non-UTF-8; 16 for UTF-8
				861
				862	Returns: nothing
				863	*/
				864
				865	static void
				866	set_type_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit)
				867	{
Elliott Hughes	9bc971b	2018-07-27 13:23:14 -0700	[diff] [blame]	868	uint32_t c;
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	869	for (c = 0; c < table_limit; c++)
				870	re->start_bitmap[c] \|= re->tables[c+cbits_offset+cbit_type];
				871	#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
				872	if (table_limit == 32) return;
				873	for (c = 128; c < 256; c++)
				874	{
Elliott Hughes	0c26e19	2019-08-07 12:24:46 -0700	[diff] [blame]	875	if ((re->tables[cbits_offset + c/8] & (1u << (c&7))) != 0)
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	876	{
				877	PCRE2_UCHAR buff[6];
				878	(void)PRIV(ord2utf)(c, buff);
				879	SET_BIT(buff[0]);
				880	}
				881	}
				882	#endif /* UTF-8 */
				883	}
				884
				885
				886	/*************************************************
				887	* Set bits for a negative character type *
				888	*************************************************/
				889
				890	/* This function sets starting bits for a negative character type such as \D.
				891	In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
				892	otherwise there can be confusion with bytes in the middle of UTF-8 characters.
				893	Unlike in the positive case, where we can set appropriate starting bits for
				894	specific high-valued UTF-8 characters, in this case we have to set the bits for
				895	all high-valued characters. The lowest is 0xc2, but we overkill by starting at
				896	0xc0 (192) for simplicity.
				897
				898	Arguments:
				899	re the regex block
				900	cbit type the type of character wanted
				901	table_limit 32 for non-UTF-8; 16 for UTF-8
				902
				903	Returns: nothing
				904	*/
				905
				906	static void
				907	set_nottype_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit)
				908	{
Elliott Hughes	9bc971b	2018-07-27 13:23:14 -0700	[diff] [blame]	909	uint32_t c;
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	910	for (c = 0; c < table_limit; c++)
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	911	re->start_bitmap[c] \|= (uint8_t)(~(re->tables[c+cbits_offset+cbit_type]));
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	912	#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
				913	if (table_limit != 32) for (c = 24; c < 32; c++) re->start_bitmap[c] = 0xff;
				914	#endif
				915	}
				916
				917
				918
				919	/*************************************************
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	920	* Create bitmap of starting code units *
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	921	*************************************************/
				922
				923	/* This function scans a compiled unanchored expression recursively and
				924	attempts to build a bitmap of the set of possible starting code units whose
				925	values are less than 256. In 16-bit and 32-bit mode, values above 255 all cause
				926	the 255 bit to be set. When calling set[_not]_type_bits() in UTF-8 (sic) mode
				927	we pass a value of 16 rather than 32 as the final argument. (See comments in
				928	those functions for the reason.)
				929
				930	The SSB_CONTINUE return is useful for parenthesized groups in patterns such as
				931	(a*)b where the group provides some optional starting code units but scanning
				932	must continue at the outer level to find at least one mandatory code unit. At
				933	the outermost level, this function fails unless the result is SSB_DONE.
				934
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	935	We restrict recursion (for nested groups) to 1000 to avoid stack overflow
				936	issues.
				937
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	938	Arguments:
				939	re points to the compiled regex block
				940	code points to an expression
				941	utf TRUE if in UTF mode
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	942	ucp TRUE if in UCP mode
				943	depthptr pointer to recurse depth
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	944
				945	Returns: SSB_FAIL => Failed to find any starting code units
				946	SSB_DONE => Found mandatory starting code units
				947	SSB_CONTINUE => Found optional starting code units
				948	SSB_UNKNOWN => Hit an unrecognized opcode
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	949	SSB_TOODEEP => Recursion is too deep
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	950	*/
				951
				952	static int
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	953	set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf, BOOL ucp,
				954	int *depthptr)
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	955	{
Elliott Hughes	9bc971b	2018-07-27 13:23:14 -0700	[diff] [blame]	956	uint32_t c;
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	957	int yield = SSB_DONE;
				958
				959	#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
				960	int table_limit = utf? 16:32;
				961	#else
				962	int table_limit = 32;
				963	#endif
				964
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	965	*depthptr += 1;
				966	if (*depthptr > 1000) return SSB_TOODEEP;
				967
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	968	do
				969	{
				970	BOOL try_next = TRUE;
				971	PCRE2_SPTR tcode = code + 1 + LINK_SIZE;
				972
				973	if (code == OP_CBRA \|\| code == OP_SCBRA \|\|
				974	code == OP_CBRAPOS \|\| code == OP_SCBRAPOS) tcode += IMM2_SIZE;
				975
				976	while (try_next) /* Loop for items in this branch */
				977	{
				978	int rc;
				979	uint8_t *classmap = NULL;
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	980	#ifdef SUPPORT_WIDE_CHARS
				981	PCRE2_UCHAR xclassflags;
				982	#endif
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	983
				984	switch(*tcode)
				985	{
				986	/* If we reach something we don't understand, it means a new opcode has
				987	been created that hasn't been added to this function. Hopefully this
				988	problem will be discovered during testing. */
				989
				990	default:
				991	return SSB_UNKNOWN;
				992
				993	/* Fail for a valid opcode that implies no starting bits. */
				994
				995	case OP_ACCEPT:
				996	case OP_ASSERT_ACCEPT:
				997	case OP_ALLANY:
				998	case OP_ANY:
				999	case OP_ANYBYTE:
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	1000	case OP_CIRCM:
				1001	case OP_CLOSE:
				1002	case OP_COMMIT:
Elliott Hughes	653c210	2019-01-09 15:41:36 -0800	[diff] [blame]	1003	case OP_COMMIT_ARG:
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	1004	case OP_COND:
				1005	case OP_CREF:
				1006	case OP_FALSE:
				1007	case OP_TRUE:
				1008	case OP_DNCREF:
				1009	case OP_DNREF:
				1010	case OP_DNREFI:
				1011	case OP_DNRREF:
				1012	case OP_DOLL:
				1013	case OP_DOLLM:
				1014	case OP_END:
				1015	case OP_EOD:
				1016	case OP_EODN:
				1017	case OP_EXTUNI:
				1018	case OP_FAIL:
				1019	case OP_MARK:
				1020	case OP_NOT:
				1021	case OP_NOTEXACT:
				1022	case OP_NOTEXACTI:
				1023	case OP_NOTI:
				1024	case OP_NOTMINPLUS:
				1025	case OP_NOTMINPLUSI:
				1026	case OP_NOTMINQUERY:
				1027	case OP_NOTMINQUERYI:
				1028	case OP_NOTMINSTAR:
				1029	case OP_NOTMINSTARI:
				1030	case OP_NOTMINUPTO:
				1031	case OP_NOTMINUPTOI:
				1032	case OP_NOTPLUS:
				1033	case OP_NOTPLUSI:
				1034	case OP_NOTPOSPLUS:
				1035	case OP_NOTPOSPLUSI:
				1036	case OP_NOTPOSQUERY:
				1037	case OP_NOTPOSQUERYI:
				1038	case OP_NOTPOSSTAR:
				1039	case OP_NOTPOSSTARI:
				1040	case OP_NOTPOSUPTO:
				1041	case OP_NOTPOSUPTOI:
				1042	case OP_NOTPROP:
				1043	case OP_NOTQUERY:
				1044	case OP_NOTQUERYI:
				1045	case OP_NOTSTAR:
				1046	case OP_NOTSTARI:
				1047	case OP_NOTUPTO:
				1048	case OP_NOTUPTOI:
				1049	case OP_NOT_HSPACE:
				1050	case OP_NOT_VSPACE:
				1051	case OP_PRUNE:
				1052	case OP_PRUNE_ARG:
				1053	case OP_RECURSE:
				1054	case OP_REF:
				1055	case OP_REFI:
				1056	case OP_REVERSE:
				1057	case OP_RREF:
				1058	case OP_SCOND:
				1059	case OP_SET_SOM:
				1060	case OP_SKIP:
				1061	case OP_SKIP_ARG:
				1062	case OP_SOD:
				1063	case OP_SOM:
				1064	case OP_THEN:
				1065	case OP_THEN_ARG:
				1066	return SSB_FAIL;
				1067
Elliott Hughes	9bc971b	2018-07-27 13:23:14 -0700	[diff] [blame]	1068	/* OP_CIRC happens only at the start of an anchored branch (multiline ^
				1069	uses OP_CIRCM). Skip over it. */
				1070
				1071	case OP_CIRC:
				1072	tcode += PRIV(OP_lengths)[OP_CIRC];
				1073	break;
				1074
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	1075	/* A "real" property test implies no starting bits, but the fake property
				1076	PT_CLIST identifies a list of characters. These lists are short, as they
				1077	are used for characters with more than one "other case", so there is no
				1078	point in recognizing them for OP_NOTPROP. */
				1079
				1080	case OP_PROP:
				1081	if (tcode[1] != PT_CLIST) return SSB_FAIL;
				1082	{
				1083	const uint32_t *p = PRIV(ucd_caseless_sets) + tcode[2];
				1084	while ((c = *p++) < NOTACHAR)
				1085	{
				1086	#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
				1087	if (utf)
				1088	{
				1089	PCRE2_UCHAR buff[6];
				1090	(void)PRIV(ord2utf)(c, buff);
				1091	c = buff[0];
				1092	}
				1093	#endif
				1094	if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
				1095	}
				1096	}
				1097	try_next = FALSE;
				1098	break;
				1099
				1100	/* We can ignore word boundary tests. */
				1101
				1102	case OP_WORD_BOUNDARY:
				1103	case OP_NOT_WORD_BOUNDARY:
				1104	tcode++;
				1105	break;
				1106
				1107	/* If we hit a bracket or a positive lookahead assertion, recurse to set
				1108	bits from within the subpattern. If it can't find anything, we have to
				1109	give up. If it finds some mandatory character(s), we are done for this
				1110	branch. Otherwise, carry on scanning after the subpattern. */
				1111
				1112	case OP_BRA:
				1113	case OP_SBRA:
				1114	case OP_CBRA:
				1115	case OP_SCBRA:
				1116	case OP_BRAPOS:
				1117	case OP_SBRAPOS:
				1118	case OP_CBRAPOS:
				1119	case OP_SCBRAPOS:
				1120	case OP_ONCE:
Elliott Hughes	0c26e19	2019-08-07 12:24:46 -0700	[diff] [blame]	1121	case OP_SCRIPT_RUN:
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	1122	case OP_ASSERT:
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	1123	case OP_ASSERT_NA:
				1124	rc = set_start_bits(re, tcode, utf, ucp, depthptr);
				1125	if (rc == SSB_DONE)
				1126	{
				1127	try_next = FALSE;
				1128	}
				1129	else if (rc == SSB_CONTINUE)
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	1130	{
				1131	do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
				1132	tcode += 1 + LINK_SIZE;
				1133	}
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	1134	else return rc; /* FAIL, UNKNOWN, or TOODEEP */
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	1135	break;
				1136
				1137	/* If we hit ALT or KET, it means we haven't found anything mandatory in
				1138	this branch, though we might have found something optional. For ALT, we
				1139	continue with the next alternative, but we have to arrange that the final
				1140	result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
				1141	return SSB_CONTINUE: if this is the top level, that indicates failure,
				1142	but after a nested subpattern, it causes scanning to continue. */
				1143
				1144	case OP_ALT:
				1145	yield = SSB_CONTINUE;
				1146	try_next = FALSE;
				1147	break;
				1148
				1149	case OP_KET:
				1150	case OP_KETRMAX:
				1151	case OP_KETRMIN:
				1152	case OP_KETRPOS:
				1153	return SSB_CONTINUE;
				1154
				1155	/* Skip over callout */
				1156
				1157	case OP_CALLOUT:
				1158	tcode += PRIV(OP_lengths)[OP_CALLOUT];
				1159	break;
				1160
				1161	case OP_CALLOUT_STR:
				1162	tcode += GET(tcode, 1 + 2*LINK_SIZE);
				1163	break;
				1164
				1165	/* Skip over lookbehind and negative lookahead assertions */
				1166
				1167	case OP_ASSERT_NOT:
				1168	case OP_ASSERTBACK:
				1169	case OP_ASSERTBACK_NOT:
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	1170	case OP_ASSERTBACK_NA:
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	1171	do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
				1172	tcode += 1 + LINK_SIZE;
				1173	break;
				1174
				1175	/* BRAZERO does the bracket, but carries on. */
				1176
				1177	case OP_BRAZERO:
				1178	case OP_BRAMINZERO:
				1179	case OP_BRAPOSZERO:
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	1180	rc = set_start_bits(re, ++tcode, utf, ucp, depthptr);
				1181	if (rc == SSB_FAIL \|\| rc == SSB_UNKNOWN \|\| rc == SSB_TOODEEP) return rc;
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	1182	do tcode += GET(tcode,1); while (*tcode == OP_ALT);
				1183	tcode += 1 + LINK_SIZE;
				1184	break;
				1185
				1186	/* SKIPZERO skips the bracket. */
				1187
				1188	case OP_SKIPZERO:
				1189	tcode++;
				1190	do tcode += GET(tcode,1); while (*tcode == OP_ALT);
				1191	tcode += 1 + LINK_SIZE;
				1192	break;
				1193
				1194	/* Single-char * or ? sets the bit and tries the next item */
				1195
				1196	case OP_STAR:
				1197	case OP_MINSTAR:
				1198	case OP_POSSTAR:
				1199	case OP_QUERY:
				1200	case OP_MINQUERY:
				1201	case OP_POSQUERY:
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	1202	tcode = set_table_bit(re, tcode + 1, FALSE, utf, ucp);
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	1203	break;
				1204
				1205	case OP_STARI:
				1206	case OP_MINSTARI:
				1207	case OP_POSSTARI:
				1208	case OP_QUERYI:
				1209	case OP_MINQUERYI:
				1210	case OP_POSQUERYI:
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	1211	tcode = set_table_bit(re, tcode + 1, TRUE, utf, ucp);
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	1212	break;
				1213
				1214	/* Single-char upto sets the bit and tries the next */
				1215
				1216	case OP_UPTO:
				1217	case OP_MINUPTO:
				1218	case OP_POSUPTO:
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	1219	tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, FALSE, utf, ucp);
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	1220	break;
				1221
				1222	case OP_UPTOI:
				1223	case OP_MINUPTOI:
				1224	case OP_POSUPTOI:
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	1225	tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, TRUE, utf, ucp);
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	1226	break;
				1227
				1228	/* At least one single char sets the bit and stops */
				1229
				1230	case OP_EXACT:
				1231	tcode += IMM2_SIZE;
				1232	/* Fall through */
				1233	case OP_CHAR:
				1234	case OP_PLUS:
				1235	case OP_MINPLUS:
				1236	case OP_POSPLUS:
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	1237	(void)set_table_bit(re, tcode + 1, FALSE, utf, ucp);
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	1238	try_next = FALSE;
				1239	break;
				1240
				1241	case OP_EXACTI:
				1242	tcode += IMM2_SIZE;
				1243	/* Fall through */
				1244	case OP_CHARI:
				1245	case OP_PLUSI:
				1246	case OP_MINPLUSI:
				1247	case OP_POSPLUSI:
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	1248	(void)set_table_bit(re, tcode + 1, TRUE, utf, ucp);
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	1249	try_next = FALSE;
				1250	break;
				1251
				1252	/* Special spacing and line-terminating items. These recognize specific
				1253	lists of characters. The difference between VSPACE and ANYNL is that the
				1254	latter can match the two-character CRLF sequence, but that is not
				1255	relevant for finding the first character, so their code here is
				1256	identical. */
				1257
				1258	case OP_HSPACE:
				1259	SET_BIT(CHAR_HT);
				1260	SET_BIT(CHAR_SPACE);
				1261
				1262	/* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
				1263	the bits for 0xA0 and for code units >= 255, independently of UTF. */
				1264
				1265	#if PCRE2_CODE_UNIT_WIDTH != 8
				1266	SET_BIT(0xA0);
				1267	SET_BIT(0xFF);
				1268	#else
				1269	/* For the 8-bit library in UTF-8 mode, set the bits for the first code
				1270	units of horizontal space characters. */
				1271
				1272	#ifdef SUPPORT_UNICODE
				1273	if (utf)
				1274	{
				1275	SET_BIT(0xC2); /* For U+00A0 */
				1276	SET_BIT(0xE1); /* For U+1680, U+180E */
				1277	SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
				1278	SET_BIT(0xE3); /* For U+3000 */
				1279	}
				1280	else
				1281	#endif
				1282	/* For the 8-bit library not in UTF-8 mode, set the bit for 0xA0, unless
				1283	the code is EBCDIC. */
				1284	{
				1285	#ifndef EBCDIC
				1286	SET_BIT(0xA0);
				1287	#endif /* Not EBCDIC */
				1288	}
				1289	#endif /* 8-bit support */
				1290
				1291	try_next = FALSE;
				1292	break;
				1293
				1294	case OP_ANYNL:
				1295	case OP_VSPACE:
				1296	SET_BIT(CHAR_LF);
				1297	SET_BIT(CHAR_VT);
				1298	SET_BIT(CHAR_FF);
				1299	SET_BIT(CHAR_CR);
				1300
				1301	/* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
				1302	the bits for NEL and for code units >= 255, independently of UTF. */
				1303
				1304	#if PCRE2_CODE_UNIT_WIDTH != 8
				1305	SET_BIT(CHAR_NEL);
				1306	SET_BIT(0xFF);
				1307	#else
				1308	/* For the 8-bit library in UTF-8 mode, set the bits for the first code
				1309	units of vertical space characters. */
				1310
				1311	#ifdef SUPPORT_UNICODE
				1312	if (utf)
				1313	{
				1314	SET_BIT(0xC2); /* For U+0085 (NEL) */
				1315	SET_BIT(0xE2); /* For U+2028, U+2029 */
				1316	}
				1317	else
				1318	#endif
				1319	/* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */
				1320	{
				1321	SET_BIT(CHAR_NEL);
				1322	}
				1323	#endif /* 8-bit support */
				1324
				1325	try_next = FALSE;
				1326	break;
				1327
				1328	/* Single character types set the bits and stop. Note that if PCRE2_UCP
Elliott Hughes	653c210	2019-01-09 15:41:36 -0800	[diff] [blame]	1329	is set, we do not see these opcodes because \d etc are converted to
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	1330	properties. Therefore, these apply in the case when only characters less
				1331	than 256 are recognized to match the types. */
				1332
				1333	case OP_NOT_DIGIT:
				1334	set_nottype_bits(re, cbit_digit, table_limit);
				1335	try_next = FALSE;
				1336	break;
				1337
				1338	case OP_DIGIT:
				1339	set_type_bits(re, cbit_digit, table_limit);
				1340	try_next = FALSE;
				1341	break;
				1342
				1343	case OP_NOT_WHITESPACE:
				1344	set_nottype_bits(re, cbit_space, table_limit);
				1345	try_next = FALSE;
				1346	break;
				1347
				1348	case OP_WHITESPACE:
				1349	set_type_bits(re, cbit_space, table_limit);
				1350	try_next = FALSE;
				1351	break;
				1352
				1353	case OP_NOT_WORDCHAR:
				1354	set_nottype_bits(re, cbit_word, table_limit);
				1355	try_next = FALSE;
				1356	break;
				1357
				1358	case OP_WORDCHAR:
				1359	set_type_bits(re, cbit_word, table_limit);
				1360	try_next = FALSE;
				1361	break;
				1362
				1363	/* One or more character type fudges the pointer and restarts, knowing
				1364	it will hit a single character type and stop there. */
				1365
				1366	case OP_TYPEPLUS:
				1367	case OP_TYPEMINPLUS:
				1368	case OP_TYPEPOSPLUS:
				1369	tcode++;
				1370	break;
				1371
				1372	case OP_TYPEEXACT:
				1373	tcode += 1 + IMM2_SIZE;
				1374	break;
				1375
				1376	/* Zero or more repeats of character types set the bits and then
				1377	try again. */
				1378
				1379	case OP_TYPEUPTO:
				1380	case OP_TYPEMINUPTO:
				1381	case OP_TYPEPOSUPTO:
				1382	tcode += IMM2_SIZE; /* Fall through */
				1383
				1384	case OP_TYPESTAR:
				1385	case OP_TYPEMINSTAR:
				1386	case OP_TYPEPOSSTAR:
				1387	case OP_TYPEQUERY:
				1388	case OP_TYPEMINQUERY:
				1389	case OP_TYPEPOSQUERY:
				1390	switch(tcode[1])
				1391	{
				1392	default:
				1393	case OP_ANY:
				1394	case OP_ALLANY:
				1395	return SSB_FAIL;
				1396
				1397	case OP_HSPACE:
				1398	SET_BIT(CHAR_HT);
				1399	SET_BIT(CHAR_SPACE);
				1400
				1401	/* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
				1402	the bits for 0xA0 and for code units >= 255, independently of UTF. */
				1403
				1404	#if PCRE2_CODE_UNIT_WIDTH != 8
				1405	SET_BIT(0xA0);
				1406	SET_BIT(0xFF);
				1407	#else
				1408	/* For the 8-bit library in UTF-8 mode, set the bits for the first code
				1409	units of horizontal space characters. */
				1410
				1411	#ifdef SUPPORT_UNICODE
				1412	if (utf)
				1413	{
				1414	SET_BIT(0xC2); /* For U+00A0 */
				1415	SET_BIT(0xE1); /* For U+1680, U+180E */
				1416	SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
				1417	SET_BIT(0xE3); /* For U+3000 */
				1418	}
				1419	else
				1420	#endif
				1421	/* For the 8-bit library not in UTF-8 mode, set the bit for 0xA0, unless
				1422	the code is EBCDIC. */
				1423	{
				1424	#ifndef EBCDIC
				1425	SET_BIT(0xA0);
				1426	#endif /* Not EBCDIC */
				1427	}
				1428	#endif /* 8-bit support */
				1429	break;
				1430
				1431	case OP_ANYNL:
				1432	case OP_VSPACE:
				1433	SET_BIT(CHAR_LF);
				1434	SET_BIT(CHAR_VT);
				1435	SET_BIT(CHAR_FF);
				1436	SET_BIT(CHAR_CR);
				1437
				1438	/* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
				1439	the bits for NEL and for code units >= 255, independently of UTF. */
				1440
				1441	#if PCRE2_CODE_UNIT_WIDTH != 8
				1442	SET_BIT(CHAR_NEL);
				1443	SET_BIT(0xFF);
				1444	#else
				1445	/* For the 8-bit library in UTF-8 mode, set the bits for the first code
				1446	units of vertical space characters. */
				1447
				1448	#ifdef SUPPORT_UNICODE
				1449	if (utf)
				1450	{
				1451	SET_BIT(0xC2); /* For U+0085 (NEL) */
				1452	SET_BIT(0xE2); /* For U+2028, U+2029 */
				1453	}
				1454	else
				1455	#endif
				1456	/* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */
				1457	{
				1458	SET_BIT(CHAR_NEL);
				1459	}
				1460	#endif /* 8-bit support */
				1461	break;
				1462
				1463	case OP_NOT_DIGIT:
				1464	set_nottype_bits(re, cbit_digit, table_limit);
				1465	break;
				1466
				1467	case OP_DIGIT:
				1468	set_type_bits(re, cbit_digit, table_limit);
				1469	break;
				1470
				1471	case OP_NOT_WHITESPACE:
				1472	set_nottype_bits(re, cbit_space, table_limit);
				1473	break;
				1474
				1475	case OP_WHITESPACE:
				1476	set_type_bits(re, cbit_space, table_limit);
				1477	break;
				1478
				1479	case OP_NOT_WORDCHAR:
				1480	set_nottype_bits(re, cbit_word, table_limit);
				1481	break;
				1482
				1483	case OP_WORDCHAR:
				1484	set_type_bits(re, cbit_word, table_limit);
				1485	break;
				1486	}
				1487
				1488	tcode += 2;
				1489	break;
				1490
				1491	/* Extended class: if there are any property checks, or if this is a
				1492	negative XCLASS without a map, give up. If there are no property checks,
				1493	there must be wide characters on the XCLASS list, because otherwise an
				1494	XCLASS would not have been created. This means that code points >= 255
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	1495	are potential starters. In the UTF-8 case we can scan them and set bits
				1496	for the relevant leading bytes. */
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	1497
				1498	#ifdef SUPPORT_WIDE_CHARS
				1499	case OP_XCLASS:
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	1500	xclassflags = tcode[1 + LINK_SIZE];
				1501	if ((xclassflags & XCL_HASPROP) != 0 \|\|
				1502	(xclassflags & (XCL_MAP\|XCL_NOT)) == XCL_NOT)
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	1503	return SSB_FAIL;
				1504
				1505	/* We have a positive XCLASS or a negative one without a map. Set up the
				1506	map pointer if there is one, and fall through. */
				1507
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	1508	classmap = ((xclassflags & XCL_MAP) == 0)? NULL :
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	1509	(uint8_t *)(tcode + 1 + LINK_SIZE + 1);
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	1510
				1511	/* In UTF-8 mode, scan the character list and set bits for leading bytes,
				1512	then jump to handle the map. */
				1513
				1514	#if PCRE2_CODE_UNIT_WIDTH == 8
				1515	if (utf && (xclassflags & XCL_NOT) == 0)
				1516	{
				1517	PCRE2_UCHAR b, e;
				1518	PCRE2_SPTR p = tcode + 1 + LINK_SIZE + 1 + ((classmap == NULL)? 0:32);
				1519	tcode += GET(tcode, 1);
				1520
				1521	for (;;) switch (*p++)
				1522	{
				1523	case XCL_SINGLE:
				1524	b = *p++;
				1525	while ((*p & 0xc0) == 0x80) p++;
				1526	re->start_bitmap[b/8] \|= (1u << (b&7));
				1527	break;
				1528
				1529	case XCL_RANGE:
				1530	b = *p++;
				1531	while ((*p & 0xc0) == 0x80) p++;
				1532	e = *p++;
				1533	while ((*p & 0xc0) == 0x80) p++;
				1534	for (; b <= e; b++)
				1535	re->start_bitmap[b/8] \|= (1u << (b&7));
				1536	break;
				1537
				1538	case XCL_END:
				1539	goto HANDLE_CLASSMAP;
				1540
				1541	default:
				1542	return SSB_UNKNOWN; /* Internal error, should not occur */
				1543	}
				1544	}
				1545	#endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 */
				1546	#endif /* SUPPORT_WIDE_CHARS */
				1547
Elliott Hughes	9bc971b	2018-07-27 13:23:14 -0700	[diff] [blame]	1548	/* It seems that the fall through comment must be outside the #ifdef if
				1549	it is to avoid the gcc compiler warning. */
				1550
				1551	/* Fall through */
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	1552
				1553	/* Enter here for a negative non-XCLASS. In the 8-bit library, if we are
				1554	in UTF mode, any byte with a value >= 0xc4 is a potentially valid starter
				1555	because it starts a character with a value > 255. In 8-bit non-UTF mode,
				1556	there is no difference between CLASS and NCLASS. In all other wide
				1557	character modes, set the 0xFF bit to indicate code units >= 255. */
				1558
				1559	case OP_NCLASS:
				1560	#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
				1561	if (utf)
				1562	{
				1563	re->start_bitmap[24] \|= 0xf0; /* Bits for 0xc4 - 0xc8 */
				1564	memset(re->start_bitmap+25, 0xff, 7); /* Bits for 0xc9 - 0xff */
				1565	}
				1566	#elif PCRE2_CODE_UNIT_WIDTH != 8
				1567	SET_BIT(0xFF); /* For characters >= 255 */
				1568	#endif
				1569	/* Fall through */
				1570
				1571	/* Enter here for a positive non-XCLASS. If we have fallen through from
				1572	an XCLASS, classmap will already be set; just advance the code pointer.
				1573	Otherwise, set up classmap for a a non-XCLASS and advance past it. */
				1574
				1575	case OP_CLASS:
				1576	if (*tcode == OP_XCLASS) tcode += GET(tcode, 1); else
				1577	{
				1578	classmap = (uint8_t *)(++tcode);
				1579	tcode += 32 / sizeof(PCRE2_UCHAR);
				1580	}
				1581
				1582	/* When wide characters are supported, classmap may be NULL. In UTF-8
				1583	(sic) mode, the bits in a class bit map correspond to character values,
				1584	not to byte values. However, the bit map we are constructing is for byte
				1585	values. So we have to do a conversion for characters whose code point is
				1586	greater than 127. In fact, there are only two possible starting bytes for
				1587	characters in the range 128 - 255. */
				1588
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	1589	#if defined SUPPORT_WIDE_CHARS && PCRE2_CODE_UNIT_WIDTH == 8
				1590	HANDLE_CLASSMAP:
				1591	#endif
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	1592	if (classmap != NULL)
				1593	{
				1594	#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
				1595	if (utf)
				1596	{
				1597	for (c = 0; c < 16; c++) re->start_bitmap[c] \|= classmap[c];
				1598	for (c = 128; c < 256; c++)
				1599	{
Elliott Hughes	0c26e19	2019-08-07 12:24:46 -0700	[diff] [blame]	1600	if ((classmap[c/8] & (1u << (c&7))) != 0)
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	1601	{
Elliott Hughes	0c26e19	2019-08-07 12:24:46 -0700	[diff] [blame]	1602	int d = (c >> 6) \| 0xc0; /* Set bit for this starter */
				1603	re->start_bitmap[d/8] \|= (1u << (d&7)); /* and then skip on to the */
				1604	c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	1605	}
				1606	}
				1607	}
				1608	else
				1609	#endif
				1610	/* In all modes except UTF-8, the two bit maps are compatible. */
				1611
				1612	{
				1613	for (c = 0; c < 32; c++) re->start_bitmap[c] \|= classmap[c];
				1614	}
				1615	}
				1616
				1617	/* Act on what follows the class. For a zero minimum repeat, continue;
				1618	otherwise stop processing. */
				1619
				1620	switch (*tcode)
				1621	{
				1622	case OP_CRSTAR:
				1623	case OP_CRMINSTAR:
				1624	case OP_CRQUERY:
				1625	case OP_CRMINQUERY:
				1626	case OP_CRPOSSTAR:
				1627	case OP_CRPOSQUERY:
				1628	tcode++;
				1629	break;
				1630
				1631	case OP_CRRANGE:
				1632	case OP_CRMINRANGE:
				1633	case OP_CRPOSRANGE:
				1634	if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
				1635	else try_next = FALSE;
				1636	break;
				1637
				1638	default:
				1639	try_next = FALSE;
				1640	break;
				1641	}
				1642	break; /* End of class handling case */
				1643	} /* End of switch for opcodes */
				1644	} /* End of try_next loop */
				1645
				1646	code += GET(code, 1); /* Advance to next branch */
				1647	}
				1648	while (*code == OP_ALT);
				1649
				1650	return yield;
				1651	}
				1652
				1653
				1654
				1655	/*************************************************
				1656	* Study a compiled expression *
				1657	*************************************************/
				1658
				1659	/* This function is handed a compiled expression that it must study to produce
				1660	information that will speed up the matching.
				1661
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	1662	Argument:
				1663	re points to the compiled expression
				1664
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	1665	Returns: 0 normally; non-zero should never normally occur
				1666	1 unknown opcode in set_start_bits
				1667	2 missing capturing bracket
				1668	3 unknown opcode in find_minlength
				1669	*/
				1670
				1671	int
				1672	PRIV(study)(pcre2_real_code *re)
				1673	{
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	1674	int count = 0;
				1675	PCRE2_UCHAR *code;
				1676	BOOL utf = (re->overall_options & PCRE2_UTF) != 0;
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	1677	BOOL ucp = (re->overall_options & PCRE2_UCP) != 0;
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	1678
				1679	/* Find start of compiled code */
				1680
				1681	code = (PCRE2_UCHAR )((uint8_t )re + sizeof(pcre2_real_code)) +
				1682	re->name_entry_size * re->name_count;
				1683
Elliott Hughes	9bc971b	2018-07-27 13:23:14 -0700	[diff] [blame]	1684	/* For a pattern that has a first code unit, or a multiline pattern that
				1685	matches only at "line start", there is no point in seeking a list of starting
				1686	code units. */
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	1687
Elliott Hughes	9bc971b	2018-07-27 13:23:14 -0700	[diff] [blame]	1688	if ((re->flags & (PCRE2_FIRSTSET\|PCRE2_STARTLINE)) == 0)
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	1689	{
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	1690	int depth = 0;
				1691	int rc = set_start_bits(re, code, utf, ucp, &depth);
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	1692	if (rc == SSB_UNKNOWN) return 1;
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	1693
				1694	/* If a list of starting code units was set up, scan the list to see if only
				1695	one or two were listed. Having only one listed is rare because usually a
				1696	single starting code unit will have been recognized and PCRE2_FIRSTSET set.
				1697	If two are listed, see if they are caseless versions of the same character;
				1698	if so we can replace the list with a caseless first code unit. This gives
				1699	better performance and is plausibly worth doing for patterns such as [Ww]ord
				1700	or (word\|WORD). */
				1701
				1702	if (rc == SSB_DONE)
				1703	{
				1704	int i;
				1705	int a = -1;
				1706	int b = -1;
				1707	uint8_t *p = re->start_bitmap;
				1708	uint32_t flags = PCRE2_FIRSTMAPSET;
				1709
				1710	for (i = 0; i < 256; p++, i += 8)
				1711	{
				1712	uint8_t x = *p;
				1713	if (x != 0)
				1714	{
				1715	int c;
				1716	uint8_t y = x & (~x + 1); /* Least significant bit */
				1717	if (y != x) goto DONE; /* More than one bit set */
				1718
				1719	/* In the 16-bit and 32-bit libraries, the bit for 0xff means "0xff and
				1720	all wide characters", so we cannot use it here. */
				1721
				1722	#if PCRE2_CODE_UNIT_WIDTH != 8
				1723	if (i == 248 && x == 0x80) goto DONE;
				1724	#endif
				1725
				1726	/* Compute the character value */
				1727
				1728	c = i;
				1729	switch (x)
				1730	{
				1731	case 1: break;
				1732	case 2: c += 1; break; case 4: c += 2; break;
				1733	case 8: c += 3; break; case 16: c += 4; break;
				1734	case 32: c += 5; break; case 64: c += 6; break;
				1735	case 128: c += 7; break;
				1736	}
				1737
				1738	/* c contains the code unit value, in the range 0-255. In 8-bit UTF
				1739	mode, only values < 128 can be used. In all the other cases, c is a
				1740	character value. */
				1741
				1742	#if PCRE2_CODE_UNIT_WIDTH == 8
				1743	if (utf && c > 127) goto DONE;
				1744	#endif
				1745	if (a < 0) a = c; /* First one found, save in a */
				1746	else if (b < 0) /* Second one found */
				1747	{
				1748	int d = TABLE_GET((unsigned int)c, re->tables + fcc_offset, c);
				1749
				1750	#ifdef SUPPORT_UNICODE
				1751	if (utf \|\| ucp)
				1752	{
				1753	if (UCD_CASESET(c) != 0) goto DONE; /* Multiple case set */
				1754	if (c > 127) d = UCD_OTHERCASE(c);
				1755	}
				1756	#endif /* SUPPORT_UNICODE */
				1757
				1758	if (d != a) goto DONE; /* Not the other case of a */
				1759	b = c; /* Save second in b */
				1760	}
				1761	else goto DONE; /* More than two characters found */
				1762	}
				1763	}
				1764
				1765	/* Replace the start code unit bits with a first code unit, but only if it
				1766	is not the same as a required later code unit. This is because a search for
				1767	a required code unit starts after an explicit first code unit, but at a
				1768	code unit found from the bitmap. Patterns such as /a*a/ don't work
				1769	if both the start unit and required unit are the same. */
				1770
				1771	if (a >= 0 &&
				1772	(
				1773	(re->flags & PCRE2_LASTSET) == 0 \|\|
				1774	(
				1775	re->last_codeunit != (uint32_t)a &&
				1776	(b < 0 \|\| re->last_codeunit != (uint32_t)b)
				1777	)
				1778	))
				1779	{
				1780	re->first_codeunit = a;
				1781	flags = PCRE2_FIRSTSET;
				1782	if (b >= 0) flags \|= PCRE2_FIRSTCASELESS;
				1783	}
				1784
				1785	DONE:
				1786	re->flags \|= flags;
				1787	}
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	1788	}
				1789
Elliott Hughes	9bc971b	2018-07-27 13:23:14 -0700	[diff] [blame]	1790	/* Find the minimum length of subject string. If the pattern can match an empty
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	1791	string, the minimum length is already known. If the pattern contains (*ACCEPT)
				1792	all bets are off, and we don't even try to find a minimum length. If there are
				1793	more back references than the size of the vector we are going to cache them in,
				1794	do nothing. A pattern that complicated will probably take a long time to
				1795	analyze and may in any case turn out to be too complicated. Note that back
				1796	reference minima are held as 16-bit numbers. */
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	1797
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	1798	if ((re->flags & (PCRE2_MATCH_EMPTY\|PCRE2_HASACCEPT)) == 0 &&
Elliott Hughes	9bc971b	2018-07-27 13:23:14 -0700	[diff] [blame]	1799	re->top_backref <= MAX_CACHE_BACKREF)
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	1800	{
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	1801	int min;
Elliott Hughes	9bc971b	2018-07-27 13:23:14 -0700	[diff] [blame]	1802	int backref_cache[MAX_CACHE_BACKREF+1];
				1803	backref_cache[0] = 0; /* Highest one that is set */
				1804	min = find_minlength(re, code, code, utf, NULL, &count, backref_cache);
				1805	switch(min)
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	1806	{
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	1807	case -1: /* \C in UTF mode or over-complex regex */
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	1808	break; /* Leave minlength unchanged (will be zero) */
				1809
				1810	case -2:
				1811	return 2; /* missing capturing bracket */
				1812
				1813	case -3:
				1814	return 3; /* unrecognized opcode */
				1815
				1816	default:
Elliott Hughes	2dbd7d2	2020-06-03 14:32:37 -0700	[diff] [blame]	1817	re->minlength = (min > UINT16_MAX)? UINT16_MAX : min;
Janis Danisevskis	53e448c	2016-03-31 13:35:25 +0100	[diff] [blame]	1818	break;
				1819	}
				1820	}
				1821
				1822	return 0;
				1823	}
				1824
				1825	/* End of pcre2_study.c */