Blame - Modules/pypcre.c - platform/external/python/cpython3

blob: 6a3ada5c26f0fb3a24ed62d67cdcd4ba7b46ee93 [file] [log] [blame]

Guido van Rossum	51b3aa3	1997-10-06 14:43:11 +0000	[diff] [blame]	1
				2	/*************************************************
				3	* Perl-Compatible Regular Expressions *
				4	*************************************************/
				5
				6	/* DO NOT EDIT THIS FILE! */
				7
				8	/* This file is automatically written by the merge-files.py script
				9	included with the PCRE distribution for Python; it's produced from
				10	several C files, and code is removed in the process. If you want to
				11	modify the code or track down bugs, it will be much easier to work
				12	with the code in its original, multiple-file form. Don't edit this
				13	file by hand, or submit patches to it.
				14
				15	The Python-specific PCRE distribution can be retrieved from
				16	http://starship.skyport.net/crew/amk/regex/
				17
				18	The unmodified original PCRE distribution doesn't have a fixed URL
				19	yet; write Philip Hazel <ph10@cam.ac.uk> for the latest version.
				20
				21	Written by: Philip Hazel <ph10@cam.ac.uk>
				22
				23	Extensively modified by the Python String-SIG: <string-sig@python.org>
				24	Send bug reports to: <string-sig@python.org>
				25	(They'll figure out if it's a bug in PCRE or in the Python-specific
				26	changes.)
				27
				28	Copyright (c) 1997 University of Cambridge
				29
				30	-----------------------------------------------------------------------------
				31	Permission is granted to anyone to use this software for any purpose on any
				32	computer system, and to redistribute it freely, subject to the following
				33	restrictions:
				34
				35	1. This software is distributed in the hope that it will be useful,
				36	but WITHOUT ANY WARRANTY; without even the implied warranty of
				37	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
				38
				39	2. The origin of this software must not be misrepresented, either by
				40	explicit claim or by omission.
				41
				42	3. Altered versions must be plainly marked as such, and must not be
				43	misrepresented as being the original software.
				44	-----------------------------------------------------------------------------
				45	*/
				46
				47
				48	#define FOR_PYTHON
				49	#include "pcre-internal.h"
				50	#include "Python.h"
				51	#include "graminit.h"
				52
				53	/*************************************************
				54	* Perl-Compatible Regular Expressions *
				55	*************************************************/
				56
				57	/* This file is automatically written by the makechartables auxiliary
				58	program. If you edit it by hand, you might like to edit the Makefile to
				59	prevent its ever being regenerated. */
				60
				61	/* This table is a lower casing table. */
				62
				63	unsigned char pcre_lcc[] = {
				64	0, 1, 2, 3, 4, 5, 6, 7,
				65	8, 9, 10, 11, 12, 13, 14, 15,
				66	16, 17, 18, 19, 20, 21, 22, 23,
				67	24, 25, 26, 27, 28, 29, 30, 31,
				68	32, 33, 34, 35, 36, 37, 38, 39,
				69	40, 41, 42, 43, 44, 45, 46, 47,
				70	48, 49, 50, 51, 52, 53, 54, 55,
				71	56, 57, 58, 59, 60, 61, 62, 63,
				72	64, 97, 98, 99,100,101,102,103,
				73	104,105,106,107,108,109,110,111,
				74	112,113,114,115,116,117,118,119,
				75	120,121,122, 91, 92, 93, 94, 95,
				76	96, 97, 98, 99,100,101,102,103,
				77	104,105,106,107,108,109,110,111,
				78	112,113,114,115,116,117,118,119,
				79	120,121,122,123,124,125,126,127,
				80	128,129,130,131,132,133,134,135,
				81	136,137,138,139,140,141,142,143,
				82	144,145,146,147,148,149,150,151,
				83	152,153,154,155,156,157,158,159,
				84	160,161,162,163,164,165,166,167,
				85	168,169,170,171,172,173,174,175,
				86	176,177,178,179,180,181,182,183,
				87	184,185,186,187,188,189,190,191,
				88	192,193,194,195,196,197,198,199,
				89	200,201,202,203,204,205,206,207,
				90	208,209,210,211,212,213,214,215,
				91	216,217,218,219,220,221,222,223,
				92	224,225,226,227,228,229,230,231,
				93	232,233,234,235,236,237,238,239,
				94	240,241,242,243,244,245,246,247,
				95	248,249,250,251,252,253,254,255 };
				96
				97	/* This table is an upper casing table. */
				98
				99	unsigned char pcre_ucc[] = {
				100	0, 1, 2, 3, 4, 5, 6, 7,
				101	8, 9, 10, 11, 12, 13, 14, 15,
				102	16, 17, 18, 19, 20, 21, 22, 23,
				103	24, 25, 26, 27, 28, 29, 30, 31,
				104	32, 33, 34, 35, 36, 37, 38, 39,
				105	40, 41, 42, 43, 44, 45, 46, 47,
				106	48, 49, 50, 51, 52, 53, 54, 55,
				107	56, 57, 58, 59, 60, 61, 62, 63,
				108	64, 65, 66, 67, 68, 69, 70, 71,
				109	72, 73, 74, 75, 76, 77, 78, 79,
				110	80, 81, 82, 83, 84, 85, 86, 87,
				111	88, 89, 90, 91, 92, 93, 94, 95,
				112	96, 65, 66, 67, 68, 69, 70, 71,
				113	72, 73, 74, 75, 76, 77, 78, 79,
				114	80, 81, 82, 83, 84, 85, 86, 87,
				115	88, 89, 90,123,124,125,126,127,
				116	128,129,130,131,132,133,134,135,
				117	136,137,138,139,140,141,142,143,
				118	144,145,146,147,148,149,150,151,
				119	152,153,154,155,156,157,158,159,
				120	160,161,162,163,164,165,166,167,
				121	168,169,170,171,172,173,174,175,
				122	176,177,178,179,180,181,182,183,
				123	184,185,186,187,188,189,190,191,
				124	192,193,194,195,196,197,198,199,
				125	200,201,202,203,204,205,206,207,
				126	208,209,210,211,212,213,214,215,
				127	216,217,218,219,220,221,222,223,
				128	224,225,226,227,228,229,230,231,
				129	232,233,234,235,236,237,238,239,
				130	240,241,242,243,244,245,246,247,
				131	248,249,250,251,252,253,254,255 };
				132
				133	/* This table identifies various classes of character by individual bits:
				134	1 white space character
				135	2 decimal digit
				136	4 hexadecimal digit
				137	8 alphanumeric or '_'
				138	16 octal digit
				139	128 regular expression metacharacter or binary zero
				140	*/
				141
				142	unsigned char pcre_ctypes[] = {
				143	0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
				144	0x00,0x01,0x01,0x01,0x01,0x01,0x00,0x00, /* 8- 15 */
				145	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
				146	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
				147	0x01,0x00,0x00,0x00,0x80,0x00,0x00,0x00, /* - ' */
				148	0x80,0x80,0x80,0x80,0x00,0x00,0x80,0x00, /* ( - / */
				149	0x1e,0x1e,0x1e,0x1e,0x1e,0x1e,0x1e,0x1e, /* 0 - 7 */
				150	0x0e,0x0e,0x00,0x00,0x00,0x00,0x00,0x80, /* 8 - ? */
				151	0x00,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x08, /* @ - G */
				152	0x08,0x08,0x08,0x08,0x08,0x08,0x08,0x08, /* H - O */
				153	0x08,0x08,0x08,0x08,0x08,0x08,0x08,0x08, /* P - W */
				154	0x08,0x08,0x08,0x80,0x00,0x00,0x80,0x08, /* X - _ */
				155	0x00,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x08, /* ` - g */
				156	0x08,0x08,0x08,0x08,0x08,0x08,0x08,0x08, /* h - o */
				157	0x08,0x08,0x08,0x08,0x08,0x08,0x08,0x08, /* p - w */
				158	0x08,0x08,0x08,0x80,0x80,0x00,0x00,0x00, /* x -127 */
				159	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
				160	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
				161	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
				162	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
				163	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
				164	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
				165	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
				166	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
				167	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
				168	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
				169	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
				170	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
				171	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
				172	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
				173	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
				174	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
				175
				176	/* End of pcre-chartables.c */
				177	/*************************************************
				178	* Perl-Compatible Regular Expressions *
				179	*************************************************/
				180
				181	/*
				182	This is a library of functions to support regular expressions whose syntax
				183	and semantics are as close as possible to those of the Perl 5 language.
				184
				185	Written by: Philip Hazel <ph10@cam.ac.uk>
				186
				187	Copyright (c) 1997 University of Cambridge
				188
				189	-----------------------------------------------------------------------------
				190	Permission is granted to anyone to use this software for any purpose on any
				191	computer system, and to redistribute it freely, subject to the following
				192	restrictions:
				193
				194	1. This software is distributed in the hope that it will be useful,
				195	but WITHOUT ANY WARRANTY; without even the implied warranty of
				196	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
				197
				198	2. The origin of this software must not be misrepresented, either by
				199	explicit claim or by omission.
				200
				201	3. Altered versions must be plainly marked as such, and must not be
				202	misrepresented as being the original software.
				203	-----------------------------------------------------------------------------
				204
				205	See the file Tech.Notes for some information on the internals.
				206	*/
				207
				208	/* This module contains the actual definition of global variables that are
				209	shared between the different modules. In fact, these are limited to the
				210	indirections for memory management functions. */
				211
				212	/* Include the internals header, which itself includes Standard C headers plus
				213	the external pcre header. */
				214
				215
				216	/* Store get and free functions. */
				217
				218	void (pcre_malloc)(size_t) = malloc;
				219	void (pcre_free)(void ) = free;
				220
				221	/* End of pcre-globals.c */
				222	/*************************************************
				223	* Perl-Compatible Regular Expressions *
				224	*************************************************/
				225
				226	/*
				227	This is a library of functions to support regular expressions whose syntax
				228	and semantics are as close as possible to those of the Perl 5 language. See
				229	the file Tech.Notes for some information on the internals.
				230
				231	Written by: Philip Hazel <ph10@cam.ac.uk>
				232
				233	Copyright (c) 1997 University of Cambridge
				234
				235	-----------------------------------------------------------------------------
				236	Permission is granted to anyone to use this software for any purpose on any
				237	computer system, and to redistribute it freely, subject to the following
				238	restrictions:
				239
				240	1. This software is distributed in the hope that it will be useful,
				241	but WITHOUT ANY WARRANTY; without even the implied warranty of
				242	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
				243
				244	2. The origin of this software must not be misrepresented, either by
				245	explicit claim or by omission.
				246
				247	3. Altered versions must be plainly marked as such, and must not be
				248	misrepresented as being the original software.
				249	-----------------------------------------------------------------------------
				250	*/
				251
				252
				253	/* Include the internals header, which itself includes Standard C headers plus
				254	the external pcre header. */
				255
				256
				257
				258	/* Character types for class type bits */
				259
				260	static char class_types[] = { ctype_digit, ctype_space, ctype_word };
				261
				262
				263
				264	/*************************************************
				265	* Set a range of bits in the map *
				266	*************************************************/
				267
				268	/* This function is called for character types.
				269
				270	Arguments:
				271	start_bits points to the bit map
				272	type a character type bit
				273	include TRUE to include the type;
				274	FALSE to include all but the type
				275
				276	Returns: nothing
				277	*/
				278
				279	static void
				280	set_type_bits(uschar *start_bits, int type, BOOL include)
				281	{
				282	int i;
				283	for (i = 0; i < 256; i++)
				284	if (((pcre_ctypes[i] & type) != 0) == include) start_bits[i/8] \|= (1<<(i%8));
				285	}
				286
				287
				288
				289	/*************************************************
				290	* Set one bit in the map *
				291	*************************************************/
				292
				293	/* This function is called to set a bit in the map for a given character,
				294	or both cases of a letter if caseless. It could be replaced by a macro if
				295	better performance is wanted.
				296
				297	Arguments:
				298	start_bits points to 32-byte table
				299	c the character
				300	caseless TRUE if caseless
				301
				302	Returns: nothing
				303	*/
				304
				305	static void
				306	set_bit(uschar *start_bits, int c, BOOL caseless)
				307	{
				308	if (caseless)
				309	{
				310	int d = pcre_ucc[c];
				311	start_bits[d/8] \|= (1<<(d%8));
				312	c = pcre_lcc[c];
				313	}
				314	start_bits[c/8] \|= (1<<(c%8));
				315	}
				316
				317
				318
				319	/*************************************************
				320	* Create bitmap of starting chars *
				321	*************************************************/
				322
				323	/* This function scans a compiled unanchored expression and attempts to build a
				324	bitmap of the set of initial characters. If it can't, it returns FALSE. As time
				325	goes by, we may be able to get more clever at doing this.
				326
				327	Arguments:
				328	code points to an expression
				329	start_bits points to a 32-byte table, initialized to 0
				330	caseless TRUE if caseless
				331
				332	Returns: TRUE if table built, FALSE otherwise
				333	*/
				334
				335	static BOOL
				336	set_start_bits(uschar code, uschar start_bits, BOOL caseless)
				337	{
				338	do
				339	{
				340	uschar *tcode = code + 3;
				341	BOOL try_next = TRUE;
				342
				343	while (try_next)
				344	{
				345	try_next = FALSE;
				346
				347	if ((int)tcode >= OP_BRA \|\| tcode == OP_ASSERT)
				348	{
				349	if (!set_start_bits(tcode, start_bits, caseless)) return FALSE;
				350	}
				351
				352	else switch(*tcode)
				353	{
				354	default:
				355	return FALSE;
				356
				357	/* BRAZERO does the bracket, but carries on. */
				358
				359	case OP_BRAZERO:
				360	case OP_BRAMINZERO:
				361	if (!set_start_bits(++tcode, start_bits, caseless)) return FALSE;
				362	do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
				363	tcode += 3;
				364	try_next = TRUE;
				365	break;
				366
				367	/* Single-char * or ? sets the bit and tries the next item */
				368
				369	case OP_STAR:
				370	case OP_MINSTAR:
				371	case OP_QUERY:
				372	case OP_MINQUERY:
				373	set_bit(start_bits, tcode[1], caseless);
				374	tcode += 2;
				375	try_next = TRUE;
				376	break;
				377
				378	/* Single-char upto sets the bit and tries the next */
				379
				380	case OP_UPTO:
				381	case OP_MINUPTO:
				382	set_bit(start_bits, tcode[3], caseless);
				383	tcode += 4;
				384	try_next = TRUE;
				385	break;
				386
				387	/* At least one single char sets the bit and stops */
				388
				389	case OP_EXACT: /* Fall through */
				390	tcode++;
				391
				392	case OP_CHARS: /* Fall through */
				393	tcode++;
				394
				395	case OP_PLUS:
				396	case OP_MINPLUS:
				397	set_bit(start_bits, tcode[1], caseless);
				398	break;
				399
				400	/* Single character type sets the bits and stops */
				401
				402	case OP_NOT_DIGIT:
				403	set_type_bits(start_bits, ctype_digit, FALSE);
				404	break;
				405
				406	case OP_DIGIT:
				407	set_type_bits(start_bits, ctype_digit, TRUE);
				408	break;
				409
				410	case OP_NOT_WHITESPACE:
				411	set_type_bits(start_bits, ctype_space, FALSE);
				412	break;
				413
				414	case OP_WHITESPACE:
				415	set_type_bits(start_bits, ctype_space, TRUE);
				416	break;
				417
				418	case OP_NOT_WORDCHAR:
				419	set_type_bits(start_bits, ctype_word, FALSE);
				420	break;
				421
				422	case OP_WORDCHAR:
				423	set_type_bits(start_bits, ctype_word, TRUE);
				424	break;
				425
				426	/* One or more character type fudges the pointer and restarts, knowing
				427	it will hit a single character type and stop there. */
				428
				429	case OP_TYPEPLUS:
				430	case OP_TYPEMINPLUS:
				431	tcode++;
				432	try_next = TRUE;
				433	break;
				434
				435	case OP_TYPEEXACT:
				436	tcode += 3;
				437	try_next = TRUE;
				438	break;
				439
				440	/* Zero or more repeats of character types set the bits and then
				441	try again. */
				442
				443	case OP_TYPEUPTO:
				444	case OP_TYPEMINUPTO:
				445	tcode += 2; /* Fall through */
				446
				447	case OP_TYPESTAR:
				448	case OP_TYPEMINSTAR:
				449	case OP_TYPEQUERY:
				450	case OP_TYPEMINQUERY:
				451	switch(tcode[1])
				452	{
				453	case OP_NOT_DIGIT:
				454	set_type_bits(start_bits, ctype_digit, FALSE);
				455	break;
				456
				457	case OP_DIGIT:
				458	set_type_bits(start_bits, ctype_digit, TRUE);
				459	break;
				460
				461	case OP_NOT_WHITESPACE:
				462	set_type_bits(start_bits, ctype_space, FALSE);
				463	break;
				464
				465	case OP_WHITESPACE:
				466	set_type_bits(start_bits, ctype_space, TRUE);
				467	break;
				468
				469	case OP_NOT_WORDCHAR:
				470	set_type_bits(start_bits, ctype_word, FALSE);
				471	break;
				472
				473	case OP_WORDCHAR:
				474	set_type_bits(start_bits, ctype_word, TRUE);
				475	break;
				476	}
				477
				478	tcode += 2;
				479	try_next = TRUE;
				480	break;
				481
				482	/* Character class: set the bits and either carry on or not,
				483	according to the repeat count. */
				484
				485	case OP_CLASS:
				486	case OP_NEGCLASS:
				487	{
				488	uschar *base = tcode;
				489	uschar data, end;
				490	uschar *bitmap = start_bits;
				491	uschar local[32];
				492	int flags = base[1];
				493	int i;
				494
				495	tcode += 4 + 2 * tcode[2] + tcode[3]; /* Advance past the item */
				496	switch (*tcode)
				497	{
				498	case OP_CRSTAR:
				499	case OP_CRMINSTAR:
				500	case OP_CRQUERY:
				501	case OP_CRMINQUERY:
				502	tcode++;
				503	try_next = TRUE;
				504	break;
				505
				506	case OP_CRRANGE:
				507	case OP_CRMINRANGE:
				508	if (((tcode[1] << 8) + tcode[2]) == 0)
				509	{
				510	tcode += 5;
				511	try_next = TRUE;
				512	}
				513	break;
				514	}
				515
				516	/* For a negated class, we have to build a separate table of all
				517	the bits in the class, and then turn all other bits on in the main
				518	table. Otherwise there are problems with things like [^\da]. */
				519
				520	if (*base == OP_NEGCLASS)
				521	{
				522	memset(local, 0, 32);
				523	bitmap = local;
				524	}
				525
				526	/* Character types */
				527
				528	for (i = 0; flags != 0; i++)
				529	{
				530	if ((flags & 1) != 0)
				531	set_type_bits(bitmap, class_types[i/2], (i & 1) == 0);
				532	flags >>= 1;
				533	}
				534
				535	/* Ranges and individual characters */
				536
				537	data = base + 4;
				538	end = data + base[2] * 2;
				539	while (data < end)
				540	{
				541	for (i = *data; i <= data[1]; i++) set_bit(bitmap, i, caseless);
				542	data += 2;
				543	}
				544
				545	end += base[3];
				546	while (data < end) set_bit(bitmap, *data++, caseless);
				547
				548	/* If a negated class, transfer data from local map to the main one */
				549
				550	if (bitmap != start_bits)
				551	for (i = 0; i < 32; i++) start_bits[i] \|= ~local[i];
				552	}
				553	break; /* End of class handling */
				554
				555	} /* End of switch */
				556	} /* End of try_next loop */
				557
				558	code += (code[1] << 8) + code[2]; /* Advance to next branch */
				559	}
				560	while (*code == OP_ALT);
				561	return TRUE;
				562	}
				563
				564
				565
				566	/*************************************************
				567	* Study a compiled expression *
				568	*************************************************/
				569
				570	/* This function is handed a compiled expression that it must study to produce
				571	information that will speed up the matching. It returns a pcre_extra block
				572	which then gets handed back to pcre_exec().
				573
				574	Arguments:
				575	re points to the compiled expression
				576	options contains option bits
				577	errorptr points to where to place error messages;
				578	set NULL unless error
				579
				580	Returns: pointer to a pcre_extra block,
				581	NULL on error or if no optimization possible
				582	*/
				583
				584	pcre_extra *
				585	pcre_study(pcre external_re, int options, char *errorptr)
				586	{
				587	BOOL caseless;
				588	uschar start_bits[32];
				589	real_pcre_extra *extra;
				590	real_pcre re = (real_pcre )external_re;
				591
				592	*errorptr = NULL;
				593
				594	if (re == NULL \|\| re->magic_number != MAGIC_NUMBER)
				595	{
				596	*errorptr = "argument is not a compiled regular expression";
				597	return NULL;
				598	}
				599
				600	if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
				601	{
				602	*errorptr = "unknown or incorrect option bit(s) set";
				603	return NULL;
				604	}
				605
				606	/* For an anchored pattern, or an unchored pattern that has a first char, or a
				607	multiline pattern that matches only at "line starts", no further processing at
				608	present. */
				609
				610	if ((re->options & (PCRE_ANCHORED\|PCRE_FIRSTSET\|PCRE_STARTLINE)) != 0)
				611	return NULL;
				612
				613	/* Determine the caseless state from the compiled pattern and the current
				614	options. */
				615
				616	caseless = ((re->options \| options) & PCRE_CASELESS) != 0;
				617
				618	/* See if we can find a fixed set of initial characters for the pattern. */
				619
				620	memset(start_bits, 0, 32);
				621	if (!set_start_bits(re->code, start_bits, caseless)) return NULL;
				622
				623	/* Get an "extra" block and put the information therein. */
				624
				625	extra = (real_pcre_extra *)(pcre_malloc)(sizeof(real_pcre_extra));
				626
				627	if (extra == NULL)
				628	{
				629	*errorptr = "failed to get memory";
				630	return NULL;
				631	}
				632	extra->options = PCRE_STUDY_MAPPED \| (caseless? PCRE_STUDY_CASELESS : 0);
				633	memcpy(extra->start_bits, start_bits, 32);
				634
				635	return (pcre_extra *)extra;
				636	}
				637
				638	/* End of pcre-study.c */
				639	/*************************************************
				640	* Perl-Compatible Regular Expressions *
				641	*************************************************/
				642
				643	/*
				644	This is a library of functions to support regular expressions whose syntax
				645	and semantics are as close as possible to those of the Perl 5 language. See
				646	the file Tech.Notes for some information on the internals.
				647
				648	Written by: Philip Hazel <ph10@cam.ac.uk>
				649
				650	Copyright (c) 1997 University of Cambridge
				651
				652	-----------------------------------------------------------------------------
				653	Permission is granted to anyone to use this software for any purpose on any
				654	computer system, and to redistribute it freely, subject to the following
				655	restrictions:
				656
				657	1. This software is distributed in the hope that it will be useful,
				658	but WITHOUT ANY WARRANTY; without even the implied warranty of
				659	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
				660
				661	2. The origin of this software must not be misrepresented, either by
				662	explicit claim or by omission.
				663
				664	3. Altered versions must be plainly marked as such, and must not be
				665	misrepresented as being the original software.
				666	-----------------------------------------------------------------------------
				667	*/
				668
				669
				670	/* Define DEBUG to get debugging output on stdout. */
				671
				672	/* #define DEBUG */
				673
				674
				675	/* Include the internals header, which itself includes Standard C headers plus
				676	the external pcre header. */
				677
				678
				679	#ifndef Py_eval_input
				680	/* For Python 1.4, graminit.h has to be explicitly included */
				681	#define Py_eval_input eval_input
				682	#endif
				683
				684	/* Min and max values for the common repeats; for the maxima, 0 => infinity */
				685
				686	static char rep_min[] = { 0, 0, 1, 1, 0, 0 };
				687	static char rep_max[] = { 0, 0, 0, 0, 1, 1 };
				688
				689	/* Text forms of OP_ values and things, for debugging */
				690
				691	#ifdef DEBUG
				692	static char *OP_names[] = { "End", "\\A", "\\B", "\\b", "\\D", "\\d",
				693	"\\S", "\\s", "\\W", "\\w", "\\Z", "^", "$", "Any", "chars",
				694	"", "?", "+", "+?", "?", "??", "{", "{", "{",
				695	"", "?", "+", "+?", "?", "??", "{", "{", "{",
				696	"", "?", "+", "+?", "?", "??", "{", "{",
				697	"class", "negclass", "Ref",
				698	"Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",
				699	"Brazero", "Braminzero", "Bra"
				700	};
				701
				702	static char *class_names[] = { "\\d", "\\D", "\\s", "\\S", "\\w", "\\W" };
				703	#endif
				704
				705	/* Table of character type operators that correspond to the bits in the
				706	character class flags, starting at the least significant end. */
				707
				708	static char class_ops[] = {
				709	OP_DIGIT, OP_NOT_DIGIT,
				710	OP_WHITESPACE, OP_NOT_WHITESPACE,
				711	OP_WORDCHAR, OP_NOT_WORDCHAR };
				712
				713	/* Table for handling escaped characters in the range '0'-'z'. Positive returns
				714	are simple data values; negative values are for special things like \d and so
				715	on. Zero means further processing is needed (for things like \x), or the escape
				716	is invalid. */
				717
				718	/* PYTHON: Python doesn't support \e, but does support \v */
				719
				720	static short int escapes[] = {
				721	0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
				722	0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
				723	'@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
				724	0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
				725	0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
				726	0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
				727	'`', 7, -ESC_b, 0, -ESC_d, 0, '\f', 0, /* ` - g */
				728	0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
				729	0, 0, '\r', -ESC_s, '\t', 0, '\v', -ESC_w, /* p - w */
				730	0, 0, 0 /* x - z */
				731	};
				732
				733
				734	/* Definition to allow mutual recursion */
				735
				736	static BOOL compile_regexp(BOOL, int , uschar , uschar *,
				737	char *, PyObject );
				738
				739	/* Structure for passing "static" information around between the functions
				740	doing the matching, so that they are thread-safe. */
				741
				742	typedef struct match_data {
				743	int errorcode; /* As it says */
				744	int offset_vector; / Offset vector */
				745	int offset_end; /* One past the end */
				746	BOOL offset_overflow; /* Set if too many extractions */
				747	BOOL caseless; /* Case-independent flag */
				748	BOOL multiline; /* Multiline flag */
				749	uschar start_subject; / Start of the subject string */
				750	uschar end_subject; / End of the subject string */
				751
				752	uschar end_match_ptr; / Subject position at end match */
				753	int end_offset_top; /* Highwater mark at end of match */
				754	BOOL dotall; /* Dotall flag */
				755	int length; /* Length of the allocated stacks */
				756	int point; /* Point to add next item pushed onto stacks */
				757	/* Pointers to the 6 stacks */
				758	int off_num, offset_top, r1, r2;
				759	uschar eptr, ecode;
				760	} match_data;
				761
				762
				763
				764	/*************************************************
				765	* Return version string *
				766	*************************************************/
				767
				768	char *
				769	pcre_version(void)
				770	{
				771	return PCRE_VERSION;
				772	}
				773
				774
				775
				776
				777	/*************************************************
				778	* Return info about a compiled pattern *
				779	*************************************************/
				780
				781	/* This function picks potentially useful data out of the private
				782	structure.
				783
				784	Arguments:
				785	external_re points to compiled code
				786	optptr where to pass back the options
				787	first_char where to pass back the first character,
				788	or -1 if multiline and all branches start ^,
				789	or -2 otherwise
				790
				791	Returns: number of identifying extraction brackets
				792	or negative values on error
				793	*/
				794
				795	int
				796	pcre_info(pcre external_re, int optptr, int *first_char)
				797	{
				798	real_pcre re = (real_pcre )external_re;
				799	if (re == NULL) return PCRE_ERROR_NULL;
				800	if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
				801	if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);
				802	if (first_char != NULL)
				803	*first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
				804	((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
				805	return re->top_bracket;
				806	}
				807
				808
				809
				810
				811	#ifdef DEBUG
				812	/*************************************************
				813	* Debugging function to print chars *
				814	*************************************************/
				815
				816	/* Print a sequence of chars in printable format, stopping at the end of the
				817	subject if the requested.
				818
				819	Arguments:
				820	p points to characters
				821	length number to print
				822	is_subject TRUE if printing from within md->start_subject
				823	md pointer to matching data block, if is_subject is TRUE
				824
				825	Returns: nothing
				826	*/
				827
				828	static pchars(uschar p, int length, BOOL is_subject, match_data md)
				829	{
				830	int c;
				831	if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
				832	while (length-- > 0)
				833	if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
				834	}
				835	#endif
				836
				837
				838
				839
				840	/*************************************************
				841	* Check subpattern for empty operand *
				842	*************************************************/
				843
				844	/* This function checks a bracketed subpattern to see if any of the paths
				845	through it could match an empty string. This is used to diagnose an error if
				846	such a subpattern is followed by a quantifier with an unlimited upper bound.
				847
				848	Argument:
				849	code points to the opening bracket
				850
				851	Returns: TRUE or FALSE
				852	*/
				853
				854	static BOOL
				855	could_be_empty(uschar *code)
				856	{
				857	do {
				858	uschar *cc = code + 3;
				859
				860	/* Scan along the opcodes for this branch; as soon as we find something
				861	that matches a non-empty string, break out and advance to test the next
				862	branch. If we get to the end of the branch, return TRUE for the whole
				863	sub-expression. */
				864
				865	for (;;)
				866	{
				867	/* Test an embedded subpattern; if it could not be empty, break the
				868	loop. Otherwise carry on in the branch. */
				869
				870	if ((int)(*cc) >= OP_BRA)
				871	{
				872	if (!could_be_empty(cc)) break;
				873	do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
				874	cc += 3;
				875	}
				876
				877	else switch (*cc)
				878	{
				879	/* Reached end of a branch: the subpattern may match the empty string */
				880
				881	case OP_ALT:
				882	case OP_KET:
				883	case OP_KETRMAX:
				884	case OP_KETRMIN:
				885	return TRUE;
				886
				887	/* Skip over assertive subpatterns */
				888
				889	case OP_ASSERT:
				890	case OP_ASSERT_NOT:
				891	do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
				892	cc += 3;
				893	break;
				894
				895	/* Skip over things that don't match chars */
				896
				897	case OP_SOD:
				898	case OP_EOD:
				899	case OP_CIRC:
				900	case OP_DOLL:
				901	case OP_BRAZERO:
				902	case OP_BRAMINZERO:
				903	case OP_NOT_WORD_BOUNDARY:
				904	case OP_WORD_BOUNDARY:
				905	cc++;
				906	break;
				907
				908	/* Skip over simple repeats with zero lower bound */
				909
				910	case OP_STAR:
				911	case OP_MINSTAR:
				912	case OP_QUERY:
				913	case OP_MINQUERY:
				914	case OP_TYPESTAR:
				915	case OP_TYPEMINSTAR:
				916	case OP_TYPEQUERY:
				917	case OP_TYPEMINQUERY:
				918	cc += 2;
				919	break;
				920
				921	/* Skip over UPTOs (lower bound is zero) */
				922
				923	case OP_UPTO:
				924	case OP_MINUPTO:
				925	case OP_TYPEUPTO:
				926	case OP_TYPEMINUPTO:
				927	cc += 4;
				928	break;
				929
				930	/* Check a class or a back reference for a zero minimum */
				931
				932	case OP_CLASS:
				933	case OP_NEGCLASS:
				934	case OP_REF:
				935	cc += (cc == OP_REF)? 2 : 4 + 2 cc[2] + cc[3];
				936
				937	switch (*cc)
				938	{
				939	case OP_CRSTAR:
				940	case OP_CRMINSTAR:
				941	case OP_CRQUERY:
				942	case OP_CRMINQUERY:
				943	cc++;
				944	break;
				945
				946	case OP_CRRANGE:
				947	case OP_CRMINRANGE:
				948	if ((cc[1] << 8) + cc[2] != 0) goto NEXT_BRANCH;
				949	cc += 3;
				950	break;
				951
				952	default:
				953	goto NEXT_BRANCH;
				954	}
				955	break;
				956
				957	/* Anything else matches at least one character */
				958
				959	default:
				960	goto NEXT_BRANCH;
				961	}
				962	}
				963
				964	NEXT_BRANCH:
				965	code += (code[1] << 8) + code[2];
				966	}
				967	while (*code == OP_ALT);
				968
				969	/* No branches match the empty string */
				970
				971	return FALSE;
				972	}
				973
				974
				975	/* Determine the length of a group ID in an expression like
				976	(?P<foo_123>...)
				977	Arguments:
				978	ptr pattern position pointer (say that 3 times fast)
				979	finalchar the character that will mark the end of the ID
				980	errorptr points to the pointer to the error message
				981	*/
				982
				983	static int
				984	get_group_id(uschar ptr, char finalchar, char *errorptr)
				985	{
				986	uschar *start = ptr;
				987
				988	/* If the first character is not in \w, or is in \w but is a digit,
				989	report an error */
				990	if (!(pcre_ctypes[*ptr] & ctype_word) \|\|
				991	(pcre_ctypes[*ptr++] & ctype_digit))
				992	{
				993	*errorptr = "(?P identifier must start with a letter or underscore";
				994	return 0;
				995	}
				996
				997	/* Increment ptr until we either hit a null byte, the desired
				998	final character, or a non-word character */
				999	for(; (ptr != 0) && (ptr != finalchar) &&
				1000	(pcre_ctypes[*ptr] & ctype_word); ptr++)
				1001	{
Guido van Rossum	c386107	1997-10-08 02:07:40 +0000	[diff] [blame]	1002	/* Empty loop body */
Guido van Rossum	51b3aa3	1997-10-06 14:43:11 +0000	[diff] [blame]	1003	}
				1004	if (*ptr==finalchar)
				1005	return ptr-start;
				1006	if (*ptr==0)
				1007	{
				1008	*errorptr = "unterminated (?P identifier";
				1009	return 0;
				1010	}
				1011	*errorptr = "illegal character in (?P identifier";
				1012	return 0;
				1013	}
				1014
				1015	/*************************************************
				1016	* Handle escapes *
				1017	*************************************************/
				1018
				1019	/* This function is called when a \ has been encountered. It either returns a
				1020	positive value for a simple escape such as \n, or a negative value which
				1021	encodes one of the more complicated things such as \d. On entry, ptr is
				1022	pointing at the \. On exit, it is on the final character of the escape
				1023	sequence.
				1024
				1025	Arguments:
				1026	ptrptr points to the pattern position pointer
				1027	errorptr points to the pointer to the error message
				1028
				1029	Returns: zero or positive => a data character
				1030	negative => a special escape sequence
				1031	on error, errorptr is set
				1032	*/
				1033
				1034	static int
				1035	check_escape(uschar ptrptr, char errorptr)
				1036	{
				1037	uschar ptr = ptrptr;
				1038	int c = (++ptr) & 255; / Ensure > 0 on signed-char systems */
				1039	int i;
				1040
				1041	if (c == 0) *errorptr = "\\ at end of pattern";
				1042
				1043	/* Digits or letters may have special meaning; all others are literals. */
				1044
				1045	else if (c < '0' \|\| c > 'z') {}
				1046
				1047	/* Do an initial lookup in a table. A non-zero result is something that can be
				1048	returned immediately. Otherwise further processing may be required. */
				1049
				1050	else if ((i = escapes[c - '0']) != 0) c = i;
				1051
				1052	/* Escapes that need further processing, or are illegal. */
				1053
				1054	else switch (c)
				1055	{
				1056	case '0':
				1057	c = 0;
				1058	while(i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_odigit) != 0 )
				1059	c = c * 8 + *(++ptr) - '0';
				1060	break;
				1061
				1062	case '1': case '2': case '3': case '4': case '5':
				1063	case '6': case '7': case '8': case '9':
				1064	{
				1065	/* PYTHON: Try to compute an octal value for a character */
				1066	for(c=0, i=0; c!=-1 && ptr[i]!=0 && i<3; i++)
				1067	{
				1068	if (( pcre_ctypes[ ptr[i] ] & ctype_odigit) != 0)
				1069	c = c * 8 + ptr[i]-'0';
				1070	else
				1071	c = -1; /* Non-octal character */
				1072	}
				1073	/* Aha! There were 3 octal digits, so it must be a character */
				1074	if (c != -1 && i == 3)
				1075	{
				1076	ptr += i-1;
				1077	break;
				1078	}
				1079	c = ptr[0]; /* Restore the first character after the \ */
				1080	c -= '0'; i = 1;
				1081	while (i<2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0)
				1082	{
				1083	c = c * 10 + ptr[1] - '0';
				1084	ptr++; i++;
				1085	}
				1086	if (c > 255 - ESC_REF) *errorptr = "back reference too big";
				1087	c = -(ESC_REF + c);
				1088	}
				1089	break;
				1090
				1091	case 'x':
				1092	{
Guido van Rossum	c386107	1997-10-08 02:07:40 +0000	[diff] [blame]	1093	int length;
Guido van Rossum	51b3aa3	1997-10-06 14:43:11 +0000	[diff] [blame]	1094	char *string;
Guido van Rossum	c386107	1997-10-08 02:07:40 +0000	[diff] [blame]	1095	PyObject *result;
Guido van Rossum	51b3aa3	1997-10-06 14:43:11 +0000	[diff] [blame]	1096
				1097	i=1;
				1098	while (ptr[i]!=0 &&
				1099	( pcre_ctypes[ptr[i]] & ctype_xdigit) != 0)
				1100	i++;
				1101	if (i==1)
				1102	{
				1103	*errorptr="\\x must be followed by hex digits";
				1104	break;
				1105	}
				1106	length=i-1;
				1107	string=malloc(length+4+1);
				1108	if (string==NULL)
				1109	{
				1110	*errorptr="can't allocate memory for \\x string";
				1111	break;
				1112	}
				1113	/* Create a string containing "\x<hexdigits>", which will be
				1114	passed to eval() */
				1115	string[0]=string[length+3]='"';
				1116	string[1]='\\';
				1117	string[length+4]='\0';
				1118	memcpy(string+2, ptr, length+1);
				1119	ptr += length;
Guido van Rossum	c386107	1997-10-08 02:07:40 +0000	[diff] [blame]	1120	result=PyRun_String((char *)string, Py_eval_input,
				1121	PyEval_GetGlobals(), PyEval_GetLocals());
Guido van Rossum	51b3aa3	1997-10-06 14:43:11 +0000	[diff] [blame]	1122	free(string);
				1123	/* The evaluation raised an exception */
Guido van Rossum	c386107	1997-10-08 02:07:40 +0000	[diff] [blame]	1124	if (result==NULL)
Guido van Rossum	51b3aa3	1997-10-06 14:43:11 +0000	[diff] [blame]	1125	{
				1126	*errorptr="exception occurred during evaluation of \\x";
				1127	break;
				1128	}
Guido van Rossum	c386107	1997-10-08 02:07:40 +0000	[diff] [blame]	1129	if (PyString_Size(result)!=1)
Guido van Rossum	51b3aa3	1997-10-06 14:43:11 +0000	[diff] [blame]	1130	{
Guido van Rossum	c386107	1997-10-08 02:07:40 +0000	[diff] [blame]	1131	Py_DECREF(result);
Guido van Rossum	51b3aa3	1997-10-06 14:43:11 +0000	[diff] [blame]	1132	*errorptr="\\x string is not one byte in length";
				1133	break;
				1134	}
Guido van Rossum	c386107	1997-10-08 02:07:40 +0000	[diff] [blame]	1135	c=(unsigned char )PyString_AsString(result);
				1136	Py_DECREF(result);
Guido van Rossum	51b3aa3	1997-10-06 14:43:11 +0000	[diff] [blame]	1137	break;
				1138	}
				1139	break;
				1140
				1141
				1142	case 'l':
				1143	case 'L':
				1144	case 'u':
				1145	case 'U':
				1146	case 'Q':
				1147	case 'E':
				1148	*errorptr = "the Perl escapes \\u, \\U, \\l, \\L, \\Q, \\E are not valid";
				1149	break;
				1150
				1151	default:
				1152	/* In Python, an unrecognized escape will simply return the character
				1153	after the backslash, so do nothing */
				1154	break;
				1155	}
				1156
				1157	*ptrptr = ptr;
				1158	return c;
				1159	}
				1160
				1161
				1162
				1163	/*************************************************
				1164	* Read repeat counts *
				1165	*************************************************/
				1166
				1167	/* Read an item of the form {n,m} and return the values.
				1168
				1169	Arguments:
				1170	p pointer to first char after '{'
				1171	minp pointer to int for min
				1172	maxp pointer to int for max
				1173	returned as -1 if no max
				1174	errorptr points to pointer to error message
				1175
				1176	Returns: pointer to '}' on success;
				1177	current ptr on error, with errorptr set
				1178	*/
				1179
				1180	static uschar *
				1181	read_repeat_counts(uschar p, int minp, int maxp, char *errorptr)
				1182	{
				1183	int min = 0;
				1184	int max = -1;
				1185
				1186	if ((pcre_ctypes[*p] & ctype_digit) == 0)
				1187	{
				1188	*errorptr = "number expected after {";
				1189	return p;
				1190	}
				1191
				1192	while ((pcre_ctypes[p] & ctype_digit) != 0) min = min 10 + *p++ - '0';
				1193
				1194	if (*p == '}') max = min; else
				1195	{
				1196	if (*p++ != ',')
				1197	{
				1198	*errorptr = "comma expected";
				1199	return p-1;
				1200	}
				1201	if (*p != '}')
				1202	{
				1203	max = 0;
				1204	while((pcre_ctypes[p] & ctype_digit) != 0) max = max 10 + *p++ - '0';
				1205	if (*p != '}')
				1206	{
				1207	*errorptr = "} expected";
				1208	return p;
				1209	}
				1210	if (max < min)
				1211	{
				1212	*errorptr = "numbers out of order";
				1213	return p;
				1214	}
				1215	}
				1216	}
				1217
				1218	/* Do paranoid checks, then fill in the required variables, and pass back the
				1219	pointer to the terminating '}'. */
				1220
				1221	if (max == 0) *errorptr = "zero maximum not allowed";
				1222	else if (min > 65535 \|\| max > 65535) *errorptr = "number too big";
				1223	else
				1224	{
				1225	*minp = min;
				1226	*maxp = max;
				1227	}
				1228	return p;
				1229	}
				1230
				1231
				1232
				1233	/*************************************************
				1234	* Compile one branch *
				1235	*************************************************/
				1236
				1237	/* Scan the pattern, compiling it into the code vector.
				1238
				1239	Arguments:
				1240	extended TRUE if the PCRE_EXTENDED option was set
				1241	brackets points to 2-element bracket vector
				1242	code points to the pointer to the current code point
				1243	ptrptr points to the current pattern pointer
				1244	errorptr points to pointer to error message
				1245
				1246	Returns: TRUE on success
				1247	FALSE, with *errorptr set on error
				1248	*/
				1249
				1250	static BOOL
				1251	compile_branch(BOOL extended, int brackets, uschar *codeptr,
				1252	uschar ptrptr, char errorptr, PyObject *dictionary)
				1253	{
				1254	int repeat_type, op_type;
				1255	int repeat_min, repeat_max;
				1256	int bravalue, length;
				1257	register int c;
				1258	register uschar code = codeptr;
				1259	uschar ptr = ptrptr;
				1260	uschar *previous = NULL;
				1261	uschar *oldptr;
				1262
				1263	/* Switch on next character until the end of the branch */
				1264
				1265	for (;; ptr++)
				1266	{
				1267	c = *ptr;
				1268	if (extended)
				1269	{
				1270	if ((pcre_ctypes[c] & ctype_space) != 0) continue;
				1271	if (c == '#')
				1272	{
				1273	while ((c = *(++ptr)) != 0 && c != '\n');
				1274	continue;
				1275	}
				1276	}
				1277
				1278	switch(c)
				1279	{
				1280	/* The branch terminates at end of string, \|, or ). */
				1281
				1282	case 0:
				1283	case '\|':
				1284	case ')':
				1285	*codeptr = code;
				1286	*ptrptr = ptr;
				1287	return TRUE;
				1288
				1289	/* Handle single-character metacharacters */
				1290
				1291	case '^':
				1292	previous = NULL;
				1293	*code++ = OP_CIRC;
				1294	break;
				1295
				1296	case '$':
				1297	previous = NULL;
				1298	*code++ = OP_DOLL;
				1299	break;
				1300
				1301	case '.':
				1302	previous = code;
				1303	*code++ = OP_ANY;
				1304	break;
				1305
				1306	/* Character classes. We do quite a bit of munging around here. There are
				1307	always four initial bytes: the op_code, a flags byte for things like \d, a
				1308	count of pairs and a count of single characters. The pairs then follow, and
				1309	finally the single characters. */
				1310
				1311	case '[':
				1312	{
				1313	int rangecount = 0;
				1314	int flags = 0;
				1315	int singles_count = 0;
				1316	char singles[256];
				1317
				1318	previous = code;
				1319
				1320	/* If the first character is '^', set the negation flag */
				1321
				1322	if ((c = (++ptr)) == '^') { code = OP_NEGCLASS; c = *(++ptr); }
				1323	else *code = OP_CLASS;
				1324	code += 4;
				1325
				1326	/* Process characters until ] is reached. By writing this as a "do" it
				1327	means that an initial ] is taken as a data character. */
				1328
				1329	do
				1330	{
				1331	if (c == 0)
				1332	{
				1333	*errorptr = "] missing";
				1334	goto FAILED;
				1335	}
				1336
				1337	/*** Perl treats '-' here as a data character, so PCRE had better
				1338	do the same ... cut out this diagnosis.
				1339
				1340	if (c == '-')
				1341	{
				1342	*errorptr = "unexpected '-' in character class";
				1343	goto FAILED;
				1344	}
				1345	... ***/
				1346
				1347	/* Backslash may introduce a single character, or it may introduce one
				1348	of the specials, which just set a flag. Escaped items are checked for
				1349	validity in the pre-compiling pass. The sequence \b is a special case.
				1350	Inside a class (and only there) it is treated as backslash. Elsewhere
				1351	it marks a word boundary. */
				1352
				1353	if (c == '\\')
				1354	{
				1355	uschar *save_ptr = ptr+1;
				1356	c = check_escape(&ptr, errorptr);
				1357	if (c < 0)
				1358	{
				1359	switch (-c)
				1360	{
				1361	case ESC_d: flags \|= CLASS_DIGITS; continue;
				1362	case ESC_D: flags \|= CLASS_NOT_DIGITS; continue;
				1363	case ESC_s: flags \|= CLASS_WHITESPACE; continue;
				1364	case ESC_S: flags \|= CLASS_NOT_WHITESPACE; continue;
				1365	case ESC_w: flags \|= CLASS_WORD; continue;
				1366	case ESC_W: flags \|= CLASS_NOT_WORD; continue;
				1367	default:
				1368	ptr = save_ptr;
				1369	c = *ptr;
				1370	break;
				1371
				1372	case ESC_b: c = '\b'; /* Treat as single character */
				1373	break;
				1374	}
				1375	}
				1376	/* Fall through if single character */
				1377	}
				1378
				1379	/* A single character may be followed by '-' to form a range. However,
				1380	Perl does not permit ']' to be the end of the range. A '-' character
				1381	here is treated as a literal. */
				1382
				1383	if (ptr[1] == '-' && ptr[2] != ']')
				1384	{
				1385	int d;
				1386	ptr += 2;
				1387	d = *ptr;
				1388
				1389	if (d == 0)
				1390	{
				1391	*errorptr = "incomplete range";
				1392	goto FAILED;
				1393	}
				1394
				1395	/* The second part of a range can be a single-character escape, but
				1396	not any of the other escapes. */
				1397
				1398	if (d == '\\')
				1399	{
				1400	d = check_escape(&ptr, errorptr);
				1401	if (d < 0)
				1402	{
				1403	if (d == -ESC_b) d = '\b'; else
				1404	{
				1405	*errorptr = "invalid range";
				1406	goto FAILED;
				1407	}
				1408	}
				1409	}
				1410
				1411	if (d < c)
				1412	{
				1413	*errorptr = "range out of order";
				1414	goto FAILED;
				1415	}
				1416
				1417	if (rangecount >= 255)
				1418	{
				1419	*errorptr = "too many ranges inside []";
				1420	goto FAILED;
				1421	}
				1422
				1423	rangecount++;
				1424	*code++ = c;
				1425	*code++ = d;
				1426	continue;
				1427	}
				1428
				1429	/* Handle a lone single character: save it up for outputting at the
				1430	end. Be paranoid and check that the buffer isn't going to overflow. */
				1431
				1432	if (singles_count >= 255)
				1433	{
				1434	*errorptr = "too many characters inside []";
				1435	goto FAILED;
				1436	}
				1437	singles[singles_count++] = c;
				1438	}
				1439
				1440	/* Loop until ']' reached; the check for end of string happens inside the
				1441	loop. This "while" is the end of the "do" above. */
				1442
				1443	while ((c = *(++ptr)) != ']');
				1444
				1445	/* Copy saved single characters, which follow the ranges in the output. */
				1446
				1447	c = 0;
				1448	while (c < singles_count) *code++ = singles[c++];
				1449
				1450	/* Finally fill in the flags and counts of ranges and single characters,
				1451	and advance the pointer past the ]. */
				1452
				1453	previous[1] = flags;
				1454	previous[2] = rangecount;
				1455	previous[3] = singles_count;
				1456	}
				1457	break;
				1458
				1459	/* Various kinds of repeat */
				1460
				1461	case '{':
				1462	ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
				1463	if (*errorptr != NULL) goto FAILED;
				1464	goto REPEAT;
				1465
				1466	case '*':
				1467	repeat_min = 0;
				1468	repeat_max = -1;
				1469	goto REPEAT;
				1470
				1471	case '+':
				1472	repeat_min = 1;
				1473	repeat_max = -1;
				1474	goto REPEAT;
				1475
				1476	case '?':
				1477	repeat_min = 0;
				1478	repeat_max = 1;
				1479
				1480	REPEAT:
				1481	if (previous == NULL)
				1482	{
				1483	*errorptr = "nothing to repeat";
				1484	goto FAILED;
				1485	}
				1486
				1487	/* If the next character is '?' this is a minimizing repeat. Advance to the
				1488	next character. */
				1489
				1490	if (ptr[1] == '?') { repeat_type = 1; ptr++; } else repeat_type = 0;
				1491
				1492	/* If previous was a string of characters, chop off the last one and use it
				1493	as the subject of the repeat. If there was only one character, we can
				1494	abolish the previous item altogether. */
				1495
				1496	if (*previous == OP_CHARS)
				1497	{
				1498	int len = previous[1];
				1499	if (len == 1)
				1500	{
				1501	c = previous[2];
				1502	code = previous;
				1503	}
				1504	else
				1505	{
				1506	c = previous[len+1];
				1507	previous[1]--;
				1508	code--;
				1509	}
				1510	op_type = 0; /* Use single-char op codes */
				1511	goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
				1512	}
				1513
				1514	/* If previous was a character type match (\d or similar), abolish it and
				1515	create a suitable repeat item. The code is shared with single-character
				1516	repeats by adding a suitable offset into repeat_type. */
				1517
				1518	if ((int)previous < OP_EOD \|\| previous == OP_ANY)
				1519	{
				1520	op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
				1521	c = *previous;
				1522	code = previous;
				1523
				1524	OUTPUT_SINGLE_REPEAT:
				1525	repeat_type += op_type; /* Combine both values for many cases */
				1526
				1527	/* A minimum of zero is handled either as the special case * or ?, or as
				1528	an UPTO, with the maximum given. */
				1529
				1530	if (repeat_min == 0)
				1531	{
				1532	if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
				1533	else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
				1534	else
				1535	{
				1536	*code++ = OP_UPTO + repeat_type;
				1537	*code++ = repeat_max >> 8;
				1538	*code++ = (repeat_max & 255);
				1539	}
				1540	}
				1541
				1542	/* The case {1,} is handled as the special case + */
				1543
				1544	else if (repeat_min == 1 && repeat_max == -1)
				1545	*code++ = OP_PLUS + repeat_type;
				1546
				1547	/* The case {n,n} is just an EXACT, while the general case {n,m} is
				1548	handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
				1549
				1550	else
				1551	{
				1552	if (repeat_min != 1)
				1553	{
				1554	code++ = OP_EXACT + op_type; / NB EXACT doesn't have repeat_type */
				1555	*code++ = repeat_min >> 8;
				1556	*code++ = (repeat_min & 255);
				1557	}
				1558
				1559	/* If the mininum is 1 and the previous item was a character string,
				1560	we either have to put back the item that got cancelled if the string
				1561	length was 1, or add the character back onto the end of a longer
				1562	string. For a character type nothing need be done; it will just get put
				1563	back naturally. */
				1564
				1565	else if (*previous == OP_CHARS)
				1566	{
				1567	if (code == previous) code += 2; else previous[1]++;
				1568	}
				1569
				1570	/* Insert an UPTO if the max is greater than the min. */
				1571
				1572	if (repeat_max != repeat_min)
				1573	{
				1574	*code++ = c;
				1575	repeat_max -= repeat_min;
				1576	*code++ = OP_UPTO + repeat_type;
				1577	*code++ = repeat_max >> 8;
				1578	*code++ = (repeat_max & 255);
				1579	}
				1580	}
				1581
				1582	/* The character or character type itself comes last in all cases. */
				1583
				1584	*code++ = c;
				1585	}
				1586
				1587	/* If previous was a character class or a back reference, we put the repeat
				1588	stuff after it. */
				1589
				1590	else if (previous == OP_CLASS \|\| previous == OP_NEGCLASS \|\|
				1591	*previous == OP_REF)
				1592	{
				1593	if (repeat_min == 0 && repeat_max == -1)
				1594	*code++ = OP_CRSTAR + repeat_type;
				1595	else if (repeat_min == 1 && repeat_max == -1)
				1596	*code++ = OP_CRPLUS + repeat_type;
				1597	else if (repeat_min == 0 && repeat_max == 1)
				1598	*code++ = OP_CRQUERY + repeat_type;
				1599	else
				1600	{
				1601	*code++ = OP_CRRANGE + repeat_type;
				1602	*code++ = repeat_min >> 8;
				1603	*code++ = repeat_min & 255;
				1604	if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
				1605	*code++ = repeat_max >> 8;
				1606	*code++ = repeat_max & 255;
				1607	}
				1608	}
				1609
				1610	/* If previous was a bracket group, we may have to replicate it in certain
				1611	cases. If the maximum repeat count is unlimited, check that the bracket
				1612	group cannot match the empty string, and diagnose an error if it can. */
				1613
				1614	else if ((int)*previous >= OP_BRA)
				1615	{
				1616	int i;
				1617	int length = code - previous;
				1618
				1619	if (repeat_max == -1 && could_be_empty(previous))
				1620	{
				1621	*errorptr = "operand of unlimited repeat could match the empty string";
				1622	goto FAILED;
				1623	}
				1624
				1625	/* If the minimum is greater than zero, and the maximum is unlimited or
				1626	equal to the minimum, the first copy remains where it is, and is
				1627	replicated up to the minimum number of times. This case includes the +
				1628	repeat, but of course no replication is needed in that case. */
				1629
				1630	if (repeat_min > 0 && (repeat_max == -1 \|\| repeat_max == repeat_min))
				1631	{
				1632	for (i = 1; i < repeat_min; i++)
				1633	{
				1634	memcpy(code, previous, length);
				1635	code += length;
				1636	}
				1637	}
				1638
				1639	/* If the minimum is zero, stick BRAZERO in front of the first copy.
				1640	Then, if there is a fixed upper limit, replicated up to that many times,
				1641	sticking BRAZERO in front of all the optional ones. */
				1642
				1643	else
				1644	{
				1645	if (repeat_min == 0)
				1646	{
				1647	memmove(previous+1, previous, length);
				1648	code++;
				1649	*previous++ = OP_BRAZERO + repeat_type;
				1650	}
				1651
				1652	for (i = 1; i < repeat_min; i++)
				1653	{
				1654	memcpy(code, previous, length);
				1655	code += length;
				1656	}
				1657
				1658	for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)
				1659	{
				1660	*code++ = OP_BRAZERO + repeat_type;
				1661	memcpy(code, previous, length);
				1662	code += length;
				1663	}
				1664	}
				1665
				1666	/* If the maximum is unlimited, set a repeater in the final copy. */
				1667
				1668	if (repeat_max == -1) code[-3] = OP_KETRMAX + repeat_type;
				1669	}
				1670
				1671	/* Else there's some kind of shambles */
				1672
				1673	else
				1674	{
				1675	*errorptr = "internal error 1 (unexpected repeat)";
				1676	goto FAILED;
				1677	}
				1678
				1679	/* In all case we no longer have a previous item. */
				1680
				1681	previous = NULL;
				1682	break;
				1683
				1684
				1685	/* Start of nested bracket sub-expression, or comment or lookahead.
				1686	First deal with special things that can come after a bracket; all are
				1687	introduced by ?, and the appearance of any of them means that this is not a
				1688	referencing group. They were checked for validity in the first pass over
				1689	the string, so we don't have to check for syntax errors here. */
				1690
				1691	case '(':
				1692	previous = code; /* Only real brackets can be repeated */
				1693	if (*(++ptr) == '?')
				1694	{
				1695	bravalue = OP_BRA;
				1696
				1697	switch (*(++ptr))
				1698	{
				1699	case '#':
				1700	case 'i':
				1701	case 'm':
				1702	case 's':
				1703	case 'x':
				1704	ptr++;
				1705	while (*ptr != ')') ptr++;
				1706	previous = NULL;
				1707	continue;
				1708
				1709	case ':': /* Non-extracting bracket */
				1710	ptr++;
				1711	break;
				1712
				1713	case '=': /* Assertions can't be repeated */
				1714	bravalue = OP_ASSERT;
				1715	ptr++;
				1716	previous = NULL;
				1717	break;
				1718
				1719	case '!':
				1720	bravalue = OP_ASSERT_NOT;
				1721	ptr++;
				1722	previous = NULL;
				1723	break;
				1724
				1725	case ('P'):
				1726	ptr++;
				1727	if (*ptr=='<')
				1728	{
				1729	/* (?P<groupname>...) */
				1730	int idlen;
				1731	PyObject string, intobj;
				1732
				1733	ptr++;
				1734	idlen = get_group_id(ptr, '>', errorptr);
				1735	if (*errorptr) {
				1736	goto FAILED;
				1737	}
				1738	string = PyString_FromStringAndSize(ptr, idlen);
				1739	intobj = PyInt_FromLong( brackets[0] );
				1740	if (intobj == NULL \|\| string==NULL)
				1741	{
				1742	Py_XDECREF(string);
				1743	Py_XDECREF(intobj);
				1744	*errorptr = "exception raised";
				1745	goto FAILED;
				1746	}
				1747	PyDict_SetItem(dictionary, string, intobj);
				1748	Py_DECREF(string); Py_DECREF(intobj);
				1749	ptr += idlen+1; /* Point to rest of expression */
				1750	goto do_grouping_bracket;
				1751	}
				1752	if (*ptr=='=')
				1753	{
				1754	/* (?P=groupname) */
				1755	int idlen, refnum;
				1756	PyObject string, intobj;
				1757
				1758	ptr++;
				1759	idlen = get_group_id(ptr, ')', errorptr);
				1760	if (*errorptr) {
				1761	goto FAILED;
				1762	}
				1763	string = PyString_FromStringAndSize(ptr, idlen);
Guido van Rossum	c386107	1997-10-08 02:07:40 +0000	[diff] [blame]	1764	if (string==NULL) {
Guido van Rossum	51b3aa3	1997-10-06 14:43:11 +0000	[diff] [blame]	1765	Py_XDECREF(string);
				1766	*errorptr = "exception raised";
				1767	goto FAILED;
				1768	}
				1769	intobj = PyDict_GetItem(dictionary, string);
				1770	if (intobj==NULL) {
Guido van Rossum	c386107	1997-10-08 02:07:40 +0000	[diff] [blame]	1771	Py_DECREF(string);
Guido van Rossum	51b3aa3	1997-10-06 14:43:11 +0000	[diff] [blame]	1772	*errorptr = "?P= group identifier isn't defined";
				1773	goto FAILED;
				1774	}
				1775
				1776	refnum = PyInt_AsLong(intobj);
Guido van Rossum	c386107	1997-10-08 02:07:40 +0000	[diff] [blame]	1777	Py_DECREF(string); Py_DECREF(intobj);
Guido van Rossum	51b3aa3	1997-10-06 14:43:11 +0000	[diff] [blame]	1778	*code++ = OP_REF;
				1779	*code++ = refnum;
				1780	/* The continue will cause the top-level for() loop to
				1781	be resumed, so ptr will be immediately incremented.
				1782	Therefore, the following line adds just idlen, not
				1783	idlen+1 */
				1784	ptr += idlen;
				1785	continue;
				1786	}
				1787	/* The character after ?P is neither < nor =, so
				1788	report an error. Add more Python-extensions here. */
				1789	*errorptr="unknown after (?P";
				1790	goto FAILED;
				1791	break;
				1792	default:
				1793	*errorptr = "unknown after (?";
				1794	goto FAILED;
				1795	}
				1796	}
				1797
				1798	/* Else we have a referencing group */
				1799
				1800	else
				1801	{
				1802	do_grouping_bracket:
				1803	if (brackets[0] > EXTRACT_MAX)
				1804	{
				1805	*errorptr = "too many extraction brackets";
				1806	goto FAILED;
				1807	}
				1808	brackets[1] = brackets[0];
				1809	bravalue = OP_BRA + brackets[0]++;
				1810	}
				1811
				1812	/* Process nested bracketed re; at end pointer is on the bracket. We copy
				1813	code into a non-register variable in order to be able to pass its address
				1814	because some compilers complain otherwise. */
				1815
				1816	*code = bravalue;
				1817	{
				1818	uschar *mcode = code;
				1819	if (!compile_regexp(extended, brackets, &mcode, &ptr, errorptr, dictionary))
				1820	goto FAILED;
				1821	code = mcode;
				1822	}
				1823
				1824	if (*ptr != ')')
				1825	{
				1826	*errorptr = "missing )";
				1827	goto FAILED;
				1828	}
				1829	break;
				1830
				1831	/* Check \ for being a real metacharacter; if not, fall through and handle
				1832	it as a data character at the start of a string. Escape items are checked
				1833	for validity in the pre-compiling pass. */
				1834
				1835	case '\\':
				1836	oldptr = ptr;
				1837	c = check_escape(&ptr, errorptr);
				1838
				1839	/* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
				1840	are arranged to be the negation of the corresponding OP_values. For the
				1841	back references, the values are ESC_REF plus the reference number. Only
				1842	back references and those types that consume a character may be repeated.
				1843	We can test for values between ESC_b and ESC_Z for the latter; this may
				1844	have to change if any new ones are ever created. */
				1845
				1846	if (c < 0)
				1847	{
				1848	if (-c >= ESC_REF)
				1849	{
				1850	int refnum = -c -ESC_REF;
				1851	if (brackets[1] < refnum ) {
				1852	*errorptr = "backreference to non-existent group";
				1853	goto FAILED;
				1854	}
				1855	previous = code;
				1856	*code++ = OP_REF;
				1857	*code++ = refnum;
				1858	}
				1859	else
				1860	{
				1861	previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
				1862	*code++ = -c;
				1863	}
				1864	continue;
				1865	}
				1866
				1867	/* Reset and fall through */
				1868
				1869	ptr = oldptr;
				1870	c = '\\';
				1871
				1872	/* Handle a run of data characters until a metacharacter is encountered.
				1873	The first character is guaranteed not to be whitespace or # when the
				1874	extended flag is set. */
				1875
				1876	default:
				1877	previous = code;
				1878	*code = OP_CHARS;
				1879	code += 2;
				1880	length = 0;
				1881
				1882	do
				1883	{
				1884	if (extended)
				1885	{
				1886	if ((pcre_ctypes[c] & ctype_space) != 0) continue;
				1887	if (c == '#')
				1888	{
				1889	while ((c = *(++ptr)) != 0 && c != '\n');
				1890	if (c == 0) break;
				1891	continue;
				1892	}
				1893	}
				1894
				1895	/* Backslash may introduce a data char or a metacharacter. Escaped items
				1896	are checked for validity in the pre-compiling pass. Stop the string
				1897	before a metaitem. */
				1898
				1899	if (c == '\\')
				1900	{
				1901	oldptr = ptr;
				1902	c = check_escape(&ptr, errorptr);
				1903	if (c < 0) { ptr = oldptr; break; }
				1904	}
				1905
				1906	/* Ordinary character or single-char escape */
				1907
				1908	*code++ = c;
				1909	length++;
				1910	}
				1911
				1912	/* This "while" is the end of the "do" above. */
				1913
				1914	while (length < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
				1915
				1916	/* Compute the length and set it in the data vector, and advance to
				1917	the next state. */
				1918
				1919	previous[1] = length;
				1920	ptr--;
				1921	break;
				1922	}
				1923	} /* end of big loop */
				1924
				1925	/* Control never reaches here by falling through, only by a goto for all the
				1926	error states. Pass back the position in the pattern so that it can be displayed
				1927	to the user for diagnosing the error. */
				1928
				1929	FAILED:
				1930	*ptrptr = ptr;
				1931	return FALSE;
				1932	}
				1933
				1934
				1935
				1936
				1937	/*************************************************
				1938	* Compile sequence of alternatives *
				1939	*************************************************/
				1940
				1941	/* On entry, ptr is pointing past the bracket character, but on return
				1942	it points to the closing bracket, or vertical bar, or end of string.
				1943	The code variable is pointing at the byte into which the BRA operator has been
				1944	stored.
				1945
				1946	Argument:
				1947	extended TRUE if PCRE_EXTENDED was set
				1948	brackets -> 2-element vector containing next and top bracket numbers
				1949	codeptr -> the address of the current code pointer
				1950	ptrptr -> the address of the current pattern pointer
				1951	errorptr -> pointer to error message
				1952
				1953	Returns: TRUE on success
				1954	*/
				1955
				1956	static BOOL
				1957	compile_regexp(BOOL extended, int brackets, uschar *codeptr,
				1958	uschar ptrptr, char errorptr, PyObject *dictionary)
				1959	{
				1960	uschar ptr = ptrptr;
				1961	uschar code = codeptr;
				1962	uschar *start_bracket = code;
				1963
				1964	for (;;)
				1965	{
				1966	int length;
				1967	uschar *last_branch = code;
				1968
				1969	code += 3;
				1970	if (!compile_branch(extended, brackets, &code, &ptr, errorptr, dictionary))
				1971	{
				1972	*ptrptr = ptr;
				1973	return FALSE;
				1974	}
				1975
				1976	/* Fill in the length of the last branch */
				1977
				1978	length = code - last_branch;
				1979	last_branch[1] = length >> 8;
				1980	last_branch[2] = length & 255;
				1981
				1982	/* Reached end of expression, either ')' or end of pattern. Insert a
				1983	terminating ket and the length of the whole bracketed item, and return,
				1984	leaving the pointer at the terminating char. */
				1985
				1986	if (*ptr != '\|')
				1987	{
				1988	length = code - start_bracket;
				1989	*code++ = OP_KET;
				1990	*code++ = length >> 8;
				1991	*code++ = length & 255;
				1992	*codeptr = code;
				1993	*ptrptr = ptr;
				1994	return TRUE;
				1995	}
				1996
				1997	/* Another branch follows; insert an "or" node and advance the pointer. */
				1998
				1999	*code = OP_ALT;
				2000	ptr++;
				2001	}
				2002	/* Control never reaches here */
				2003	}
				2004
				2005
				2006
				2007	/*************************************************
				2008	* Check for anchored expression *
				2009	*************************************************/
				2010
				2011	/* Try to find out if this is an anchored regular expression. Consider each
				2012	alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
				2013	all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
				2014	it's anchored. However, if this is a multiline pattern, then only OP_SOD
				2015	counts, since OP_CIRC can match in the middle.
				2016
				2017	A branch is also implicitly anchored if it starts with .* because that will try
				2018	the rest of the pattern at all possible matching points, so there is no point
				2019	trying them again.
				2020
				2021	Argument: points to start of expression (the bracket)
				2022	Returns: TRUE or FALSE
				2023	*/
				2024
				2025	static BOOL
				2026	is_anchored(register uschar *code, BOOL multiline)
				2027	{
				2028	do {
				2029	int op = (int)code[3];
				2030	if (op >= OP_BRA \|\| op == OP_ASSERT)
				2031	{ if (!is_anchored(code+3, multiline)) return FALSE; }
				2032	else if (op == OP_TYPESTAR \|\| op == OP_TYPEMINSTAR)
				2033	{ if (code[4] != OP_ANY) return FALSE; }
				2034	else if (op != OP_SOD && (multiline \|\| op != OP_CIRC)) return FALSE;
				2035	code += (code[1] << 8) + code[2];
				2036	}
				2037	while (*code == OP_ALT);
				2038	return TRUE;
				2039	}
				2040
				2041
				2042
				2043	/*************************************************
				2044	* Check for start with \n line expression *
				2045	*************************************************/
				2046
				2047	/* This is called for multiline expressions to try to find out if every branch
				2048	starts with ^ so that "first char" processing can be done to speed things up.
				2049
				2050	Argument: points to start of expression (the bracket)
				2051	Returns: TRUE or FALSE
				2052	*/
				2053
				2054	static BOOL
				2055	is_startline(uschar *code)
				2056	{
				2057	do {
				2058	if ((int)code[3] >= OP_BRA \|\| code[3] == OP_ASSERT)
				2059	{ if (!is_startline(code+3)) return FALSE; }
				2060	else if (code[3] != OP_CIRC) return FALSE;
				2061	code += (code[1] << 8) + code[2];
				2062	}
				2063	while (*code == OP_ALT);
				2064	return TRUE;
				2065	}
				2066
				2067
				2068
				2069	/*************************************************
				2070	* Check for fixed first char *
				2071	*************************************************/
				2072
				2073	/* Try to find out if there is a fixed first character. This is called for
				2074	unanchored expressions, as it speeds up their processing quite considerably.
				2075	Consider each alternative branch. If they all start with the same char, or with
				2076	a bracket all of whose alternatives start with the same char (recurse ad lib),
				2077	then we return that char, otherwise -1.
				2078
				2079	Argument: points to start of expression (the bracket)
				2080	Returns: -1 or the fixed first char
				2081	*/
				2082
				2083	static int
				2084	find_firstchar(uschar *code)
				2085	{
				2086	register int c = -1;
				2087	do
				2088	{
				2089	register int charoffset = 4;
				2090
				2091	if ((int)code[3] >= OP_BRA \|\| code[3] == OP_ASSERT)
				2092	{
				2093	register int d;
				2094	if ((d = find_firstchar(code+3)) < 0) return -1;
				2095	if (c < 0) c = d; else if (c != d) return -1;
				2096	}
				2097
				2098	else switch(code[3])
				2099	{
				2100	default:
				2101	return -1;
				2102
				2103	case OP_EXACT: /* Fall through */
				2104	charoffset++;
				2105
				2106	case OP_CHARS: /* Fall through */
				2107	charoffset++;
				2108
				2109	case OP_PLUS:
				2110	case OP_MINPLUS:
				2111	if (c < 0) c = code[charoffset]; else if (c != code[charoffset]) return -1;
				2112	break;
				2113	}
				2114	code += (code[1] << 8) + code[2];
				2115	}
				2116	while (*code == OP_ALT);
				2117	return c;
				2118	}
				2119
				2120
				2121
				2122	/*************************************************
				2123	* Compile a Regular Expression *
				2124	*************************************************/
				2125
				2126	/* This function takes a string and returns a pointer to a block of store
				2127	holding a compiled version of the expression.
				2128
				2129	Arguments:
				2130	pattern the regular expression
				2131	options various option bits
				2132	errorptr pointer to pointer to error text
				2133	erroroffset ptr offset in pattern where error was detected
				2134
				2135	Returns: pointer to compiled data block, or NULL on error,
				2136	with errorptr and erroroffset set
				2137	*/
				2138
				2139	pcre *
				2140	pcre_compile(char pattern, int options, char *errorptr, int
				2141	erroroffset, PyObject dictionary)
				2142	{
				2143	real_pcre *re;
				2144	int spaces = 0;
				2145	int length = 3; /* For initial BRA plus length */
				2146	int runlength;
				2147	int c, size;
				2148	int brackets[2];
				2149	int brastack[200];
				2150	int brastackptr = 0;
				2151	BOOL extended = (options & PCRE_EXTENDED) != 0;
				2152	uschar code, ptr;
				2153
				2154	#ifdef DEBUG
				2155	uschar code_base, code_end;
				2156	#endif
				2157
				2158	/* Miscellaneous initialization; the copy the error pointers into static
				2159	variables so all functions can access them. */
				2160
				2161	brackets[0] = 1; /* Next bracket number */
				2162	brackets[1] = 0; /* Highest used bracket number */
				2163
				2164	*errorptr = NULL;
				2165	*erroroffset = 0;
				2166
				2167	if ((options & ~PUBLIC_OPTIONS) != 0)
				2168	{
				2169	*errorptr = "unknown option bit(s) set";
				2170	return NULL;
				2171	}
				2172
				2173	#ifdef DEBUG
				2174	printf("------------------------------------------------------------------\n");
				2175	printf("%s\n", pattern);
				2176	#endif
				2177
				2178	/* The first thing to do is to make a pass over the pattern to compute the
				2179	amount of store required to hold the compiled code. This does not have to be
				2180	perfect as long as errors are overestimates. At the same time we can detect any
				2181	internal flag settings. Make an attempt to correct for any counted white space
				2182	if an "extended" flag setting appears late in the pattern. We can't be so
				2183	clever for #-comments. */
				2184
				2185	ptr = (uschar *)(pattern - 1);
				2186	while ((c = *(++ptr)) != 0)
				2187	{
				2188	int min, max;
				2189
				2190	if ((pcre_ctypes[c] & ctype_space) != 0)
				2191	{
				2192	if (extended) continue;
				2193	spaces++;
				2194	}
				2195
				2196	if (extended && c == '#')
				2197	{
				2198	while ((c = *(++ptr)) != 0 && c != '\n');
				2199	continue;
				2200	}
				2201
				2202	switch(c)
				2203	{
				2204	/* A backslashed item may be an escaped "normal" character or a
				2205	character type. For a "normal" character, put the pointers and
				2206	character back so that tests for whitespace etc. in the input
				2207	are done correctly. */
				2208
				2209	case '\\':
				2210	{
				2211	uschar *save_ptr = ptr;
				2212	c = check_escape(&ptr, errorptr);
				2213	if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
				2214	if (c >= 0)
				2215	{
				2216	ptr = save_ptr;
				2217	c = '\\';
				2218	goto NORMAL_CHAR;
				2219	}
				2220	}
				2221	length++;
				2222
				2223	/* A back reference needs an additional char, plus either one or 5
				2224	bytes for a repeat. */
				2225
				2226	if (c <= -ESC_REF)
				2227	{
				2228	length++; /* For single back reference */
				2229	if (ptr[1] == '{')
				2230	{
				2231	ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
				2232	if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
				2233	if ((min == 0 && (max == 1 \|\| max == -1)) \|\|
				2234	(min == 1 && max == -1))
				2235	length++;
				2236	else length += 5;
				2237	if (ptr[1] == '?') ptr++;
				2238	}
				2239	}
				2240	continue;
				2241
				2242	case '^':
				2243	case '.':
				2244	case '$':
				2245	case '': / These repeats won't be after brackets; */
				2246	case '+': /* those are handled separately */
				2247	case '?':
				2248	length++;
				2249	continue;
				2250
				2251	/* This covers the cases of repeats after a single char, metachar, class,
				2252	or back reference. */
				2253
				2254	case '{':
				2255	ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);
				2256	if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
				2257	if ((min == 0 && (max == 1 \|\| max == -1)) \|\|
				2258	(min == 1 && max == -1))
				2259	length++;
				2260	else
				2261	{
				2262	length--; /* Uncount the original char or metachar */
				2263	if (min == 1) length++; else if (min > 0) length += 4;
				2264	if (max > 0) length += 4; else length += 2;
				2265	}
				2266	if (ptr[1] == '?') ptr++;
				2267	continue;
				2268
				2269	/* An alternation contains an offset to the next branch or ket. */
				2270	case '\|':
				2271	length += 3;
				2272	continue;
				2273
				2274	/* A character class uses 4 characters plus the characters in it. Don't
				2275	worry about character types that aren't allowed in classes - they'll get
				2276	picked up during the compile. */
				2277
				2278	case '[':
				2279	length += 4;
				2280	if (ptr[1] == '^') ptr++;
				2281	do
				2282	{
				2283	if (*(++ptr) == '\\')
				2284	{
				2285	(void)check_escape(&ptr, errorptr);
				2286	if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
				2287	}
				2288	length++;
				2289	}
				2290	while (ptr != 0 && ptr != ']');
				2291
				2292	/* A repeat needs either 1 or 5 bytes. */
				2293
				2294	if (ptr[1] == '{')
				2295	{
				2296	ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
				2297	if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
				2298	if ((min == 0 && (max == 1 \|\| max == -1)) \|\|
				2299	(min == 1 && max == -1))
				2300	length++;
				2301	else length += 5;
				2302	if (ptr[1] == '?') ptr++;
				2303	}
				2304	continue;
				2305
				2306	/* Brackets may be genuine groups or special things */
				2307
				2308	case '(':
				2309
				2310	/* Handle special forms of bracket, which all start (? */
				2311
				2312	if (ptr[1] == '?') switch (c = ptr[2])
				2313	{
				2314	/* Skip over comments entirely */
				2315	case '#':
				2316	ptr += 3;
				2317	while (ptr != 0 && ptr != ')') ptr++;
				2318	if (*ptr == 0)
				2319	{
				2320	*errorptr = "missing ) after comment";
				2321	goto PCRE_ERROR_RETURN;
				2322	}
				2323	continue;
				2324
				2325	/* Non-referencing groups and lookaheads just move the pointer on, and
				2326	then behave like a non-special bracket. */
				2327
				2328	case ':':
				2329	case '=':
				2330	case '!':
				2331	ptr += 2;
				2332	break;
				2333
				2334	/* Else loop setting valid options until ) is met. Anything else is an
				2335	error. */
				2336
				2337	case ('P'):
				2338	{
				2339	int idlen;
				2340	switch (*ptr++) {
				2341	case ('<'):
				2342	idlen = get_group_id(ptr++, '>', errorptr);
				2343	if (*errorptr) goto PCRE_ERROR_RETURN;
				2344	ptr += idlen+1;
				2345	break;
				2346	case ('='):
				2347	idlen = get_group_id(ptr++, ')', errorptr);
				2348	if (*errorptr) goto PCRE_ERROR_RETURN;
				2349	ptr += idlen+1;
				2350	length++;
				2351	break;
				2352	}
				2353	}
				2354	break;
				2355
				2356	default:
				2357	ptr += 2;
				2358	for (;; ptr++)
				2359	{
				2360	if ((c = *ptr) == 'i')
				2361	{
				2362	options \|= PCRE_CASELESS;
				2363	continue;
				2364	}
				2365	else if ((c = *ptr) == 'm')
				2366	{
				2367	options \|= PCRE_MULTILINE;
				2368	continue;
				2369	}
				2370	else if ((c = *ptr) == 's')
				2371	{
				2372	options \|= PCRE_DOTALL;
				2373	continue;
				2374	}
				2375	else if (c == 'x')
				2376	{
				2377	options \|= PCRE_EXTENDED;
				2378	extended = TRUE;
				2379	length -= spaces; /* Already counted spaces */
				2380	continue;
				2381	}
				2382	else if (c == ')') break;
				2383
				2384	*errorptr = "undefined after (?";
				2385	goto PCRE_ERROR_RETURN;
				2386	}
				2387	continue; /* End of this bracket handling */
				2388	}
				2389
				2390	/* Non-special forms of bracket. Save length for computing whole length
				2391	at end if there's a repeat that requires duplication of the group. */
				2392
				2393	if (brastackptr >= sizeof(brastack)/sizeof(int))
				2394	{
				2395	*errorptr = "too many brackets";
				2396	goto PCRE_ERROR_RETURN;
				2397	}
				2398
				2399	brastack[brastackptr++] = length;
				2400	length += 3;
				2401	continue;
				2402
				2403	/* Handle ket. Look for subsequent max/min; for certain sets of values we
				2404	have to replicate this bracket up to that many times. */
				2405
				2406	case ')':
				2407	length += 3;
				2408	{
				2409	int min = 1;
				2410	int max = 1;
				2411	int duplength = length - brastack[--brastackptr];
				2412
				2413	/* Leave ptr at the final char; for read_repeat_counts this happens
				2414	automatically; for the others we need an increment. */
				2415
				2416	if ((c = ptr[1]) == '{')
				2417	{
				2418	ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
				2419	if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
				2420	}
				2421	else if (c == '*') { min = 0; max = -1; ptr++; }
				2422	else if (c == '+') { max = -1; ptr++; }
				2423	else if (c == '?') { min = 0; ptr++; }
				2424
				2425	/* If there is a minimum > 1 we have to replicate up to min-1 times; if
				2426	there is a limited maximum we have to replicate up to max-1 times and
				2427	allow for a BRAZERO item before each optional copy, as we also have to
				2428	do before the first copy if the minimum is zero. */
				2429
				2430	if (min == 0) length++;
				2431	else if (min > 1) length += (min - 1) * duplength;
				2432	if (max > min) length += (max - min) * (duplength + 1);
				2433	}
				2434
				2435	continue;
				2436
				2437	/* Non-special character. For a run of such characters the length required
				2438	is the number of characters + 2, except that the maximum run length is 255.
				2439	We won't get a skipped space or a non-data escape or the start of a #
				2440	comment as the first character, so the length can't be zero. */
				2441
				2442	NORMAL_CHAR:
				2443	default:
				2444	length += 2;
				2445	runlength = 0;
				2446	do
				2447	{
				2448	if ((pcre_ctypes[c] & ctype_space) != 0)
				2449	{
				2450	if (extended) continue;
				2451	spaces++;
				2452	}
				2453
				2454	if (extended && c == '#')
				2455	{
				2456	while ((c = *(++ptr)) != 0 && c != '\n');
				2457	continue;
				2458	}
				2459
				2460	/* Backslash may introduce a data char or a metacharacter; stop the
				2461	string before the latter. */
				2462
				2463	if (c == '\\')
				2464	{
				2465	uschar *saveptr = ptr;
				2466	c = check_escape(&ptr, errorptr);
				2467	if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
				2468	if (c < 0) { ptr = saveptr; break; }
				2469	}
				2470
				2471	/* Ordinary character or single-char escape */
				2472
				2473	runlength++;
				2474	}
				2475
				2476	/* This "while" is the end of the "do" above. */
				2477
				2478	while (runlength < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
				2479
				2480	ptr--;
				2481	length += runlength;
				2482	continue;
				2483	}
				2484	}
				2485
				2486	length += 4; /* For final KET and END */
				2487
				2488	if (length > 65539)
				2489	{
				2490	*errorptr = "regular expression too large";
				2491	return NULL;
				2492	}
				2493
				2494	/* Compute the size of data block needed and get it, either from malloc or
				2495	externally provided function. Put in the magic number and the options. */
				2496
				2497	size = length + sizeof(real_pcre) - sizeof(re->code);
				2498	re = (real_pcre *)(pcre_malloc)(size);
				2499
				2500	if (re == NULL)
				2501	{
				2502	*errorptr = "failed to get memory";
				2503	return NULL;
				2504	}
				2505
				2506	re->magic_number = MAGIC_NUMBER;
				2507	re->options = options;
				2508
				2509	/* Set up a starting, non-extracting bracket, then compile the expression. On
				2510	error, *errorptr will be set non-NULL, so we don't need to look at the result
				2511	of the function here. */
				2512
				2513	ptr = (uschar *)pattern;
				2514	code = re->code;
				2515	*code = OP_BRA;
				2516	(void)compile_regexp(extended, brackets, &code, &ptr, errorptr, dictionary);
				2517	re->top_bracket = brackets[1];
				2518
				2519	/* If not reached end of pattern on success, there's an excess bracket. */
				2520
				2521	if (errorptr == NULL && ptr != 0) *errorptr = "unmatched brackets";
				2522	/* Fill in the terminating state and check for disastrous overflow, but
				2523	if debugging, leave the test till after things are printed out. */
				2524
				2525	*code++ = OP_END;
				2526
				2527	#ifndef DEBUG
				2528	if (code - re->code > length) *errorptr = "internal error: code overflow";
				2529	#endif
				2530
				2531	/* Failed to compile */
				2532
				2533	if (*errorptr != NULL)
				2534	{
				2535	(pcre_free)(re);
				2536	PCRE_ERROR_RETURN:
				2537	erroroffset = ptr - (uschar )pattern;
				2538	return NULL;
				2539	}
				2540
				2541	/* If the anchored option was not passed, set flag if we can determine that it
				2542	is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if
				2543	we can determine what the first character has to be, because that speeds up
				2544	unanchored matches no end. In the case of multiline matches, an alternative is
				2545	to set the PCRE_STARTLINE flag if all branches start with ^. */
				2546
				2547	if ((options & PCRE_ANCHORED) == 0)
				2548	{
				2549	if (is_anchored(re->code, (options & PCRE_MULTILINE) != 0))
				2550	re->options \|= PCRE_ANCHORED;
				2551	else
				2552	{
				2553	int c = find_firstchar(re->code);
				2554	if (c >= 0)
				2555	{
				2556	re->first_char = c;
				2557	re->options \|= PCRE_FIRSTSET;
				2558	}
				2559	else if (is_startline(re->code))
				2560	re->options \|= PCRE_STARTLINE;
				2561	}
				2562	}
				2563
				2564	/* Print out the compiled data for debugging */
				2565
				2566	#ifdef DEBUG
				2567
				2568	printf("Length = %d top_bracket = %d%s%s%s%s\n",
				2569	length, re->top_bracket,
				2570	((re->options & PCRE_ANCHORED) != 0)? " anchored" : "",
				2571	((re->options & PCRE_CASELESS) != 0)? " caseless" : "",
				2572	extended? " extended" : "",
				2573	((re->options & PCRE_MULTILINE) != 0)? " multiline" : "");
				2574
				2575	if ((re->options & PCRE_FIRSTSET) != 0)
				2576	{
				2577	if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
				2578	else printf("First char = \\x%02x\n", re->first_char);
				2579	}
				2580
				2581	code_end = code;
				2582	code_base = code = re->code;
				2583
				2584	while (code < code_end)
				2585	{
				2586	int charlength;
				2587
				2588	printf("%3d ", code - code_base);
				2589
				2590	if (*code >= OP_BRA)
				2591	{
				2592	printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
				2593	code += 2;
				2594	}
				2595
				2596	else switch(*code)
				2597	{
				2598	case OP_CHARS:
				2599	charlength = *(++code);
				2600	printf("%3d ", charlength);
				2601	while (charlength-- > 0)
				2602	if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
				2603	break;
				2604
				2605	case OP_KETRMAX:
				2606	case OP_KETRMIN:
				2607	case OP_ALT:
				2608	case OP_KET:
				2609	case OP_ASSERT:
				2610	case OP_ASSERT_NOT:
				2611	printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
				2612	code += 2;
				2613	break;
				2614
				2615	case OP_STAR:
				2616	case OP_MINSTAR:
				2617	case OP_PLUS:
				2618	case OP_MINPLUS:
				2619	case OP_QUERY:
				2620	case OP_MINQUERY:
				2621	case OP_TYPESTAR:
				2622	case OP_TYPEMINSTAR:
				2623	case OP_TYPEPLUS:
				2624	case OP_TYPEMINPLUS:
				2625	case OP_TYPEQUERY:
				2626	case OP_TYPEMINQUERY:
				2627	if (*code >= OP_TYPESTAR)
				2628	printf(" %s", OP_names[code[1]]);
				2629	else if (isprint(c = code[1])) printf(" %c", c);
				2630	else printf(" \\x%02x", c);
				2631	printf("%s", OP_names[*code++]);
				2632	break;
				2633
				2634	case OP_EXACT:
				2635	case OP_UPTO:
				2636	case OP_MINUPTO:
				2637	if (isprint(c = code[3])) printf(" %c{", c);
				2638	else printf(" \\x%02x{", c);
				2639	if (*code != OP_EXACT) printf(",");
				2640	printf("%d}", (code[1] << 8) + code[2]);
				2641	if (*code == OP_MINUPTO) printf("?");
				2642	code += 3;
				2643	break;
				2644
				2645	case OP_TYPEEXACT:
				2646	case OP_TYPEUPTO:
				2647	case OP_TYPEMINUPTO:
				2648	printf(" %s{", OP_names[code[3]]);
				2649	if (*code != OP_TYPEEXACT) printf(",");
				2650	printf("%d}", (code[1] << 8) + code[2]);
				2651	if (*code == OP_TYPEMINUPTO) printf("?");
				2652	code += 3;
				2653	break;
				2654
				2655	case OP_REF:
				2656	printf(" \\%d", *(++code));
				2657	break;
				2658
				2659	case OP_CLASS:
				2660	case OP_NEGCLASS:
				2661	{
				2662	int i, min, max;
				2663	int flags = code[1];
				2664	int rangecount = code[2];
				2665	int charcount = code[3];
				2666
				2667	printf(" [%s", (*code == OP_CLASS)? "" : "^");
				2668	code += 3;
				2669
				2670	for (i = 0; i < 8; i++)
				2671	if ((flags & (1 << i)) != 0) printf("%s", class_names[i]);
				2672
				2673	for (i = 0; i < rangecount; i++)
				2674	{
				2675	if (isprint((++code))) printf("%c-", code); else printf("\\x%02x-", *code);
				2676	if (isprint((++code))) printf("%c", code); else printf("\\x%02x", *code);
				2677	}
				2678
				2679	for (i = 0; i < charcount; i++)
				2680	{
				2681	if (!isprint((++code))) printf("\\x%02x", code);
				2682	else if (strchr("-\\]", code) != NULL) printf("\\%c", code);
				2683	else printf("%c", *code);
				2684	}
				2685	printf("]");
				2686
				2687	switch(*(++code))
				2688	{
				2689	case OP_CRSTAR:
				2690	case OP_CRMINSTAR:
				2691	case OP_CRPLUS:
				2692	case OP_CRMINPLUS:
				2693	case OP_CRQUERY:
				2694	case OP_CRMINQUERY:
				2695	printf("%s", OP_names[*code]);
				2696	break;
				2697
				2698	case OP_CRRANGE:
				2699	case OP_CRMINRANGE:
				2700	min = (code[1] << 8) + code[2];
				2701	max = (code[3] << 8) + code[4];
				2702	if (max == 0) printf("{%d,}", min);
				2703	else printf("{%d,%d}", min, max);
				2704	if (*code == OP_CRMINRANGE) printf("?");
				2705	code += 4;
				2706	break;
				2707
				2708	default:
				2709	code--;
				2710	}
				2711	}
				2712	break;
				2713
				2714	/* Anything else is just a one-node item */
				2715
				2716	default:
				2717	printf(" %s", OP_names[*code]);
				2718	break;
				2719	}
				2720
				2721	code++;
				2722	printf("\n");
				2723	}
				2724	printf("------------------------------------------------------------------\n");
				2725
				2726	/* This check is done here in the debugging case so that the code that
				2727	was compiled can be seen. */
				2728
				2729	if (code - re->code > length)
				2730	{
				2731	*errorptr = "internal error: code overflow";
				2732	(pcre_free)(re);
				2733	erroroffset = ptr - (uschar )pattern;
				2734	return NULL;
				2735	}
				2736	#endif
				2737
				2738	return (pcre *)re;
				2739	}
				2740
				2741
				2742
				2743	/*************************************************
				2744	* Match a character type *
				2745	*************************************************/
				2746
				2747	/* Not used in all the places it might be as it's sometimes faster
				2748	to put the code inline.
				2749
				2750	Arguments:
				2751	type the character type
				2752	c the character
				2753	multiline the multiline flag
				2754
				2755	Returns: TRUE if character is of the type
				2756	*/
				2757
				2758	static BOOL
				2759	match_type(int type, int c, BOOL dotall)
				2760	{
				2761
				2762	#ifdef DEBUG
				2763	if (isprint(c)) printf("matching subject %c against ", c);
				2764	else printf("matching subject \\x%02x against ", c);
				2765	printf("%s\n", OP_names[type]);
				2766	#endif
				2767
				2768	switch(type)
				2769	{
				2770	case OP_ANY: return dotall \|\| c != '\n';
				2771	case OP_NOT_DIGIT: return (pcre_ctypes[c] & ctype_digit) == 0;
				2772	case OP_DIGIT: return (pcre_ctypes[c] & ctype_digit) != 0;
				2773	case OP_NOT_WHITESPACE: return (pcre_ctypes[c] & ctype_space) == 0;
				2774	case OP_WHITESPACE: return (pcre_ctypes[c] & ctype_space) != 0;
				2775	case OP_NOT_WORDCHAR: return (pcre_ctypes[c] & ctype_word) == 0;
				2776	case OP_WORDCHAR: return (pcre_ctypes[c] & ctype_word) != 0;
				2777	}
				2778	return FALSE;
				2779	}
				2780
				2781
				2782	/*************************************************
				2783	* Match a character class *
				2784	*************************************************/
				2785
				2786	/* Return "result" if char is in the class and "!result" otherwise.
				2787
				2788	Arguments:
				2789	data points to the class item
				2790	c the subject character
				2791	result value to return if in class
				2792	md matching "static" data
				2793
				2794	Returns: result or !result
				2795	*/
				2796
				2797	static BOOL
				2798	match_class(register uschar data, register int c, BOOL result, match_data md)
				2799	{
				2800	int flags = data[1];
				2801	int i;
				2802	uschar *base = data;
				2803	uschar *end;
				2804
				2805	#ifdef DEBUG
				2806	{
				2807	uschar *d = base + 3;
				2808
				2809	if (isprint(c))
				2810	printf("match %c against [%s", c, result? "" : "^");
				2811	else
				2812	printf("match \\x%02x against [%s", c, result? "" : "^");
				2813
				2814	for (i = 0; i < 8; i++)
				2815	if ((flags & (1 << i)) != 0) printf("%s", class_names[i]);
				2816
				2817	for (i = 0; i < data[2]; i++)
				2818	{
				2819	if (isprint((++d))) printf("%c-", d); else printf("\\x%02x-", *d);
				2820	if (isprint((++d))) printf("%c", d); else printf("\\x%02x", *d);
				2821	}
				2822
				2823	for (i = 0; i < data[3]; i++)
				2824	{
				2825	if (!isprint((++d))) printf("\\x%02x", d);
				2826	else if (strchr("-\\]", d) != NULL) printf("\\%c", d);
				2827	else printf("%c", *d);
				2828	}
				2829	printf("]\n");
				2830	}
				2831	#endif
				2832
				2833	/* Test for any character types */
				2834
				2835	for (i = 0; flags != 0; i++)
				2836	{
				2837	if ((flags & 1) != 0 && match_type(class_ops[i], c, md->dotall))
				2838	return result;
				2839	flags >>= 1;
				2840	}
				2841
				2842	/* Advance pointer to the specific chars and do the caseless or caseful testing
				2843	of the ranges and individual characters as necessary. */
				2844
				2845	data += 4;
				2846	end = data + base[2] * 2;
				2847
				2848	/* Caseless character ranges are slightly tricky, because of cases like [W-c].
				2849	What we do is to uppercase the subject char if it is beyond the end of the
				2850	range, or lowercase it if it is before the start of the range and try again if
				2851	a caseful comparison has failed. This works because upper case letters come
				2852	before lower case in ASCII code. It would not work in EBCDIC, for example,
				2853	where they are the other way round, but then ranges like [W-c] would be illegal
				2854	in EBCDIC. */
				2855
				2856	if (md->caseless)
				2857	{
				2858	while (data < end)
				2859	{
				2860	register int d;
				2861	if (c >= (int)*data && c <= (int)data[1]) return result;
				2862	d = (c < (int)*data)? pcre_lcc[c] : pcre_ucc[c];
				2863	if (d >= (int)*data && d <= (int)data[1]) return result;
				2864	data += 2;
				2865	}
				2866	end += base[3];
				2867	c = pcre_lcc[c];
				2868	while (data < end) if (c == pcre_lcc[*data++]) return result;
				2869	}
				2870
				2871	/* Caseful is easy */
				2872
				2873	else
				2874	{
				2875	while (data < end)
				2876	{
				2877	if (c >= (int)*data && c <= (int)data[1]) return result;
				2878	data += 2;
				2879	}
				2880	end += base[3];
				2881	while (data < end) if (c == *data++) return result;
				2882	}
				2883
				2884	/* Character is not in the class */
				2885
				2886	return !result;
				2887	}
				2888
				2889
				2890
				2891	/*************************************************
				2892	* Match a back-reference *
				2893	*************************************************/
				2894
				2895	/* If a back reference hasn't been set, the match fails.
				2896
				2897	Arguments:
				2898	number reference number
				2899	eptr points into the subject
				2900	length length to be matched
				2901	md points to match data block
				2902
				2903	Returns: TRUE if matched
				2904	*/
				2905
				2906	static BOOL
				2907	match_ref(int number, register uschar eptr, int length, match_data md)
				2908	{
				2909	uschar *p = md->start_subject + md->offset_vector[number];
				2910
				2911	#ifdef DEBUG
				2912	if (eptr >= md->end_subject)
				2913	printf("matching subject <null>");
				2914	else
				2915	{
				2916	printf("matching subject ");
				2917	pchars(eptr, length, TRUE, md);
				2918	}
				2919	printf(" against backref ");
				2920	pchars(p, length, FALSE, md);
				2921	printf("\n");
				2922	#endif
				2923
				2924	/* Always fail if not enough characters left */
				2925
				2926	if (length > md->end_subject - p) return FALSE;
				2927
				2928	/* Separate the caselesss case for speed */
				2929
				2930	if (md->caseless)
				2931	{ while (length-- > 0) if (pcre_lcc[p++] != pcre_lcc[eptr++]) return FALSE; }
				2932	else
				2933	{ while (length-- > 0) if (p++ != eptr++) return FALSE; }
				2934
				2935	return TRUE;
				2936	}
				2937
				2938	static int free_stack(match_data *md)
				2939	{
				2940	/* Free any stack space that was allocated by the call to match(). */
				2941	if (md->off_num) free(md->off_num);
				2942	if (md->offset_top) free(md->offset_top);
				2943	if (md->r1) free(md->r1);
				2944	if (md->r2) free(md->r2);
				2945	if (md->eptr) free(md->eptr);
Guido van Rossum	c386107	1997-10-08 02:07:40 +0000	[diff] [blame]	2946	if (md->ecode) free(md->ecode);
				2947	return 0;
Guido van Rossum	51b3aa3	1997-10-06 14:43:11 +0000	[diff] [blame]	2948	}
				2949
				2950	static int grow_stack(match_data *md)
				2951	{
				2952	md->length = md->length ? md->length+md->length/2 : 200;
				2953	md->offset_top=realloc(md->offset_top, md->length*sizeof(int));
				2954	md->eptr=realloc(md->eptr, md->lengthsizeof(void ));
				2955	md->ecode=realloc(md->ecode, md->lengthsizeof(void ));
				2956	md->off_num=realloc(md->off_num, md->length*sizeof(int));
				2957	md->r1=realloc(md->r1, md->length*sizeof(int));
				2958	md->r2=realloc(md->r2, md->length*sizeof(int));
				2959	return 0;
				2960	}
				2961
				2962	/*************************************************
				2963	* Match from current position *
				2964	*************************************************/
				2965
				2966	/* On entry ecode points to the first opcode, and eptr to the first character.
				2967
				2968	Arguments:
				2969	eptr pointer in subject
				2970	ecode position in code
				2971	offset_top current top pointer
				2972	md pointer to "static" info for the match
				2973
				2974	Returns: TRUE if matched
				2975	*/
				2976
				2977	static BOOL
				2978	match(register uschar eptr, register uschar ecode, int offset_top,
				2979	match_data *md)
				2980	{
				2981	int save_stack_position = md->point;
				2982	match_loop:
				2983
				2984	#define SUCCEED goto succeed
				2985	#define FAIL goto fail
				2986
				2987	for (;;)
				2988	{
				2989	int min, max, ctype;
				2990	register int i;
				2991	register int c;
Guido van Rossum	c386107	1997-10-08 02:07:40 +0000	[diff] [blame]	2992	BOOL minimize = 0;
Guido van Rossum	51b3aa3	1997-10-06 14:43:11 +0000	[diff] [blame]	2993
				2994	/* Opening bracket. Check the alternative branches in turn, failing if none
				2995	match. We have to set the start offset if required and there is space
				2996	in the offset vector so that it is available for subsequent back references
				2997	if the bracket matches. However, if the bracket fails, we must put back the
				2998	previous value of both offsets in case they were set by a previous copy of
				2999	the same bracket. Don't worry about setting the flag for the error case here;
				3000	that is handled in the code for KET. */
				3001
				3002	if ((int)*ecode >= OP_BRA)
				3003	{
				3004	int number = (*ecode - OP_BRA) << 1;
Guido van Rossum	c386107	1997-10-08 02:07:40 +0000	[diff] [blame]	3005	int save_offset1 = 0, save_offset2 = 0;
Guido van Rossum	51b3aa3	1997-10-06 14:43:11 +0000	[diff] [blame]	3006
				3007	#ifdef DEBUG
				3008	printf("start bracket %d\n", number/2);
				3009	#endif
				3010
				3011	if (number > 0 && number < md->offset_end)
				3012	{
				3013	save_offset1 = md->offset_vector[number];
				3014	save_offset2 = md->offset_vector[number+1];
				3015	md->offset_vector[number] = eptr - md->start_subject;
				3016
				3017	#ifdef DEBUG
				3018	printf("saving %d %d\n", save_offset1, save_offset2);
				3019	#endif
				3020	}
				3021
				3022	/* Recurse for all the alternatives. */
				3023
				3024	do
				3025	{
				3026	if (match(eptr, ecode+3, offset_top, md)) SUCCEED;
				3027	ecode += (ecode[1] << 8) + ecode[2];
				3028	}
				3029	while (*ecode == OP_ALT);
				3030
				3031	#ifdef DEBUG
				3032	printf("bracket %d failed\n", number/2);
				3033	#endif
				3034
				3035	if (number > 0 && number < md->offset_end)
				3036	{
				3037	md->offset_vector[number] = save_offset1;
				3038	md->offset_vector[number+1] = save_offset2;
				3039	}
				3040
				3041	FAIL;
				3042	}
				3043
				3044	/* Other types of node can be handled by a switch */
				3045
				3046	switch(*ecode)
				3047	{
				3048	case OP_END:
				3049	md->end_match_ptr = eptr; /* Record where we ended */
				3050	md->end_offset_top = offset_top; /* and how many extracts were taken */
				3051	SUCCEED;
				3052
				3053	/* Assertion brackets. Check the alternative branches in turn - the
				3054	matching won't pass the KET for an assertion. If any one branch matches,
				3055	the assertion is true. */
				3056
				3057	case OP_ASSERT:
				3058	do
				3059	{
				3060	if (match(eptr, ecode+3, offset_top, md)) break;
				3061	ecode += (ecode[1] << 8) + ecode[2];
				3062	}
				3063	while (*ecode == OP_ALT);
				3064	if (*ecode == OP_KET) FAIL;
				3065
				3066	/* Continue from after the assertion, updating the offsets high water
				3067	mark, since extracts may have been taken during the assertion. */
				3068
				3069	do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
				3070	ecode += 3;
				3071	offset_top = md->end_offset_top;
				3072	continue;
				3073
				3074	/* Negative assertion: all branches must fail to match */
				3075
				3076	case OP_ASSERT_NOT:
				3077	do
				3078	{
				3079	if (match(eptr, ecode+3, offset_top, md)) FAIL;
				3080	ecode += (ecode[1] << 8) + ecode[2];
				3081	}
				3082	while (*ecode == OP_ALT);
				3083	ecode += 3;
				3084	continue;
				3085
				3086	/* An alternation is the end of a branch; scan along to find the end of the
				3087	bracketed group and go to there. */
				3088
				3089	case OP_ALT:
				3090	do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
				3091	break;
				3092
				3093	/* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
				3094	that it may occur zero times. It may repeat infinitely, or not at all -
				3095	i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
				3096	repeat limits are compiled as a number of copies, with the optional ones
				3097	preceded by BRAZERO or BRAMINZERO. */
				3098
				3099	case OP_BRAZERO:
				3100	{
				3101	uschar *next = ecode+1;
				3102	if (match(eptr, next, offset_top, md)) SUCCEED;
				3103	do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
				3104	ecode = next + 3;
				3105	}
				3106	break;
				3107
				3108	case OP_BRAMINZERO:
				3109	{
				3110	uschar *next = ecode+1;
				3111	do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
				3112	if (match(eptr, next+3, offset_top, md)) SUCCEED;
				3113	ecode++;
				3114	}
				3115	break;;
				3116
				3117	/* End of a group, repeated or non-repeating. If we are at the end of
				3118	an assertion "group", stop matching and SUCCEED, but record the
				3119	current high water mark for use by positive assertions. */
				3120
				3121	case OP_KET:
				3122	case OP_KETRMIN:
				3123	case OP_KETRMAX:
				3124	{
				3125	int number, start, end;
				3126	uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
				3127
				3128	if (prev == OP_ASSERT \|\| prev == OP_ASSERT_NOT)
				3129	{
				3130	md->end_offset_top = offset_top;
				3131	SUCCEED;
				3132	}
				3133
				3134	/* In all other cases we have to check the group number back at the
				3135	start and if necessary complete handling an extraction by setting the
				3136	final offset and bumping the high water mark. */
				3137
				3138	number = (*prev - OP_BRA) << 1;
				3139
				3140	#ifdef DEBUG
				3141	printf("end bracket %d\n", number/2);
				3142	#endif
				3143
				3144	if (number > 0)
				3145	{
				3146	if (number >= md->offset_end) md->offset_overflow = TRUE; else
				3147	{
				3148	start=md->offset_vector[number] ; end =md->offset_vector[number+1];
				3149	md->offset_vector[number+1] = eptr - md->start_subject;
				3150	if (offset_top <= number) offset_top = number + 2;
				3151	}
				3152	}
				3153
				3154	/* For a non-repeating ket, just advance to the next node and continue at
				3155	this level. */
				3156
				3157	if (*ecode == OP_KET)
				3158	{
				3159	ecode += 3;
				3160	break;
				3161	}
				3162
				3163	/* The repeating kets try the rest of the pattern or restart from the
				3164	preceding bracket, in the appropriate order. */
				3165
				3166	if (*ecode == OP_KETRMIN)
				3167	{
				3168	uschar *ptr;
				3169	if (match(eptr, ecode+3, offset_top, md)) goto succeed;
				3170	/* Handle alternation inside the BRA...KET; push the additional
				3171	alternatives onto the stack
				3172	XXX this tries the alternatives backwards! */
				3173	ptr=prev;
				3174	do {
				3175	ptr += (ptr[1]<<8)+ ptr[2];
				3176	if (*ptr==OP_ALT)
				3177	{
				3178	if (md->length == md->point) grow_stack(md);
				3179	md->offset_top[md->point] = offset_top;
				3180	md->eptr[md->point] = eptr;
				3181	md->ecode[md->point] = ptr+3;
				3182	md->r1[md->point] = 0;
				3183	md->r2[md->point] = 0;
				3184	md->off_num[md->point] = 0;
				3185	md->point++;
				3186	}
				3187	} while (*ptr==OP_ALT);
				3188	ecode=prev+3; goto match_loop;
				3189	}
				3190	else /* OP_KETRMAX */
				3191	{
				3192	uschar *ptr;
				3193	int points_pushed=0;
				3194
				3195	/* Push one failure point, that will resume matching at the code after
				3196	the KETRMAX opcode. */
				3197	if (md->length == md->point) grow_stack(md);
				3198	md->offset_top[md->point] = offset_top;
				3199	md->eptr[md->point] = eptr;
				3200	md->ecode[md->point] = ecode+3;
				3201	md->r1[md->point] = md->offset_vector[number];
				3202	md->r2[md->point] = md->offset_vector[number+1];
				3203	md->off_num[md->point] = number;
				3204	md->point++;
				3205
				3206	md->offset_vector[number] = eptr - md->start_subject;
				3207	/* Handle alternation inside the BRA...KET; push each of the
				3208	additional alternatives onto the stack
				3209	XXX this tries the alternatives backwards! */
				3210	ptr=prev;
				3211	do {
				3212	ptr += (ptr[1]<<8)+ ptr[2];
				3213	if (*ptr==OP_ALT)
				3214	{
				3215	if (md->length == md->point) grow_stack(md);
				3216	md->offset_top[md->point] = offset_top;
				3217	md->eptr[md->point] = eptr;
				3218	md->ecode[md->point] = ptr+3;
				3219	md->r1[md->point] = 0;
				3220	md->r2[md->point] = 0;
				3221	md->off_num[md->point] = 0;
				3222	md->point++;
				3223	points_pushed++;
				3224	}
				3225	} while (*ptr==OP_ALT);
				3226	/* Jump to the first (or only) alternative and resume trying to match */
				3227	ecode=prev+3; goto match_loop;
				3228	}
				3229	}
				3230	FAIL;
				3231
				3232	/* Start of subject, or after internal newline if multiline */
				3233
				3234	case OP_CIRC:
				3235	if (md->multiline)
				3236	{
				3237	if (eptr != md->start_subject && eptr[-1] != '\n') FAIL;
				3238	ecode++;
				3239	break;
				3240	}
				3241	/* ... else fall through */
				3242
				3243	/* Start of subject assertion */
				3244
				3245	case OP_SOD:
				3246	if (eptr != md->start_subject) FAIL;
				3247	ecode++;
				3248	break;
				3249
				3250	/* End of subject, or before internal newline if multiline */
				3251
				3252	case OP_DOLL:
				3253	if (md->multiline)
				3254	{
				3255	if (eptr < md->end_subject && *eptr != '\n') FAIL;
				3256	ecode++;
				3257	break;
				3258	}
				3259	/* ... else fall through */
				3260
				3261	/* End of subject assertion */
				3262
				3263	case OP_EOD:
				3264	if (eptr < md->end_subject) FAIL;
				3265	ecode++;
				3266	break;
				3267
				3268	/* Word boundary assertions */
				3269
				3270	case OP_NOT_WORD_BOUNDARY:
				3271	case OP_WORD_BOUNDARY:
				3272	{
				3273	BOOL prev_is_word = (eptr != md->start_subject) &&
				3274	((pcre_ctypes[eptr[-1]] & ctype_word) != 0);
				3275	BOOL cur_is_word = (eptr < md->end_subject) &&
				3276	((pcre_ctypes[*eptr] & ctype_word) != 0);
				3277	if ((*ecode++ == OP_WORD_BOUNDARY)?
				3278	cur_is_word == prev_is_word : cur_is_word != prev_is_word)
				3279	FAIL;
				3280	}
				3281	break;
				3282
				3283	/* Match a single character type; inline for speed */
				3284
				3285	case OP_ANY:
				3286	if (!md->dotall && eptr < md->end_subject && *eptr == '\n') FAIL;
				3287	if (eptr++ >= md->end_subject) FAIL;
				3288	ecode++;
				3289	break;
				3290
				3291	case OP_NOT_DIGIT:
				3292	if (eptr >= md->end_subject \|\| (pcre_ctypes[*eptr++] & ctype_digit) != 0)
				3293	FAIL;
				3294	ecode++;
				3295	break;
				3296
				3297	case OP_DIGIT:
				3298	if (eptr >= md->end_subject \|\| (pcre_ctypes[*eptr++] & ctype_digit) == 0)
				3299	FAIL;
				3300	ecode++;
				3301	break;
				3302
				3303	case OP_NOT_WHITESPACE:
				3304	if (eptr >= md->end_subject \|\| (pcre_ctypes[*eptr++] & ctype_space) != 0)
				3305	FAIL;
				3306	ecode++;
				3307	break;
				3308
				3309	case OP_WHITESPACE:
				3310	if (eptr >= md->end_subject \|\| (pcre_ctypes[*eptr++] & ctype_space) == 0)
				3311	FAIL;
				3312	ecode++;
				3313	break;
				3314
				3315	case OP_NOT_WORDCHAR:
				3316	if (eptr >= md->end_subject \|\| (pcre_ctypes[*eptr++] & ctype_word) != 0)
				3317	FAIL;
				3318	ecode++;
				3319	break;
				3320
				3321	case OP_WORDCHAR:
				3322	if (eptr >= md->end_subject \|\| (pcre_ctypes[*eptr++] & ctype_word) == 0)
				3323	FAIL;
				3324	ecode++;
				3325	break;
				3326
				3327	/* Match a back reference, possibly repeatedly. Look past the end of the
				3328	item to see if there is repeat information following. The code is similar
				3329	to that for character classes, but repeated for efficiency. Then obey
				3330	similar code to character type repeats - written out again for speed.
				3331	However, if the referenced string is the empty string, always treat
				3332	it as matched, any number of times (otherwise there could be infinite
				3333	loops). */
				3334
				3335	case OP_REF:
				3336	{
				3337	int length;
				3338	int number = ecode[1] << 1; /* Doubled reference number */
				3339	ecode += 2; /* Advance past the item */
				3340
				3341	if (number >= offset_top \|\| md->offset_vector[number] < 0)
				3342	{
				3343	md->errorcode = PCRE_ERROR_BADREF;
				3344	FAIL;
				3345	}
				3346
				3347	length = md->offset_vector[number+1] - md->offset_vector[number];
				3348
				3349	switch (*ecode)
				3350	{
				3351	case OP_CRSTAR:
				3352	case OP_CRMINSTAR:
				3353	case OP_CRPLUS:
				3354	case OP_CRMINPLUS:
				3355	case OP_CRQUERY:
				3356	case OP_CRMINQUERY:
				3357	c = *ecode++ - OP_CRSTAR;
				3358	minimize = (c & 1) != 0;
				3359	min = rep_min[c]; /* Pick up values from tables; */
				3360	max = rep_max[c]; /* zero for max => infinity */
				3361	if (max == 0) max = INT_MAX;
				3362	break;
				3363
				3364	case OP_CRRANGE:
				3365	case OP_CRMINRANGE:
				3366	minimize = (*ecode == OP_CRMINRANGE);
				3367	min = (ecode[1] << 8) + ecode[2];
				3368	max = (ecode[3] << 8) + ecode[4];
				3369	if (max == 0) max = INT_MAX;
				3370	ecode += 5;
				3371	break;
				3372
				3373	default: /* No repeat follows */
				3374	if (!match_ref(number, eptr, length, md)) FAIL;
				3375	eptr += length;
				3376	continue; /* With the main loop */
				3377	}
				3378
				3379	/* If the length of the reference is zero, just continue with the
				3380	main loop. */
				3381
				3382	if (length == 0) continue;
				3383
				3384	/* First, ensure the minimum number of matches are present. We get back
				3385	the length of the reference string explicitly rather than passing the
				3386	address of eptr, so that eptr can be a register variable. */
				3387
				3388	for (i = 1; i <= min; i++)
				3389	{
				3390	if (!match_ref(number, eptr, length, md)) FAIL;
				3391	eptr += length;
				3392	}
				3393
				3394	/* If min = max, continue at the same level without recursion.
				3395	They are not both allowed to be zero. */
				3396
				3397	if (min == max) continue;
				3398
				3399	/* If minimizing, keep trying and advancing the pointer */
				3400
				3401	if (minimize)
				3402	{
				3403	for (i = min;; i++)
				3404	{
				3405	if (match(eptr, ecode, offset_top, md)) SUCCEED;
				3406	if (i >= max \|\| !match_ref(number, eptr, length, md))
				3407	FAIL;
				3408	eptr += length;
				3409	}
				3410	/* Control never gets here */
				3411	}
				3412
				3413	/* If maximizing, find the longest string and work backwards */
				3414
				3415	else
				3416	{
				3417	uschar *pp = eptr;
				3418	for (i = min; i < max; i++)
				3419	{
				3420	if (!match_ref(number, eptr, length, md)) break;
				3421	eptr += length;
				3422	}
				3423	while (eptr >= pp)
				3424	{
				3425	if (match(eptr, ecode, offset_top, md)) SUCCEED;
				3426	eptr -= length;
				3427	}
				3428	FAIL;
				3429	}
				3430	}
				3431	/* Control never gets here */
				3432
				3433	/* Match a character class, possibly repeatedly. Look past the end of the
				3434	item to see if there is repeat information following. Then obey similar
				3435	code to character type repeats - written out again for speed. */
				3436
				3437	case OP_CLASS:
				3438	case OP_NEGCLASS:
				3439	{
				3440	BOOL result = *ecode == OP_CLASS;
				3441	uschar data = ecode; / Save for matching */
				3442
				3443	ecode += 4 + 2 * ecode[2] + ecode[3]; /* Advance past the item */
				3444
				3445	switch (*ecode)
				3446	{
				3447	case OP_CRSTAR:
				3448	case OP_CRMINSTAR:
				3449	case OP_CRPLUS:
				3450	case OP_CRMINPLUS:
				3451	case OP_CRQUERY:
				3452	case OP_CRMINQUERY:
				3453	c = *ecode++ - OP_CRSTAR;
				3454	minimize = (c & 1) != 0;
				3455	min = rep_min[c]; /* Pick up values from tables; */
				3456	max = rep_max[c]; /* zero for max => infinity */
				3457	if (max == 0) max = INT_MAX;
				3458	break;
				3459
				3460	case OP_CRRANGE:
				3461	case OP_CRMINRANGE:
				3462	minimize = (*ecode == OP_CRMINRANGE);
				3463	min = (ecode[1] << 8) + ecode[2];
				3464	max = (ecode[3] << 8) + ecode[4];
				3465	if (max == 0) max = INT_MAX;
				3466	ecode += 5;
				3467	break;
				3468
				3469	default: /* No repeat follows */
				3470	if (eptr >= md->end_subject \|\| !match_class(data, *eptr++, result, md))
				3471	FAIL;
				3472	continue; /* With the main loop */
				3473	}
				3474
				3475	/* First, ensure the minimum number of matches are present. */
				3476
				3477	for (i = 1; i <= min; i++)
				3478	if (eptr >= md->end_subject \|\| !match_class(data, *eptr++, result, md))
				3479	FAIL;
				3480
				3481	/* If max == min we can continue with the main loop without the
				3482	need to recurse. */
				3483
				3484	if (min == max) continue;
				3485
				3486	/* If minimizing, keep testing the rest of the expression and advancing
				3487	the pointer while it matches the class. */
				3488
				3489	if (minimize)
				3490	{
				3491	for (i = min;; i++)
				3492	{
				3493	if (match(eptr, ecode, offset_top, md)) SUCCEED;
				3494	if (i >= max \|\| eptr >= md->end_subject \|\|
				3495	!match_class(data, *eptr++, result, md)) FAIL;
				3496	}
				3497	/* Control never gets here */
				3498	}
				3499
				3500	/* If maximizing, find the longest possible run, then work backwards. */
				3501
				3502	else
				3503	{
				3504	uschar *pp = eptr;
				3505	for (i = min; i < max; i++)
				3506	{
				3507	if (eptr >= md->end_subject \|\| !match_class(data, *eptr, result, md))
				3508	break;
				3509	eptr++;
				3510	}
				3511	while (eptr >= pp)
				3512	if (match(eptr--, ecode, offset_top, md)) SUCCEED;
				3513	FAIL;
				3514	}
				3515	}
				3516	/* Control never gets here */
				3517
				3518	/* Match a run of characters */
				3519
				3520	case OP_CHARS:
				3521	{
				3522	register int length = ecode[1];
				3523	ecode += 2;
				3524
				3525	#ifdef DEBUG
				3526	if (eptr >= md->end_subject)
				3527	printf("matching subject <null> against pattern ");
				3528	else
				3529	{
				3530	printf("matching subject ");
				3531	pchars(eptr, length, TRUE, md);
				3532	printf(" against pattern ");
				3533	}
				3534	pchars(ecode, length, FALSE, md);
				3535	printf("\n");
				3536	#endif
				3537
				3538	if (length > md->end_subject - eptr) FAIL;
				3539	if (md->caseless)
				3540	{
				3541	while (length-- > 0) if (pcre_lcc[ecode++] != pcre_lcc[eptr++]) FAIL;
				3542	}
				3543	else
				3544	{
				3545	while (length-- > 0) if (ecode++ != eptr++) FAIL;
				3546	}
				3547	}
				3548	break;
				3549
				3550	/* Match a single character repeatedly; different opcodes share code. */
				3551
				3552	case OP_EXACT:
				3553	min = max = (ecode[1] << 8) + ecode[2];
				3554	ecode += 3;
				3555	goto REPEATCHAR;
				3556
				3557	case OP_UPTO:
				3558	case OP_MINUPTO:
				3559	min = 0;
				3560	max = (ecode[1] << 8) + ecode[2];
				3561	minimize = *ecode == OP_MINUPTO;
				3562	ecode += 3;
				3563	goto REPEATCHAR;
				3564
				3565	case OP_STAR:
				3566	case OP_MINSTAR:
				3567	case OP_PLUS:
				3568	case OP_MINPLUS:
				3569	case OP_QUERY:
				3570	case OP_MINQUERY:
				3571	c = *ecode++ - OP_STAR;
				3572	minimize = (c & 1) != 0;
				3573	min = rep_min[c]; /* Pick up values from tables; */
				3574	max = rep_max[c]; /* zero for max => infinity */
				3575	if (max == 0) max = INT_MAX;
				3576
				3577	/* Common code for all repeated single-character matches. We can give
				3578	up quickly if there are fewer than the minimum number of characters left in
				3579	the subject. */
				3580
				3581	REPEATCHAR:
				3582	if (min > md->end_subject - eptr) FAIL;
				3583	c = *ecode++;
				3584
				3585	/* The code is duplicated for the caseless and caseful cases, for speed,
				3586	since matching characters is likely to be quite common. First, ensure the
				3587	minimum number of matches are present. If min = max, continue at the same
				3588	level without recursing. Otherwise, if minimizing, keep trying the rest of
				3589	the expression and advancing one matching character if failing, up to the
				3590	maximum. Alternatively, if maximizing, find the maximum number of
				3591	characters and work backwards. */
				3592
				3593	#ifdef DEBUG
				3594	printf("matching %c{%d,%d} against subject %.*s\n", c, min, max,
				3595	max, eptr);
				3596	#endif
				3597
				3598	if (md->caseless)
				3599	{
				3600	c = pcre_lcc[c];
				3601	for (i = 1; i <= min; i++) if (c != pcre_lcc[*eptr++]) FAIL;
				3602	if (min == max) continue;
				3603	if (minimize)
				3604	{
				3605	for (i = min;; i++)
				3606	{
				3607	if (match(eptr, ecode, offset_top, md)) SUCCEED;
				3608	if (i >= max \|\| eptr >= md->end_subject \|\| c != pcre_lcc[*eptr++])
				3609	FAIL;
				3610	}
				3611	/* Control never gets here */
				3612	}
				3613	else
				3614	{
				3615	uschar *pp = eptr;
				3616	for (i = min; i < max; i++)
				3617	{
				3618	if (eptr >= md->end_subject \|\| c != pcre_lcc[*eptr]) break;
				3619	eptr++;
				3620	}
				3621	while (eptr >= pp)
				3622	if (match(eptr--, ecode, offset_top, md)) SUCCEED;
				3623	FAIL;
				3624	}
				3625	}
				3626
				3627	/* Caseful comparisons */
				3628
				3629	else
				3630	{
				3631	for (i = 1; i <= min; i++) if (c != *eptr++) FAIL;
				3632	if (min == max) continue;
				3633	if (minimize)
				3634	{
				3635	for (i = min;; i++)
				3636	{
				3637	if (match(eptr, ecode, offset_top, md)) SUCCEED;
				3638	if (i >= max \|\| eptr >= md->end_subject \|\| c != *eptr++) FAIL;
				3639	}
				3640	/* Control never gets here */
				3641	}
				3642	else
				3643	{
				3644	uschar *pp = eptr;
				3645	for (i = min; i < max; i++)
				3646	{
				3647	if (eptr >= md->end_subject \|\| c != *eptr) break;
				3648	eptr++;
				3649	}
				3650	while (eptr >= pp)
				3651	if (match(eptr--, ecode, offset_top, md)) SUCCEED;
				3652	FAIL;
				3653	}
				3654	}
				3655	/* Control never gets here */
				3656
				3657	/* Match a single character type repeatedly; several different opcodes
				3658	share code. This is very similar to the code for single characters, but we
				3659	repeat it in the interests of efficiency. */
				3660
				3661	case OP_TYPEEXACT:
				3662	min = max = (ecode[1] << 8) + ecode[2];
				3663	minimize = TRUE;
				3664	ecode += 3;
				3665	goto REPEATTYPE;
				3666
				3667	case OP_TYPEUPTO:
				3668	case OP_TYPEMINUPTO:
				3669	min = 0;
				3670	max = (ecode[1] << 8) + ecode[2];
				3671	minimize = *ecode == OP_TYPEMINUPTO;
				3672	ecode += 3;
				3673	goto REPEATTYPE;
				3674
				3675	case OP_TYPESTAR:
				3676	case OP_TYPEMINSTAR:
				3677	case OP_TYPEPLUS:
				3678	case OP_TYPEMINPLUS:
				3679	case OP_TYPEQUERY:
				3680	case OP_TYPEMINQUERY:
				3681	c = *ecode++ - OP_TYPESTAR;
				3682	minimize = (c & 1) != 0;
				3683	min = rep_min[c]; /* Pick up values from tables; */
				3684	max = rep_max[c]; /* zero for max => infinity */
				3685	if (max == 0) max = INT_MAX;
				3686
				3687	/* Common code for all repeated single character type matches */
				3688
				3689	REPEATTYPE:
				3690	ctype = ecode++; / Code for the character type */
				3691
				3692	/* First, ensure the minimum number of matches are present. Use inline
				3693	code for maximizing the speed, and do the type test once at the start
				3694	(i.e. keep it out of the loop). Also test that there are at least the
				3695	minimum number of characters before we start. */
				3696
				3697	if (min > md->end_subject - eptr) FAIL;
				3698	if (min > 0) switch(ctype)
				3699	{
				3700	case OP_ANY:
				3701	if (!md->dotall)
				3702	{ for (i = 1; i <= min; i++) if (*eptr++ == '\n') FAIL; }
				3703	else eptr += min;
				3704	break;
				3705
				3706	case OP_NOT_DIGIT:
				3707	for (i = 1; i <= min; i++)
				3708	if ((pcre_ctypes[*eptr++] & ctype_digit) != 0) FAIL;
				3709	break;
				3710
				3711	case OP_DIGIT:
				3712	for (i = 1; i <= min; i++)
				3713	if ((pcre_ctypes[*eptr++] & ctype_digit) == 0) FAIL;
				3714	break;
				3715
				3716	case OP_NOT_WHITESPACE:
				3717	for (i = 1; i <= min; i++)
				3718	if ((pcre_ctypes[*eptr++] & ctype_space) != 0) FAIL;
				3719	break;
				3720
				3721	case OP_WHITESPACE:
				3722	for (i = 1; i <= min; i++)
				3723	if ((pcre_ctypes[*eptr++] & ctype_space) == 0) FAIL;
				3724	break;
				3725
				3726	case OP_NOT_WORDCHAR:
				3727	for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) != 0)
				3728	FAIL;
				3729	break;
				3730
				3731	case OP_WORDCHAR:
				3732	for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) == 0)
				3733	FAIL;
				3734	break;
				3735	}
				3736
				3737	/* If min = max, continue at the same level without recursing */
				3738
				3739	if (min == max) continue;
				3740
				3741	/* If minimizing, we have to test the rest of the pattern before each
				3742	subsequent match, so inlining isn't much help; just use the function. */
				3743
				3744	if (minimize)
				3745	{
				3746	for (i = min;; i++)
				3747	{
				3748	if (match(eptr, ecode, offset_top, md)) SUCCEED;
				3749	if (i >= max \|\| eptr >= md->end_subject \|\|
				3750	!match_type(ctype, *eptr++, md->dotall))
				3751	FAIL;
				3752	}
				3753	/* Control never gets here */
				3754	}
				3755
				3756	/* If maximizing it is worth using inline code for speed, doing the type
				3757	test once at the start (i.e. keep it out of the loop). */
				3758
				3759	else
				3760	{
				3761	uschar *pp = eptr;
				3762	switch(ctype)
				3763	{
				3764	case OP_ANY:
				3765	if (!md->dotall)
				3766	{
				3767	for (i = min; i < max; i++)
				3768	{
				3769	if (eptr >= md->end_subject \|\| *eptr == '\n') break;
				3770	eptr++;
				3771	}
				3772	}
				3773	else
				3774	{
				3775	c = max - min;
				3776	if (c > md->end_subject - eptr) c = md->end_subject - eptr;
				3777	eptr += c;
				3778	}
				3779	break;
				3780
				3781	case OP_NOT_DIGIT:
				3782	for (i = min; i < max; i++)
				3783	{
				3784	if (eptr >= md->end_subject \|\| (pcre_ctypes[*eptr] & ctype_digit) != 0)
				3785	break;
				3786	eptr++;
				3787	}
				3788	break;
				3789
				3790	case OP_DIGIT:
				3791	for (i = min; i < max; i++)
				3792	{
				3793	if (eptr >= md->end_subject \|\| (pcre_ctypes[*eptr] & ctype_digit) == 0)
				3794	break;
				3795	eptr++;
				3796	}
				3797	break;
				3798
				3799	case OP_NOT_WHITESPACE:
				3800	for (i = min; i < max; i++)
				3801	{
				3802	if (eptr >= md->end_subject \|\| (pcre_ctypes[*eptr] & ctype_space) != 0)
				3803	break;
				3804	eptr++;
				3805	}
				3806	break;
				3807
				3808	case OP_WHITESPACE:
				3809	for (i = min; i < max; i++)
				3810	{
				3811	if (eptr >= md->end_subject \|\| (pcre_ctypes[*eptr] & ctype_space) == 0)
				3812	break;
				3813	eptr++;
				3814	}
				3815	break;
				3816
				3817	case OP_NOT_WORDCHAR:
				3818	for (i = min; i < max; i++)
				3819	{
				3820	if (eptr >= md->end_subject \|\| (pcre_ctypes[*eptr] & ctype_word) != 0)
				3821	break;
				3822	eptr++;
				3823	}
				3824	break;
				3825
				3826	case OP_WORDCHAR:
				3827	for (i = min; i < max; i++)
				3828	{
				3829	if (eptr >= md->end_subject \|\| (pcre_ctypes[*eptr] & ctype_word) == 0)
				3830	break;
				3831	eptr++;
				3832	}
				3833	break;
				3834	}
				3835
				3836	while (eptr >= pp)
				3837	if (match(eptr--, ecode, offset_top, md)) SUCCEED;
				3838	FAIL;
				3839	}
				3840	/* Control never gets here */
				3841
				3842	/* There's been some horrible disaster. */
				3843
				3844	default:
				3845	#ifdef DEBUG
				3846	printf("Unknown opcode %d\n", *ecode);
				3847	#endif
				3848	md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
				3849	FAIL;
				3850	}
				3851
				3852	/* Do not stick any code in here without much thought; it is assumed
				3853	that "continue" in the code above comes out to here to repeat the main
				3854	loop. */
				3855
				3856	} /* End of main loop */
				3857	/* Control never reaches here */
				3858
				3859	fail:
				3860	if (md->point > save_stack_position)
				3861	{
				3862	/* If there are still points remaining on the stack, pop the next one off */
Guido van Rossum	c386107	1997-10-08 02:07:40 +0000	[diff] [blame]	3863	int off_num;
Guido van Rossum	51b3aa3	1997-10-06 14:43:11 +0000	[diff] [blame]	3864
				3865	md->point--;
				3866	offset_top = md->offset_top[md->point];
				3867	eptr = md->eptr[md->point];
				3868	ecode = md->ecode[md->point];
				3869	off_num = md->off_num[md->point];
				3870	md->offset_vector[off_num] = md->r1[md->point];
				3871	md->offset_vector[off_num+1] = md->r2[md->point];
				3872	goto match_loop;
				3873	}
				3874	/* Failure, and nothing left on the stack, so end this function call */
				3875
				3876	/* Restore the top of the stack to where it was before this function
				3877	call. This lets us use one stack for everything; recursive calls
				3878	can push and pop information, and may increase the stack. When
				3879	the call returns, the parent function can resume pushing and
				3880	popping wherever it was. */
				3881
				3882	md->point = save_stack_position;
				3883	return FALSE;
				3884
				3885	succeed:
				3886	return TRUE;
				3887	}
				3888
				3889
				3890	/*************************************************
				3891	* Execute a Regular Expression *
				3892	*************************************************/
				3893
				3894	/* This function applies a compiled re to a subject string and picks out
				3895	portions of the string if it matches. Two elements in the vector are set for
				3896	each substring: the offsets to the start and end of the substring.
				3897
				3898	Arguments:
				3899	re points to the compiled expression
				3900	extra points to "hints" from pcre_study() or is NULL
				3901	subject points to the subject string
				3902	length length of subject string (may contain binary zeros)
				3903	options option bits
				3904	offsets points to a vector of ints to be filled in with offsets
				3905	offsetcount the number of elements in the vector
				3906
				3907	Returns: > 0 => success; value is the number of elements filled in
				3908	= 0 => success, but offsets is not big enough
				3909	-1 => failed to match
				3910	< -1 => some kind of unexpected problem
				3911	*/
				3912
				3913	int
				3914	pcre_exec(pcre external_re, pcre_extra external_extra, char *subject,
				3915	int length, int options, int *offsets, int offsetcount)
				3916	{
				3917	int resetcount;
				3918	int first_char = -1;
				3919	match_data match_block;
				3920	uschar *start_bits = NULL;
				3921	uschar start_match = (uschar )subject;
				3922	uschar *end_subject;
				3923	real_pcre re = (real_pcre )external_re;
				3924	real_pcre_extra extra = (real_pcre_extra )external_extra;
				3925	BOOL anchored = ((re->options \| options) & PCRE_ANCHORED) != 0;
				3926	BOOL startline = (re->options & PCRE_STARTLINE) != 0;
				3927
				3928	if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
				3929
				3930	if (re == NULL \|\| subject == NULL \|\|
				3931	(offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
				3932	if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
				3933
				3934	match_block.start_subject = (uschar *)subject;
				3935	match_block.end_subject = match_block.start_subject + length;
				3936	end_subject = match_block.end_subject;
				3937
				3938	match_block.caseless = ((re->options \| options) & PCRE_CASELESS) != 0;
				3939	match_block.multiline = ((re->options \|options) & PCRE_MULTILINE) != 0;
				3940	match_block.dotall = ((re->options \|options) & PCRE_DOTALL) != 0;
				3941
				3942	match_block.offset_vector = offsets; /* Where offsets go */
				3943	match_block.offset_end = (offsetcount & (-2)); /* Past max permitted (even) */
				3944	match_block.offset_overflow = FALSE;
				3945
				3946	match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
				3947
				3948	/* Set the stack state to empty */
				3949	match_block.off_num = match_block.offset_top = NULL;
				3950	match_block.r1 = match_block.r2 = NULL;
				3951	match_block.eptr = match_block.ecode = NULL;
				3952	match_block.point = match_block.length = 0;
				3953
				3954	/* Compute the minimum number of offsets that we need to reset each time. Doing
				3955	this makes a huge difference to execution time when there aren't many brackets
				3956	in the pattern. */
				3957
				3958	resetcount = 2 + re->top_bracket * 2;
				3959	if (resetcount > offsetcount) resetcount = offsetcount;
				3960
				3961	/* If MULTILINE is set at exec time but was not set at compile time, and the
				3962	anchored flag is set, we must re-check because a setting provoked by ^ in the
				3963	pattern is not right in multi-line mode. Calling is_anchored() again here does
				3964	the right check, because multiline is now set. If it now yields FALSE, the
				3965	expression must have had ^ starting some of its branches. Check to see if
				3966	that is true for all branches, and if so, set the startline flag. */
				3967
				3968	if (match_block. multiline && anchored && (re->options & PCRE_MULTILINE) == 0 &&
				3969	!is_anchored(re->code, match_block.multiline))
				3970	{
				3971	anchored = FALSE;
				3972	if (is_startline(re->code)) startline = TRUE;
				3973	}
				3974
				3975	/* Set up the first character to match, if available. The first_char value is
				3976	never set for an anchored regular expression, but the anchoring may be forced
				3977	at run time, so we have to test for anchoring. The first char may be unset for
				3978	an unanchored pattern, of course. If there's no first char and the pattern was
				3979	studied, the may be a bitmap of possible first characters. However, we can
				3980	use this only if the caseless state of the studying was correct. */
				3981
				3982	if (!anchored)
				3983	{
				3984	if ((re->options & PCRE_FIRSTSET) != 0)
				3985	{
				3986	first_char = re->first_char;
				3987	if (match_block.caseless) first_char = pcre_lcc[first_char];
				3988	}
				3989	else
				3990	if (!startline && extra != NULL &&
				3991	(extra->options & PCRE_STUDY_MAPPED) != 0 &&
				3992	((extra->options & PCRE_STUDY_CASELESS) != 0) == match_block.caseless)
				3993	start_bits = extra->start_bits;
				3994	}
				3995
				3996	/* Loop for unanchored matches; for anchored regexps the loop runs just once. */
				3997
				3998	do
				3999	{
				4000	register int *iptr = offsets;
				4001	register int *iend = offsets + resetcount;
				4002
				4003	/* Reset the maximum number of extractions we might see. */
				4004
				4005	while (iptr < iend) *iptr++ = -1;
				4006
				4007	/* Advance to a unique first char if possible */
				4008
				4009	if (first_char >= 0)
				4010	{
				4011	if (match_block.caseless)
				4012	while (start_match < end_subject && pcre_lcc[*start_match] != first_char)
				4013	start_match++;
				4014	else
				4015	while (start_match < end_subject && *start_match != first_char)
				4016	start_match++;
				4017	}
				4018
				4019	/* Or to just after \n for a multiline match if possible */
				4020
				4021	else if (startline)
				4022	{
				4023	if (start_match > match_block.start_subject)
				4024	{
				4025	while (start_match < end_subject && start_match[-1] != '\n')
				4026	start_match++;
				4027	}
				4028	}
				4029
				4030	/* Or to a non-unique first char */
				4031
				4032	else if (start_bits != NULL)
				4033	{
				4034	while (start_match < end_subject)
				4035	{
				4036	register int c = *start_match;
				4037	if ((start_bits[c/8] & (1<<(c%8))) == 0) start_match++; else break;
				4038	}
				4039	}
				4040
				4041	#ifdef DEBUG
				4042	printf(">>>> Match against: ");
				4043	pchars(start_match, end_subject - start_match, TRUE, &match_block);
				4044	printf("\n");
				4045	#endif
				4046
				4047	/* When a match occurs, substrings will be set for all internal extractions;
				4048	we just need to set up the whole thing as substring 0 before returning. If
				4049	there were too many extractions, set the return code to zero. */
				4050
				4051	if (match(start_match, re->code, 2, &match_block))
				4052	{
				4053	int rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
				4054	if (match_block.offset_end < 2) rc = 0; else
				4055	{
				4056	offsets[0] = start_match - match_block.start_subject;
				4057	offsets[1] = match_block.end_match_ptr - match_block.start_subject;
				4058	}
				4059	#ifdef DEBUG
				4060	printf(">>>> returning %d\n", rc);
				4061	#endif
				4062	free_stack(&match_block);
				4063	return rc;
				4064	}
				4065	}
				4066	while (!anchored &&
				4067	match_block.errorcode == PCRE_ERROR_NOMATCH &&
				4068	start_match++ < end_subject);
				4069
				4070	#ifdef DEBUG
				4071	printf(">>>> returning %d\n", match_block.errorcode);
				4072	#endif
				4073	free_stack(&match_block);
				4074	return match_block.errorcode;
				4075	}
				4076
				4077	/* End of pcre.c */