Blame - Modules/pypcre.c - platform/external/python/cpython2

blob: 9b4b8eee47a042f7f23a3d109cffb608193b8eb7 [file] [log] [blame]

Guido van Rossum	51b3aa3	1997-10-06 14:43:11 +0000	[diff] [blame]	1
				2	/*************************************************
				3	* Perl-Compatible Regular Expressions *
				4	*************************************************/
				5
				6	/* DO NOT EDIT THIS FILE! */
				7
				8	/* This file is automatically written by the merge-files.py script
				9	included with the PCRE distribution for Python; it's produced from
				10	several C files, and code is removed in the process. If you want to
				11	modify the code or track down bugs, it will be much easier to work
				12	with the code in its original, multiple-file form. Don't edit this
				13	file by hand, or submit patches to it.
				14
				15	The Python-specific PCRE distribution can be retrieved from
				16	http://starship.skyport.net/crew/amk/regex/
				17
				18	The unmodified original PCRE distribution doesn't have a fixed URL
				19	yet; write Philip Hazel <ph10@cam.ac.uk> for the latest version.
				20
				21	Written by: Philip Hazel <ph10@cam.ac.uk>
				22
				23	Extensively modified by the Python String-SIG: <string-sig@python.org>
				24	Send bug reports to: <string-sig@python.org>
				25	(They'll figure out if it's a bug in PCRE or in the Python-specific
				26	changes.)
				27
				28	Copyright (c) 1997 University of Cambridge
				29
				30	-----------------------------------------------------------------------------
				31	Permission is granted to anyone to use this software for any purpose on any
				32	computer system, and to redistribute it freely, subject to the following
				33	restrictions:
				34
				35	1. This software is distributed in the hope that it will be useful,
				36	but WITHOUT ANY WARRANTY; without even the implied warranty of
				37	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
				38
				39	2. The origin of this software must not be misrepresented, either by
				40	explicit claim or by omission.
				41
				42	3. Altered versions must be plainly marked as such, and must not be
				43	misrepresented as being the original software.
				44	-----------------------------------------------------------------------------
				45	*/
				46
				47
				48	#define FOR_PYTHON
				49	#include "pcre-internal.h"
				50	#include "Python.h"
				51	#include "graminit.h"
				52
				53	/*************************************************
				54	* Perl-Compatible Regular Expressions *
				55	*************************************************/
				56
				57	/* This file is automatically written by the makechartables auxiliary
				58	program. If you edit it by hand, you might like to edit the Makefile to
				59	prevent its ever being regenerated. */
				60
				61	/* This table is a lower casing table. */
				62
				63	unsigned char pcre_lcc[] = {
				64	0, 1, 2, 3, 4, 5, 6, 7,
				65	8, 9, 10, 11, 12, 13, 14, 15,
				66	16, 17, 18, 19, 20, 21, 22, 23,
				67	24, 25, 26, 27, 28, 29, 30, 31,
				68	32, 33, 34, 35, 36, 37, 38, 39,
				69	40, 41, 42, 43, 44, 45, 46, 47,
				70	48, 49, 50, 51, 52, 53, 54, 55,
				71	56, 57, 58, 59, 60, 61, 62, 63,
				72	64, 97, 98, 99,100,101,102,103,
				73	104,105,106,107,108,109,110,111,
				74	112,113,114,115,116,117,118,119,
				75	120,121,122, 91, 92, 93, 94, 95,
				76	96, 97, 98, 99,100,101,102,103,
				77	104,105,106,107,108,109,110,111,
				78	112,113,114,115,116,117,118,119,
				79	120,121,122,123,124,125,126,127,
				80	128,129,130,131,132,133,134,135,
				81	136,137,138,139,140,141,142,143,
				82	144,145,146,147,148,149,150,151,
				83	152,153,154,155,156,157,158,159,
				84	160,161,162,163,164,165,166,167,
				85	168,169,170,171,172,173,174,175,
				86	176,177,178,179,180,181,182,183,
				87	184,185,186,187,188,189,190,191,
				88	192,193,194,195,196,197,198,199,
				89	200,201,202,203,204,205,206,207,
				90	208,209,210,211,212,213,214,215,
				91	216,217,218,219,220,221,222,223,
				92	224,225,226,227,228,229,230,231,
				93	232,233,234,235,236,237,238,239,
				94	240,241,242,243,244,245,246,247,
				95	248,249,250,251,252,253,254,255 };
				96
				97	/* This table is an upper casing table. */
				98
				99	unsigned char pcre_ucc[] = {
				100	0, 1, 2, 3, 4, 5, 6, 7,
				101	8, 9, 10, 11, 12, 13, 14, 15,
				102	16, 17, 18, 19, 20, 21, 22, 23,
				103	24, 25, 26, 27, 28, 29, 30, 31,
				104	32, 33, 34, 35, 36, 37, 38, 39,
				105	40, 41, 42, 43, 44, 45, 46, 47,
				106	48, 49, 50, 51, 52, 53, 54, 55,
				107	56, 57, 58, 59, 60, 61, 62, 63,
				108	64, 65, 66, 67, 68, 69, 70, 71,
				109	72, 73, 74, 75, 76, 77, 78, 79,
				110	80, 81, 82, 83, 84, 85, 86, 87,
				111	88, 89, 90, 91, 92, 93, 94, 95,
				112	96, 65, 66, 67, 68, 69, 70, 71,
				113	72, 73, 74, 75, 76, 77, 78, 79,
				114	80, 81, 82, 83, 84, 85, 86, 87,
				115	88, 89, 90,123,124,125,126,127,
				116	128,129,130,131,132,133,134,135,
				117	136,137,138,139,140,141,142,143,
				118	144,145,146,147,148,149,150,151,
				119	152,153,154,155,156,157,158,159,
				120	160,161,162,163,164,165,166,167,
				121	168,169,170,171,172,173,174,175,
				122	176,177,178,179,180,181,182,183,
				123	184,185,186,187,188,189,190,191,
				124	192,193,194,195,196,197,198,199,
				125	200,201,202,203,204,205,206,207,
				126	208,209,210,211,212,213,214,215,
				127	216,217,218,219,220,221,222,223,
				128	224,225,226,227,228,229,230,231,
				129	232,233,234,235,236,237,238,239,
				130	240,241,242,243,244,245,246,247,
				131	248,249,250,251,252,253,254,255 };
				132
				133	/* This table identifies various classes of character by individual bits:
				134	1 white space character
				135	2 decimal digit
				136	4 hexadecimal digit
				137	8 alphanumeric or '_'
				138	16 octal digit
				139	128 regular expression metacharacter or binary zero
				140	*/
				141
				142	unsigned char pcre_ctypes[] = {
				143	0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
				144	0x00,0x01,0x01,0x01,0x01,0x01,0x00,0x00, /* 8- 15 */
				145	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
				146	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
				147	0x01,0x00,0x00,0x00,0x80,0x00,0x00,0x00, /* - ' */
				148	0x80,0x80,0x80,0x80,0x00,0x00,0x80,0x00, /* ( - / */
				149	0x1e,0x1e,0x1e,0x1e,0x1e,0x1e,0x1e,0x1e, /* 0 - 7 */
				150	0x0e,0x0e,0x00,0x00,0x00,0x00,0x00,0x80, /* 8 - ? */
				151	0x00,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x08, /* @ - G */
				152	0x08,0x08,0x08,0x08,0x08,0x08,0x08,0x08, /* H - O */
				153	0x08,0x08,0x08,0x08,0x08,0x08,0x08,0x08, /* P - W */
				154	0x08,0x08,0x08,0x80,0x00,0x00,0x80,0x08, /* X - _ */
				155	0x00,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x08, /* ` - g */
				156	0x08,0x08,0x08,0x08,0x08,0x08,0x08,0x08, /* h - o */
				157	0x08,0x08,0x08,0x08,0x08,0x08,0x08,0x08, /* p - w */
				158	0x08,0x08,0x08,0x80,0x80,0x00,0x00,0x00, /* x -127 */
				159	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
				160	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
				161	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
				162	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
				163	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
				164	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
				165	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
				166	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
				167	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
				168	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
				169	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
				170	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
				171	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
				172	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
				173	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
				174	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
				175
				176	/* End of pcre-chartables.c */
				177	/*************************************************
				178	* Perl-Compatible Regular Expressions *
				179	*************************************************/
				180
				181	/*
				182	This is a library of functions to support regular expressions whose syntax
				183	and semantics are as close as possible to those of the Perl 5 language.
				184
				185	Written by: Philip Hazel <ph10@cam.ac.uk>
				186
				187	Copyright (c) 1997 University of Cambridge
				188
				189	-----------------------------------------------------------------------------
				190	Permission is granted to anyone to use this software for any purpose on any
				191	computer system, and to redistribute it freely, subject to the following
				192	restrictions:
				193
				194	1. This software is distributed in the hope that it will be useful,
				195	but WITHOUT ANY WARRANTY; without even the implied warranty of
				196	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
				197
				198	2. The origin of this software must not be misrepresented, either by
				199	explicit claim or by omission.
				200
				201	3. Altered versions must be plainly marked as such, and must not be
				202	misrepresented as being the original software.
				203	-----------------------------------------------------------------------------
				204
				205	See the file Tech.Notes for some information on the internals.
				206	*/
				207
				208	/* This module contains the actual definition of global variables that are
				209	shared between the different modules. In fact, these are limited to the
				210	indirections for memory management functions. */
				211
				212	/* Include the internals header, which itself includes Standard C headers plus
				213	the external pcre header. */
				214
				215
				216	/* Store get and free functions. */
				217
				218	void (pcre_malloc)(size_t) = malloc;
				219	void (pcre_free)(void ) = free;
				220
				221	/* End of pcre-globals.c */
				222	/*************************************************
				223	* Perl-Compatible Regular Expressions *
				224	*************************************************/
				225
				226	/*
				227	This is a library of functions to support regular expressions whose syntax
				228	and semantics are as close as possible to those of the Perl 5 language. See
				229	the file Tech.Notes for some information on the internals.
				230
				231	Written by: Philip Hazel <ph10@cam.ac.uk>
				232
				233	Copyright (c) 1997 University of Cambridge
				234
				235	-----------------------------------------------------------------------------
				236	Permission is granted to anyone to use this software for any purpose on any
				237	computer system, and to redistribute it freely, subject to the following
				238	restrictions:
				239
				240	1. This software is distributed in the hope that it will be useful,
				241	but WITHOUT ANY WARRANTY; without even the implied warranty of
				242	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
				243
				244	2. The origin of this software must not be misrepresented, either by
				245	explicit claim or by omission.
				246
				247	3. Altered versions must be plainly marked as such, and must not be
				248	misrepresented as being the original software.
				249	-----------------------------------------------------------------------------
				250	*/
				251
				252
				253	/* Include the internals header, which itself includes Standard C headers plus
				254	the external pcre header. */
				255
				256
				257
				258	/* Character types for class type bits */
				259
				260	static char class_types[] = { ctype_digit, ctype_space, ctype_word };
				261
				262
				263
				264	/*************************************************
				265	* Set a range of bits in the map *
				266	*************************************************/
				267
				268	/* This function is called for character types.
				269
				270	Arguments:
				271	start_bits points to the bit map
				272	type a character type bit
				273	include TRUE to include the type;
				274	FALSE to include all but the type
				275
				276	Returns: nothing
				277	*/
				278
				279	static void
				280	set_type_bits(uschar *start_bits, int type, BOOL include)
				281	{
				282	int i;
				283	for (i = 0; i < 256; i++)
				284	if (((pcre_ctypes[i] & type) != 0) == include) start_bits[i/8] \|= (1<<(i%8));
				285	}
				286
				287
				288
				289	/*************************************************
				290	* Set one bit in the map *
				291	*************************************************/
				292
				293	/* This function is called to set a bit in the map for a given character,
				294	or both cases of a letter if caseless. It could be replaced by a macro if
				295	better performance is wanted.
				296
				297	Arguments:
				298	start_bits points to 32-byte table
				299	c the character
				300	caseless TRUE if caseless
				301
				302	Returns: nothing
				303	*/
				304
				305	static void
				306	set_bit(uschar *start_bits, int c, BOOL caseless)
				307	{
				308	if (caseless)
				309	{
				310	int d = pcre_ucc[c];
				311	start_bits[d/8] \|= (1<<(d%8));
				312	c = pcre_lcc[c];
				313	}
				314	start_bits[c/8] \|= (1<<(c%8));
				315	}
				316
				317
				318
				319	/*************************************************
				320	* Create bitmap of starting chars *
				321	*************************************************/
				322
				323	/* This function scans a compiled unanchored expression and attempts to build a
				324	bitmap of the set of initial characters. If it can't, it returns FALSE. As time
				325	goes by, we may be able to get more clever at doing this.
				326
				327	Arguments:
				328	code points to an expression
				329	start_bits points to a 32-byte table, initialized to 0
				330	caseless TRUE if caseless
				331
				332	Returns: TRUE if table built, FALSE otherwise
				333	*/
				334
				335	static BOOL
				336	set_start_bits(uschar code, uschar start_bits, BOOL caseless)
				337	{
				338	do
				339	{
				340	uschar *tcode = code + 3;
				341	BOOL try_next = TRUE;
				342
				343	while (try_next)
				344	{
				345	try_next = FALSE;
				346
				347	if ((int)tcode >= OP_BRA \|\| tcode == OP_ASSERT)
				348	{
				349	if (!set_start_bits(tcode, start_bits, caseless)) return FALSE;
				350	}
				351
				352	else switch(*tcode)
				353	{
				354	default:
				355	return FALSE;
				356
				357	/* BRAZERO does the bracket, but carries on. */
				358
				359	case OP_BRAZERO:
				360	case OP_BRAMINZERO:
				361	if (!set_start_bits(++tcode, start_bits, caseless)) return FALSE;
				362	do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
				363	tcode += 3;
				364	try_next = TRUE;
				365	break;
				366
				367	/* Single-char * or ? sets the bit and tries the next item */
				368
				369	case OP_STAR:
				370	case OP_MINSTAR:
				371	case OP_QUERY:
				372	case OP_MINQUERY:
				373	set_bit(start_bits, tcode[1], caseless);
				374	tcode += 2;
				375	try_next = TRUE;
				376	break;
				377
				378	/* Single-char upto sets the bit and tries the next */
				379
				380	case OP_UPTO:
				381	case OP_MINUPTO:
				382	set_bit(start_bits, tcode[3], caseless);
				383	tcode += 4;
				384	try_next = TRUE;
				385	break;
				386
				387	/* At least one single char sets the bit and stops */
				388
				389	case OP_EXACT: /* Fall through */
				390	tcode++;
				391
				392	case OP_CHARS: /* Fall through */
				393	tcode++;
				394
				395	case OP_PLUS:
				396	case OP_MINPLUS:
				397	set_bit(start_bits, tcode[1], caseless);
				398	break;
				399
				400	/* Single character type sets the bits and stops */
				401
				402	case OP_NOT_DIGIT:
				403	set_type_bits(start_bits, ctype_digit, FALSE);
				404	break;
				405
				406	case OP_DIGIT:
				407	set_type_bits(start_bits, ctype_digit, TRUE);
				408	break;
				409
				410	case OP_NOT_WHITESPACE:
				411	set_type_bits(start_bits, ctype_space, FALSE);
				412	break;
				413
				414	case OP_WHITESPACE:
				415	set_type_bits(start_bits, ctype_space, TRUE);
				416	break;
				417
				418	case OP_NOT_WORDCHAR:
				419	set_type_bits(start_bits, ctype_word, FALSE);
				420	break;
				421
				422	case OP_WORDCHAR:
				423	set_type_bits(start_bits, ctype_word, TRUE);
				424	break;
				425
				426	/* One or more character type fudges the pointer and restarts, knowing
				427	it will hit a single character type and stop there. */
				428
				429	case OP_TYPEPLUS:
				430	case OP_TYPEMINPLUS:
				431	tcode++;
				432	try_next = TRUE;
				433	break;
				434
				435	case OP_TYPEEXACT:
				436	tcode += 3;
				437	try_next = TRUE;
				438	break;
				439
				440	/* Zero or more repeats of character types set the bits and then
				441	try again. */
				442
				443	case OP_TYPEUPTO:
				444	case OP_TYPEMINUPTO:
				445	tcode += 2; /* Fall through */
				446
				447	case OP_TYPESTAR:
				448	case OP_TYPEMINSTAR:
				449	case OP_TYPEQUERY:
				450	case OP_TYPEMINQUERY:
				451	switch(tcode[1])
				452	{
				453	case OP_NOT_DIGIT:
				454	set_type_bits(start_bits, ctype_digit, FALSE);
				455	break;
				456
				457	case OP_DIGIT:
				458	set_type_bits(start_bits, ctype_digit, TRUE);
				459	break;
				460
				461	case OP_NOT_WHITESPACE:
				462	set_type_bits(start_bits, ctype_space, FALSE);
				463	break;
				464
				465	case OP_WHITESPACE:
				466	set_type_bits(start_bits, ctype_space, TRUE);
				467	break;
				468
				469	case OP_NOT_WORDCHAR:
				470	set_type_bits(start_bits, ctype_word, FALSE);
				471	break;
				472
				473	case OP_WORDCHAR:
				474	set_type_bits(start_bits, ctype_word, TRUE);
				475	break;
				476	}
				477
				478	tcode += 2;
				479	try_next = TRUE;
				480	break;
				481
				482	/* Character class: set the bits and either carry on or not,
				483	according to the repeat count. */
				484
				485	case OP_CLASS:
				486	case OP_NEGCLASS:
				487	{
				488	uschar *base = tcode;
				489	uschar data, end;
				490	uschar *bitmap = start_bits;
				491	uschar local[32];
				492	int flags = base[1];
				493	int i;
				494
				495	tcode += 4 + 2 * tcode[2] + tcode[3]; /* Advance past the item */
				496	switch (*tcode)
				497	{
				498	case OP_CRSTAR:
				499	case OP_CRMINSTAR:
				500	case OP_CRQUERY:
				501	case OP_CRMINQUERY:
				502	tcode++;
				503	try_next = TRUE;
				504	break;
				505
				506	case OP_CRRANGE:
				507	case OP_CRMINRANGE:
				508	if (((tcode[1] << 8) + tcode[2]) == 0)
				509	{
				510	tcode += 5;
				511	try_next = TRUE;
				512	}
				513	break;
				514	}
				515
				516	/* For a negated class, we have to build a separate table of all
				517	the bits in the class, and then turn all other bits on in the main
				518	table. Otherwise there are problems with things like [^\da]. */
				519
				520	if (*base == OP_NEGCLASS)
				521	{
				522	memset(local, 0, 32);
				523	bitmap = local;
				524	}
				525
				526	/* Character types */
				527
				528	for (i = 0; flags != 0; i++)
				529	{
				530	if ((flags & 1) != 0)
				531	set_type_bits(bitmap, class_types[i/2], (i & 1) == 0);
				532	flags >>= 1;
				533	}
				534
				535	/* Ranges and individual characters */
				536
				537	data = base + 4;
				538	end = data + base[2] * 2;
				539	while (data < end)
				540	{
				541	for (i = *data; i <= data[1]; i++) set_bit(bitmap, i, caseless);
				542	data += 2;
				543	}
				544
				545	end += base[3];
				546	while (data < end) set_bit(bitmap, *data++, caseless);
				547
				548	/* If a negated class, transfer data from local map to the main one */
				549
				550	if (bitmap != start_bits)
				551	for (i = 0; i < 32; i++) start_bits[i] \|= ~local[i];
				552	}
				553	break; /* End of class handling */
				554
				555	} /* End of switch */
				556	} /* End of try_next loop */
				557
				558	code += (code[1] << 8) + code[2]; /* Advance to next branch */
				559	}
				560	while (*code == OP_ALT);
				561	return TRUE;
				562	}
				563
				564
				565
				566	/*************************************************
				567	* Study a compiled expression *
				568	*************************************************/
				569
				570	/* This function is handed a compiled expression that it must study to produce
				571	information that will speed up the matching. It returns a pcre_extra block
				572	which then gets handed back to pcre_exec().
				573
				574	Arguments:
				575	re points to the compiled expression
				576	options contains option bits
				577	errorptr points to where to place error messages;
				578	set NULL unless error
				579
				580	Returns: pointer to a pcre_extra block,
				581	NULL on error or if no optimization possible
				582	*/
				583
				584	pcre_extra *
				585	pcre_study(pcre external_re, int options, char *errorptr)
				586	{
				587	BOOL caseless;
				588	uschar start_bits[32];
				589	real_pcre_extra *extra;
				590	real_pcre re = (real_pcre )external_re;
				591
				592	*errorptr = NULL;
				593
				594	if (re == NULL \|\| re->magic_number != MAGIC_NUMBER)
				595	{
				596	*errorptr = "argument is not a compiled regular expression";
				597	return NULL;
				598	}
				599
				600	if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
				601	{
				602	*errorptr = "unknown or incorrect option bit(s) set";
				603	return NULL;
				604	}
				605
				606	/* For an anchored pattern, or an unchored pattern that has a first char, or a
				607	multiline pattern that matches only at "line starts", no further processing at
				608	present. */
				609
				610	if ((re->options & (PCRE_ANCHORED\|PCRE_FIRSTSET\|PCRE_STARTLINE)) != 0)
				611	return NULL;
				612
				613	/* Determine the caseless state from the compiled pattern and the current
				614	options. */
				615
				616	caseless = ((re->options \| options) & PCRE_CASELESS) != 0;
				617
				618	/* See if we can find a fixed set of initial characters for the pattern. */
				619
				620	memset(start_bits, 0, 32);
				621	if (!set_start_bits(re->code, start_bits, caseless)) return NULL;
				622
				623	/* Get an "extra" block and put the information therein. */
				624
				625	extra = (real_pcre_extra *)(pcre_malloc)(sizeof(real_pcre_extra));
				626
				627	if (extra == NULL)
				628	{
				629	*errorptr = "failed to get memory";
				630	return NULL;
				631	}
				632	extra->options = PCRE_STUDY_MAPPED \| (caseless? PCRE_STUDY_CASELESS : 0);
				633	memcpy(extra->start_bits, start_bits, 32);
				634
				635	return (pcre_extra *)extra;
				636	}
				637
				638	/* End of pcre-study.c */
				639	/*************************************************
				640	* Perl-Compatible Regular Expressions *
				641	*************************************************/
				642
				643	/*
				644	This is a library of functions to support regular expressions whose syntax
				645	and semantics are as close as possible to those of the Perl 5 language. See
				646	the file Tech.Notes for some information on the internals.
				647
				648	Written by: Philip Hazel <ph10@cam.ac.uk>
				649
				650	Copyright (c) 1997 University of Cambridge
				651
				652	-----------------------------------------------------------------------------
				653	Permission is granted to anyone to use this software for any purpose on any
				654	computer system, and to redistribute it freely, subject to the following
				655	restrictions:
				656
				657	1. This software is distributed in the hope that it will be useful,
				658	but WITHOUT ANY WARRANTY; without even the implied warranty of
				659	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
				660
				661	2. The origin of this software must not be misrepresented, either by
				662	explicit claim or by omission.
				663
				664	3. Altered versions must be plainly marked as such, and must not be
				665	misrepresented as being the original software.
				666	-----------------------------------------------------------------------------
				667	*/
				668
				669
				670	/* Define DEBUG to get debugging output on stdout. */
				671
				672	/* #define DEBUG */
				673
				674
				675	/* Include the internals header, which itself includes Standard C headers plus
				676	the external pcre header. */
				677
				678
				679	#ifndef Py_eval_input
				680	/* For Python 1.4, graminit.h has to be explicitly included */
				681	#define Py_eval_input eval_input
				682	#endif
				683
				684	/* Min and max values for the common repeats; for the maxima, 0 => infinity */
				685
				686	static char rep_min[] = { 0, 0, 1, 1, 0, 0 };
				687	static char rep_max[] = { 0, 0, 0, 0, 1, 1 };
				688
				689	/* Text forms of OP_ values and things, for debugging */
				690
				691	#ifdef DEBUG
				692	static char *OP_names[] = { "End", "\\A", "\\B", "\\b", "\\D", "\\d",
				693	"\\S", "\\s", "\\W", "\\w", "\\Z", "^", "$", "Any", "chars",
				694	"", "?", "+", "+?", "?", "??", "{", "{", "{",
				695	"", "?", "+", "+?", "?", "??", "{", "{", "{",
				696	"", "?", "+", "+?", "?", "??", "{", "{",
				697	"class", "negclass", "Ref",
				698	"Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",
				699	"Brazero", "Braminzero", "Bra"
				700	};
				701
				702	static char *class_names[] = { "\\d", "\\D", "\\s", "\\S", "\\w", "\\W" };
				703	#endif
				704
				705	/* Table of character type operators that correspond to the bits in the
				706	character class flags, starting at the least significant end. */
				707
				708	static char class_ops[] = {
				709	OP_DIGIT, OP_NOT_DIGIT,
				710	OP_WHITESPACE, OP_NOT_WHITESPACE,
				711	OP_WORDCHAR, OP_NOT_WORDCHAR };
				712
				713	/* Table for handling escaped characters in the range '0'-'z'. Positive returns
				714	are simple data values; negative values are for special things like \d and so
				715	on. Zero means further processing is needed (for things like \x), or the escape
				716	is invalid. */
				717
				718	/* PYTHON: Python doesn't support \e, but does support \v */
				719
				720	static short int escapes[] = {
				721	0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
				722	0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
				723	'@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
				724	0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
				725	0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
				726	0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
				727	'`', 7, -ESC_b, 0, -ESC_d, 0, '\f', 0, /* ` - g */
				728	0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
				729	0, 0, '\r', -ESC_s, '\t', 0, '\v', -ESC_w, /* p - w */
				730	0, 0, 0 /* x - z */
				731	};
				732
				733
				734	/* Definition to allow mutual recursion */
				735
				736	static BOOL compile_regexp(BOOL, int , uschar , uschar *,
				737	char *, PyObject );
				738
				739	/* Structure for passing "static" information around between the functions
				740	doing the matching, so that they are thread-safe. */
				741
				742	typedef struct match_data {
				743	int errorcode; /* As it says */
				744	int offset_vector; / Offset vector */
				745	int offset_end; /* One past the end */
				746	BOOL offset_overflow; /* Set if too many extractions */
				747	BOOL caseless; /* Case-independent flag */
				748	BOOL multiline; /* Multiline flag */
				749	uschar start_subject; / Start of the subject string */
				750	uschar end_subject; / End of the subject string */
				751
				752	uschar end_match_ptr; / Subject position at end match */
				753	int end_offset_top; /* Highwater mark at end of match */
				754	BOOL dotall; /* Dotall flag */
				755	int length; /* Length of the allocated stacks */
				756	int point; /* Point to add next item pushed onto stacks */
				757	/* Pointers to the 6 stacks */
				758	int off_num, offset_top, r1, r2;
				759	uschar eptr, ecode;
				760	} match_data;
				761
				762
				763
				764	/*************************************************
				765	* Return version string *
				766	*************************************************/
				767
				768	char *
				769	pcre_version(void)
				770	{
				771	return PCRE_VERSION;
				772	}
				773
				774
				775
				776
				777	/*************************************************
				778	* Return info about a compiled pattern *
				779	*************************************************/
				780
				781	/* This function picks potentially useful data out of the private
				782	structure.
				783
				784	Arguments:
				785	external_re points to compiled code
				786	optptr where to pass back the options
				787	first_char where to pass back the first character,
				788	or -1 if multiline and all branches start ^,
				789	or -2 otherwise
				790
				791	Returns: number of identifying extraction brackets
				792	or negative values on error
				793	*/
				794
				795	int
				796	pcre_info(pcre external_re, int optptr, int *first_char)
				797	{
				798	real_pcre re = (real_pcre )external_re;
				799	if (re == NULL) return PCRE_ERROR_NULL;
				800	if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
				801	if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);
				802	if (first_char != NULL)
				803	*first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
				804	((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
				805	return re->top_bracket;
				806	}
				807
				808
				809
				810
				811	#ifdef DEBUG
				812	/*************************************************
				813	* Debugging function to print chars *
				814	*************************************************/
				815
				816	/* Print a sequence of chars in printable format, stopping at the end of the
				817	subject if the requested.
				818
				819	Arguments:
				820	p points to characters
				821	length number to print
				822	is_subject TRUE if printing from within md->start_subject
				823	md pointer to matching data block, if is_subject is TRUE
				824
				825	Returns: nothing
				826	*/
				827
				828	static pchars(uschar p, int length, BOOL is_subject, match_data md)
				829	{
				830	int c;
				831	if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
				832	while (length-- > 0)
				833	if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
				834	}
				835	#endif
				836
				837
				838
				839
				840	/*************************************************
				841	* Check subpattern for empty operand *
				842	*************************************************/
				843
				844	/* This function checks a bracketed subpattern to see if any of the paths
				845	through it could match an empty string. This is used to diagnose an error if
				846	such a subpattern is followed by a quantifier with an unlimited upper bound.
				847
				848	Argument:
				849	code points to the opening bracket
				850
				851	Returns: TRUE or FALSE
				852	*/
				853
				854	static BOOL
				855	could_be_empty(uschar *code)
				856	{
				857	do {
				858	uschar *cc = code + 3;
				859
				860	/* Scan along the opcodes for this branch; as soon as we find something
				861	that matches a non-empty string, break out and advance to test the next
				862	branch. If we get to the end of the branch, return TRUE for the whole
				863	sub-expression. */
				864
				865	for (;;)
				866	{
				867	/* Test an embedded subpattern; if it could not be empty, break the
				868	loop. Otherwise carry on in the branch. */
				869
				870	if ((int)(*cc) >= OP_BRA)
				871	{
				872	if (!could_be_empty(cc)) break;
				873	do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
				874	cc += 3;
				875	}
				876
				877	else switch (*cc)
				878	{
				879	/* Reached end of a branch: the subpattern may match the empty string */
				880
				881	case OP_ALT:
				882	case OP_KET:
				883	case OP_KETRMAX:
				884	case OP_KETRMIN:
				885	return TRUE;
				886
				887	/* Skip over assertive subpatterns */
				888
				889	case OP_ASSERT:
				890	case OP_ASSERT_NOT:
				891	do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
				892	cc += 3;
				893	break;
				894
				895	/* Skip over things that don't match chars */
				896
				897	case OP_SOD:
				898	case OP_EOD:
				899	case OP_CIRC:
				900	case OP_DOLL:
				901	case OP_BRAZERO:
				902	case OP_BRAMINZERO:
				903	case OP_NOT_WORD_BOUNDARY:
				904	case OP_WORD_BOUNDARY:
				905	cc++;
				906	break;
				907
				908	/* Skip over simple repeats with zero lower bound */
				909
				910	case OP_STAR:
				911	case OP_MINSTAR:
				912	case OP_QUERY:
				913	case OP_MINQUERY:
				914	case OP_TYPESTAR:
				915	case OP_TYPEMINSTAR:
				916	case OP_TYPEQUERY:
				917	case OP_TYPEMINQUERY:
				918	cc += 2;
				919	break;
				920
				921	/* Skip over UPTOs (lower bound is zero) */
				922
				923	case OP_UPTO:
				924	case OP_MINUPTO:
				925	case OP_TYPEUPTO:
				926	case OP_TYPEMINUPTO:
				927	cc += 4;
				928	break;
				929
				930	/* Check a class or a back reference for a zero minimum */
				931
				932	case OP_CLASS:
				933	case OP_NEGCLASS:
				934	case OP_REF:
				935	cc += (cc == OP_REF)? 2 : 4 + 2 cc[2] + cc[3];
				936
				937	switch (*cc)
				938	{
				939	case OP_CRSTAR:
				940	case OP_CRMINSTAR:
				941	case OP_CRQUERY:
				942	case OP_CRMINQUERY:
				943	cc++;
				944	break;
				945
				946	case OP_CRRANGE:
				947	case OP_CRMINRANGE:
				948	if ((cc[1] << 8) + cc[2] != 0) goto NEXT_BRANCH;
				949	cc += 3;
				950	break;
				951
				952	default:
				953	goto NEXT_BRANCH;
				954	}
				955	break;
				956
				957	/* Anything else matches at least one character */
				958
				959	default:
				960	goto NEXT_BRANCH;
				961	}
				962	}
				963
				964	NEXT_BRANCH:
				965	code += (code[1] << 8) + code[2];
				966	}
				967	while (*code == OP_ALT);
				968
				969	/* No branches match the empty string */
				970
				971	return FALSE;
				972	}
				973
				974
				975	/* Determine the length of a group ID in an expression like
				976	(?P<foo_123>...)
				977	Arguments:
				978	ptr pattern position pointer (say that 3 times fast)
				979	finalchar the character that will mark the end of the ID
				980	errorptr points to the pointer to the error message
				981	*/
				982
				983	static int
				984	get_group_id(uschar ptr, char finalchar, char *errorptr)
				985	{
				986	uschar *start = ptr;
				987
				988	/* If the first character is not in \w, or is in \w but is a digit,
				989	report an error */
				990	if (!(pcre_ctypes[*ptr] & ctype_word) \|\|
				991	(pcre_ctypes[*ptr++] & ctype_digit))
				992	{
				993	*errorptr = "(?P identifier must start with a letter or underscore";
				994	return 0;
				995	}
				996
				997	/* Increment ptr until we either hit a null byte, the desired
				998	final character, or a non-word character */
				999	for(; (ptr != 0) && (ptr != finalchar) &&
				1000	(pcre_ctypes[*ptr] & ctype_word); ptr++)
				1001	{
				1002	}
				1003	if (*ptr==finalchar)
				1004	return ptr-start;
				1005	if (*ptr==0)
				1006	{
				1007	*errorptr = "unterminated (?P identifier";
				1008	return 0;
				1009	}
				1010	*errorptr = "illegal character in (?P identifier";
				1011	return 0;
				1012	}
				1013
				1014	/*************************************************
				1015	* Handle escapes *
				1016	*************************************************/
				1017
				1018	/* This function is called when a \ has been encountered. It either returns a
				1019	positive value for a simple escape such as \n, or a negative value which
				1020	encodes one of the more complicated things such as \d. On entry, ptr is
				1021	pointing at the \. On exit, it is on the final character of the escape
				1022	sequence.
				1023
				1024	Arguments:
				1025	ptrptr points to the pattern position pointer
				1026	errorptr points to the pointer to the error message
				1027
				1028	Returns: zero or positive => a data character
				1029	negative => a special escape sequence
				1030	on error, errorptr is set
				1031	*/
				1032
				1033	static int
				1034	check_escape(uschar ptrptr, char errorptr)
				1035	{
				1036	uschar ptr = ptrptr;
				1037	int c = (++ptr) & 255; / Ensure > 0 on signed-char systems */
				1038	int i;
				1039
				1040	if (c == 0) *errorptr = "\\ at end of pattern";
				1041
				1042	/* Digits or letters may have special meaning; all others are literals. */
				1043
				1044	else if (c < '0' \|\| c > 'z') {}
				1045
				1046	/* Do an initial lookup in a table. A non-zero result is something that can be
				1047	returned immediately. Otherwise further processing may be required. */
				1048
				1049	else if ((i = escapes[c - '0']) != 0) c = i;
				1050
				1051	/* Escapes that need further processing, or are illegal. */
				1052
				1053	else switch (c)
				1054	{
				1055	case '0':
				1056	c = 0;
				1057	while(i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_odigit) != 0 )
				1058	c = c * 8 + *(++ptr) - '0';
				1059	break;
				1060
				1061	case '1': case '2': case '3': case '4': case '5':
				1062	case '6': case '7': case '8': case '9':
				1063	{
				1064	/* PYTHON: Try to compute an octal value for a character */
				1065	for(c=0, i=0; c!=-1 && ptr[i]!=0 && i<3; i++)
				1066	{
				1067	if (( pcre_ctypes[ ptr[i] ] & ctype_odigit) != 0)
				1068	c = c * 8 + ptr[i]-'0';
				1069	else
				1070	c = -1; /* Non-octal character */
				1071	}
				1072	/* Aha! There were 3 octal digits, so it must be a character */
				1073	if (c != -1 && i == 3)
				1074	{
				1075	ptr += i-1;
				1076	break;
				1077	}
				1078	c = ptr[0]; /* Restore the first character after the \ */
				1079	c -= '0'; i = 1;
				1080	while (i<2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0)
				1081	{
				1082	c = c * 10 + ptr[1] - '0';
				1083	ptr++; i++;
				1084	}
				1085	if (c > 255 - ESC_REF) *errorptr = "back reference too big";
				1086	c = -(ESC_REF + c);
				1087	}
				1088	break;
				1089
				1090	case 'x':
				1091	{
				1092	int end, length;
				1093	char *string;
				1094	PyObject v, result;
				1095
				1096	i=1;
				1097	while (ptr[i]!=0 &&
				1098	( pcre_ctypes[ptr[i]] & ctype_xdigit) != 0)
				1099	i++;
				1100	if (i==1)
				1101	{
				1102	*errorptr="\\x must be followed by hex digits";
				1103	break;
				1104	}
				1105	length=i-1;
				1106	string=malloc(length+4+1);
				1107	if (string==NULL)
				1108	{
				1109	*errorptr="can't allocate memory for \\x string";
				1110	break;
				1111	}
				1112	/* Create a string containing "\x<hexdigits>", which will be
				1113	passed to eval() */
				1114	string[0]=string[length+3]='"';
				1115	string[1]='\\';
				1116	string[length+4]='\0';
				1117	memcpy(string+2, ptr, length+1);
				1118	ptr += length;
				1119	v=PyRun_String((char *)string, Py_eval_input,
				1120	PyEval_GetGlobals(), PyEval_GetLocals());
				1121	free(string);
				1122	/* The evaluation raised an exception */
				1123	if (v==NULL)
				1124	{
				1125	*errorptr="exception occurred during evaluation of \\x";
				1126	break;
				1127	}
				1128	if (PyString_Size(v)!=1)
				1129	{
				1130	Py_DECREF(v);
				1131	*errorptr="\\x string is not one byte in length";
				1132	break;
				1133	}
				1134	c=(unsigned char )PyString_AsString(v);
				1135	Py_DECREF(v);
				1136	break;
				1137	}
				1138	break;
				1139
				1140
				1141	case 'l':
				1142	case 'L':
				1143	case 'u':
				1144	case 'U':
				1145	case 'Q':
				1146	case 'E':
				1147	*errorptr = "the Perl escapes \\u, \\U, \\l, \\L, \\Q, \\E are not valid";
				1148	break;
				1149
				1150	default:
				1151	/* In Python, an unrecognized escape will simply return the character
				1152	after the backslash, so do nothing */
				1153	break;
				1154	}
				1155
				1156	*ptrptr = ptr;
				1157	return c;
				1158	}
				1159
				1160
				1161
				1162	/*************************************************
				1163	* Read repeat counts *
				1164	*************************************************/
				1165
				1166	/* Read an item of the form {n,m} and return the values.
				1167
				1168	Arguments:
				1169	p pointer to first char after '{'
				1170	minp pointer to int for min
				1171	maxp pointer to int for max
				1172	returned as -1 if no max
				1173	errorptr points to pointer to error message
				1174
				1175	Returns: pointer to '}' on success;
				1176	current ptr on error, with errorptr set
				1177	*/
				1178
				1179	static uschar *
				1180	read_repeat_counts(uschar p, int minp, int maxp, char *errorptr)
				1181	{
				1182	int min = 0;
				1183	int max = -1;
				1184
				1185	if ((pcre_ctypes[*p] & ctype_digit) == 0)
				1186	{
				1187	*errorptr = "number expected after {";
				1188	return p;
				1189	}
				1190
				1191	while ((pcre_ctypes[p] & ctype_digit) != 0) min = min 10 + *p++ - '0';
				1192
				1193	if (*p == '}') max = min; else
				1194	{
				1195	if (*p++ != ',')
				1196	{
				1197	*errorptr = "comma expected";
				1198	return p-1;
				1199	}
				1200	if (*p != '}')
				1201	{
				1202	max = 0;
				1203	while((pcre_ctypes[p] & ctype_digit) != 0) max = max 10 + *p++ - '0';
				1204	if (*p != '}')
				1205	{
				1206	*errorptr = "} expected";
				1207	return p;
				1208	}
				1209	if (max < min)
				1210	{
				1211	*errorptr = "numbers out of order";
				1212	return p;
				1213	}
				1214	}
				1215	}
				1216
				1217	/* Do paranoid checks, then fill in the required variables, and pass back the
				1218	pointer to the terminating '}'. */
				1219
				1220	if (max == 0) *errorptr = "zero maximum not allowed";
				1221	else if (min > 65535 \|\| max > 65535) *errorptr = "number too big";
				1222	else
				1223	{
				1224	*minp = min;
				1225	*maxp = max;
				1226	}
				1227	return p;
				1228	}
				1229
				1230
				1231
				1232	/*************************************************
				1233	* Compile one branch *
				1234	*************************************************/
				1235
				1236	/* Scan the pattern, compiling it into the code vector.
				1237
				1238	Arguments:
				1239	extended TRUE if the PCRE_EXTENDED option was set
				1240	brackets points to 2-element bracket vector
				1241	code points to the pointer to the current code point
				1242	ptrptr points to the current pattern pointer
				1243	errorptr points to pointer to error message
				1244
				1245	Returns: TRUE on success
				1246	FALSE, with *errorptr set on error
				1247	*/
				1248
				1249	static BOOL
				1250	compile_branch(BOOL extended, int brackets, uschar *codeptr,
				1251	uschar ptrptr, char errorptr, PyObject *dictionary)
				1252	{
				1253	int repeat_type, op_type;
				1254	int repeat_min, repeat_max;
				1255	int bravalue, length;
				1256	register int c;
				1257	register uschar code = codeptr;
				1258	uschar ptr = ptrptr;
				1259	uschar *previous = NULL;
				1260	uschar *oldptr;
				1261
				1262	/* Switch on next character until the end of the branch */
				1263
				1264	for (;; ptr++)
				1265	{
				1266	c = *ptr;
				1267	if (extended)
				1268	{
				1269	if ((pcre_ctypes[c] & ctype_space) != 0) continue;
				1270	if (c == '#')
				1271	{
				1272	while ((c = *(++ptr)) != 0 && c != '\n');
				1273	continue;
				1274	}
				1275	}
				1276
				1277	switch(c)
				1278	{
				1279	/* The branch terminates at end of string, \|, or ). */
				1280
				1281	case 0:
				1282	case '\|':
				1283	case ')':
				1284	*codeptr = code;
				1285	*ptrptr = ptr;
				1286	return TRUE;
				1287
				1288	/* Handle single-character metacharacters */
				1289
				1290	case '^':
				1291	previous = NULL;
				1292	*code++ = OP_CIRC;
				1293	break;
				1294
				1295	case '$':
				1296	previous = NULL;
				1297	*code++ = OP_DOLL;
				1298	break;
				1299
				1300	case '.':
				1301	previous = code;
				1302	*code++ = OP_ANY;
				1303	break;
				1304
				1305	/* Character classes. We do quite a bit of munging around here. There are
				1306	always four initial bytes: the op_code, a flags byte for things like \d, a
				1307	count of pairs and a count of single characters. The pairs then follow, and
				1308	finally the single characters. */
				1309
				1310	case '[':
				1311	{
				1312	int rangecount = 0;
				1313	int flags = 0;
				1314	int singles_count = 0;
				1315	char singles[256];
				1316
				1317	previous = code;
				1318
				1319	/* If the first character is '^', set the negation flag */
				1320
				1321	if ((c = (++ptr)) == '^') { code = OP_NEGCLASS; c = *(++ptr); }
				1322	else *code = OP_CLASS;
				1323	code += 4;
				1324
				1325	/* Process characters until ] is reached. By writing this as a "do" it
				1326	means that an initial ] is taken as a data character. */
				1327
				1328	do
				1329	{
				1330	if (c == 0)
				1331	{
				1332	*errorptr = "] missing";
				1333	goto FAILED;
				1334	}
				1335
				1336	/*** Perl treats '-' here as a data character, so PCRE had better
				1337	do the same ... cut out this diagnosis.
				1338
				1339	if (c == '-')
				1340	{
				1341	*errorptr = "unexpected '-' in character class";
				1342	goto FAILED;
				1343	}
				1344	... ***/
				1345
				1346	/* Backslash may introduce a single character, or it may introduce one
				1347	of the specials, which just set a flag. Escaped items are checked for
				1348	validity in the pre-compiling pass. The sequence \b is a special case.
				1349	Inside a class (and only there) it is treated as backslash. Elsewhere
				1350	it marks a word boundary. */
				1351
				1352	if (c == '\\')
				1353	{
				1354	uschar *save_ptr = ptr+1;
				1355	c = check_escape(&ptr, errorptr);
				1356	if (c < 0)
				1357	{
				1358	switch (-c)
				1359	{
				1360	case ESC_d: flags \|= CLASS_DIGITS; continue;
				1361	case ESC_D: flags \|= CLASS_NOT_DIGITS; continue;
				1362	case ESC_s: flags \|= CLASS_WHITESPACE; continue;
				1363	case ESC_S: flags \|= CLASS_NOT_WHITESPACE; continue;
				1364	case ESC_w: flags \|= CLASS_WORD; continue;
				1365	case ESC_W: flags \|= CLASS_NOT_WORD; continue;
				1366	default:
				1367	ptr = save_ptr;
				1368	c = *ptr;
				1369	break;
				1370
				1371	case ESC_b: c = '\b'; /* Treat as single character */
				1372	break;
				1373	}
				1374	}
				1375	/* Fall through if single character */
				1376	}
				1377
				1378	/* A single character may be followed by '-' to form a range. However,
				1379	Perl does not permit ']' to be the end of the range. A '-' character
				1380	here is treated as a literal. */
				1381
				1382	if (ptr[1] == '-' && ptr[2] != ']')
				1383	{
				1384	int d;
				1385	ptr += 2;
				1386	d = *ptr;
				1387
				1388	if (d == 0)
				1389	{
				1390	*errorptr = "incomplete range";
				1391	goto FAILED;
				1392	}
				1393
				1394	/* The second part of a range can be a single-character escape, but
				1395	not any of the other escapes. */
				1396
				1397	if (d == '\\')
				1398	{
				1399	d = check_escape(&ptr, errorptr);
				1400	if (d < 0)
				1401	{
				1402	if (d == -ESC_b) d = '\b'; else
				1403	{
				1404	*errorptr = "invalid range";
				1405	goto FAILED;
				1406	}
				1407	}
				1408	}
				1409
				1410	if (d < c)
				1411	{
				1412	*errorptr = "range out of order";
				1413	goto FAILED;
				1414	}
				1415
				1416	if (rangecount >= 255)
				1417	{
				1418	*errorptr = "too many ranges inside []";
				1419	goto FAILED;
				1420	}
				1421
				1422	rangecount++;
				1423	*code++ = c;
				1424	*code++ = d;
				1425	continue;
				1426	}
				1427
				1428	/* Handle a lone single character: save it up for outputting at the
				1429	end. Be paranoid and check that the buffer isn't going to overflow. */
				1430
				1431	if (singles_count >= 255)
				1432	{
				1433	*errorptr = "too many characters inside []";
				1434	goto FAILED;
				1435	}
				1436	singles[singles_count++] = c;
				1437	}
				1438
				1439	/* Loop until ']' reached; the check for end of string happens inside the
				1440	loop. This "while" is the end of the "do" above. */
				1441
				1442	while ((c = *(++ptr)) != ']');
				1443
				1444	/* Copy saved single characters, which follow the ranges in the output. */
				1445
				1446	c = 0;
				1447	while (c < singles_count) *code++ = singles[c++];
				1448
				1449	/* Finally fill in the flags and counts of ranges and single characters,
				1450	and advance the pointer past the ]. */
				1451
				1452	previous[1] = flags;
				1453	previous[2] = rangecount;
				1454	previous[3] = singles_count;
				1455	}
				1456	break;
				1457
				1458	/* Various kinds of repeat */
				1459
				1460	case '{':
				1461	ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
				1462	if (*errorptr != NULL) goto FAILED;
				1463	goto REPEAT;
				1464
				1465	case '*':
				1466	repeat_min = 0;
				1467	repeat_max = -1;
				1468	goto REPEAT;
				1469
				1470	case '+':
				1471	repeat_min = 1;
				1472	repeat_max = -1;
				1473	goto REPEAT;
				1474
				1475	case '?':
				1476	repeat_min = 0;
				1477	repeat_max = 1;
				1478
				1479	REPEAT:
				1480	if (previous == NULL)
				1481	{
				1482	*errorptr = "nothing to repeat";
				1483	goto FAILED;
				1484	}
				1485
				1486	/* If the next character is '?' this is a minimizing repeat. Advance to the
				1487	next character. */
				1488
				1489	if (ptr[1] == '?') { repeat_type = 1; ptr++; } else repeat_type = 0;
				1490
				1491	/* If previous was a string of characters, chop off the last one and use it
				1492	as the subject of the repeat. If there was only one character, we can
				1493	abolish the previous item altogether. */
				1494
				1495	if (*previous == OP_CHARS)
				1496	{
				1497	int len = previous[1];
				1498	if (len == 1)
				1499	{
				1500	c = previous[2];
				1501	code = previous;
				1502	}
				1503	else
				1504	{
				1505	c = previous[len+1];
				1506	previous[1]--;
				1507	code--;
				1508	}
				1509	op_type = 0; /* Use single-char op codes */
				1510	goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
				1511	}
				1512
				1513	/* If previous was a character type match (\d or similar), abolish it and
				1514	create a suitable repeat item. The code is shared with single-character
				1515	repeats by adding a suitable offset into repeat_type. */
				1516
				1517	if ((int)previous < OP_EOD \|\| previous == OP_ANY)
				1518	{
				1519	op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
				1520	c = *previous;
				1521	code = previous;
				1522
				1523	OUTPUT_SINGLE_REPEAT:
				1524	repeat_type += op_type; /* Combine both values for many cases */
				1525
				1526	/* A minimum of zero is handled either as the special case * or ?, or as
				1527	an UPTO, with the maximum given. */
				1528
				1529	if (repeat_min == 0)
				1530	{
				1531	if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
				1532	else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
				1533	else
				1534	{
				1535	*code++ = OP_UPTO + repeat_type;
				1536	*code++ = repeat_max >> 8;
				1537	*code++ = (repeat_max & 255);
				1538	}
				1539	}
				1540
				1541	/* The case {1,} is handled as the special case + */
				1542
				1543	else if (repeat_min == 1 && repeat_max == -1)
				1544	*code++ = OP_PLUS + repeat_type;
				1545
				1546	/* The case {n,n} is just an EXACT, while the general case {n,m} is
				1547	handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
				1548
				1549	else
				1550	{
				1551	if (repeat_min != 1)
				1552	{
				1553	code++ = OP_EXACT + op_type; / NB EXACT doesn't have repeat_type */
				1554	*code++ = repeat_min >> 8;
				1555	*code++ = (repeat_min & 255);
				1556	}
				1557
				1558	/* If the mininum is 1 and the previous item was a character string,
				1559	we either have to put back the item that got cancelled if the string
				1560	length was 1, or add the character back onto the end of a longer
				1561	string. For a character type nothing need be done; it will just get put
				1562	back naturally. */
				1563
				1564	else if (*previous == OP_CHARS)
				1565	{
				1566	if (code == previous) code += 2; else previous[1]++;
				1567	}
				1568
				1569	/* Insert an UPTO if the max is greater than the min. */
				1570
				1571	if (repeat_max != repeat_min)
				1572	{
				1573	*code++ = c;
				1574	repeat_max -= repeat_min;
				1575	*code++ = OP_UPTO + repeat_type;
				1576	*code++ = repeat_max >> 8;
				1577	*code++ = (repeat_max & 255);
				1578	}
				1579	}
				1580
				1581	/* The character or character type itself comes last in all cases. */
				1582
				1583	*code++ = c;
				1584	}
				1585
				1586	/* If previous was a character class or a back reference, we put the repeat
				1587	stuff after it. */
				1588
				1589	else if (previous == OP_CLASS \|\| previous == OP_NEGCLASS \|\|
				1590	*previous == OP_REF)
				1591	{
				1592	if (repeat_min == 0 && repeat_max == -1)
				1593	*code++ = OP_CRSTAR + repeat_type;
				1594	else if (repeat_min == 1 && repeat_max == -1)
				1595	*code++ = OP_CRPLUS + repeat_type;
				1596	else if (repeat_min == 0 && repeat_max == 1)
				1597	*code++ = OP_CRQUERY + repeat_type;
				1598	else
				1599	{
				1600	*code++ = OP_CRRANGE + repeat_type;
				1601	*code++ = repeat_min >> 8;
				1602	*code++ = repeat_min & 255;
				1603	if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
				1604	*code++ = repeat_max >> 8;
				1605	*code++ = repeat_max & 255;
				1606	}
				1607	}
				1608
				1609	/* If previous was a bracket group, we may have to replicate it in certain
				1610	cases. If the maximum repeat count is unlimited, check that the bracket
				1611	group cannot match the empty string, and diagnose an error if it can. */
				1612
				1613	else if ((int)*previous >= OP_BRA)
				1614	{
				1615	int i;
				1616	int length = code - previous;
				1617
				1618	if (repeat_max == -1 && could_be_empty(previous))
				1619	{
				1620	*errorptr = "operand of unlimited repeat could match the empty string";
				1621	goto FAILED;
				1622	}
				1623
				1624	/* If the minimum is greater than zero, and the maximum is unlimited or
				1625	equal to the minimum, the first copy remains where it is, and is
				1626	replicated up to the minimum number of times. This case includes the +
				1627	repeat, but of course no replication is needed in that case. */
				1628
				1629	if (repeat_min > 0 && (repeat_max == -1 \|\| repeat_max == repeat_min))
				1630	{
				1631	for (i = 1; i < repeat_min; i++)
				1632	{
				1633	memcpy(code, previous, length);
				1634	code += length;
				1635	}
				1636	}
				1637
				1638	/* If the minimum is zero, stick BRAZERO in front of the first copy.
				1639	Then, if there is a fixed upper limit, replicated up to that many times,
				1640	sticking BRAZERO in front of all the optional ones. */
				1641
				1642	else
				1643	{
				1644	if (repeat_min == 0)
				1645	{
				1646	memmove(previous+1, previous, length);
				1647	code++;
				1648	*previous++ = OP_BRAZERO + repeat_type;
				1649	}
				1650
				1651	for (i = 1; i < repeat_min; i++)
				1652	{
				1653	memcpy(code, previous, length);
				1654	code += length;
				1655	}
				1656
				1657	for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)
				1658	{
				1659	*code++ = OP_BRAZERO + repeat_type;
				1660	memcpy(code, previous, length);
				1661	code += length;
				1662	}
				1663	}
				1664
				1665	/* If the maximum is unlimited, set a repeater in the final copy. */
				1666
				1667	if (repeat_max == -1) code[-3] = OP_KETRMAX + repeat_type;
				1668	}
				1669
				1670	/* Else there's some kind of shambles */
				1671
				1672	else
				1673	{
				1674	*errorptr = "internal error 1 (unexpected repeat)";
				1675	goto FAILED;
				1676	}
				1677
				1678	/* In all case we no longer have a previous item. */
				1679
				1680	previous = NULL;
				1681	break;
				1682
				1683
				1684	/* Start of nested bracket sub-expression, or comment or lookahead.
				1685	First deal with special things that can come after a bracket; all are
				1686	introduced by ?, and the appearance of any of them means that this is not a
				1687	referencing group. They were checked for validity in the first pass over
				1688	the string, so we don't have to check for syntax errors here. */
				1689
				1690	case '(':
				1691	previous = code; /* Only real brackets can be repeated */
				1692	if (*(++ptr) == '?')
				1693	{
				1694	bravalue = OP_BRA;
				1695
				1696	switch (*(++ptr))
				1697	{
				1698	case '#':
				1699	case 'i':
				1700	case 'm':
				1701	case 's':
				1702	case 'x':
				1703	ptr++;
				1704	while (*ptr != ')') ptr++;
				1705	previous = NULL;
				1706	continue;
				1707
				1708	case ':': /* Non-extracting bracket */
				1709	ptr++;
				1710	break;
				1711
				1712	case '=': /* Assertions can't be repeated */
				1713	bravalue = OP_ASSERT;
				1714	ptr++;
				1715	previous = NULL;
				1716	break;
				1717
				1718	case '!':
				1719	bravalue = OP_ASSERT_NOT;
				1720	ptr++;
				1721	previous = NULL;
				1722	break;
				1723
				1724	case ('P'):
				1725	ptr++;
				1726	if (*ptr=='<')
				1727	{
				1728	/* (?P<groupname>...) */
				1729	int idlen;
				1730	PyObject string, intobj;
				1731
				1732	ptr++;
				1733	idlen = get_group_id(ptr, '>', errorptr);
				1734	if (*errorptr) {
				1735	goto FAILED;
				1736	}
				1737	string = PyString_FromStringAndSize(ptr, idlen);
				1738	intobj = PyInt_FromLong( brackets[0] );
				1739	if (intobj == NULL \|\| string==NULL)
				1740	{
				1741	Py_XDECREF(string);
				1742	Py_XDECREF(intobj);
				1743	*errorptr = "exception raised";
				1744	goto FAILED;
				1745	}
				1746	PyDict_SetItem(dictionary, string, intobj);
				1747	Py_DECREF(string); Py_DECREF(intobj);
				1748	ptr += idlen+1; /* Point to rest of expression */
				1749	goto do_grouping_bracket;
				1750	}
				1751	if (*ptr=='=')
				1752	{
				1753	/* (?P=groupname) */
				1754	int idlen, refnum;
				1755	PyObject string, intobj;
				1756
				1757	ptr++;
				1758	idlen = get_group_id(ptr, ')', errorptr);
				1759	if (*errorptr) {
				1760	goto FAILED;
				1761	}
				1762	string = PyString_FromStringAndSize(ptr, idlen);
				1763	if (string==NULL)
				1764	{
				1765	Py_XDECREF(string);
				1766	*errorptr = "exception raised";
				1767	goto FAILED;
				1768	}
				1769	intobj = PyDict_GetItem(dictionary, string);
				1770	if (intobj==NULL) {
				1771	*errorptr = "?P= group identifier isn't defined";
				1772	goto FAILED;
				1773	}
				1774
				1775	refnum = PyInt_AsLong(intobj);
				1776	Py_DECREF(string); Py_DECREF(intobj);
				1777	*code++ = OP_REF;
				1778	*code++ = refnum;
				1779	/* The continue will cause the top-level for() loop to
				1780	be resumed, so ptr will be immediately incremented.
				1781	Therefore, the following line adds just idlen, not
				1782	idlen+1 */
				1783	ptr += idlen;
				1784	continue;
				1785	}
				1786	/* The character after ?P is neither < nor =, so
				1787	report an error. Add more Python-extensions here. */
				1788	*errorptr="unknown after (?P";
				1789	goto FAILED;
				1790	break;
				1791	default:
				1792	*errorptr = "unknown after (?";
				1793	goto FAILED;
				1794	}
				1795	}
				1796
				1797	/* Else we have a referencing group */
				1798
				1799	else
				1800	{
				1801	do_grouping_bracket:
				1802	if (brackets[0] > EXTRACT_MAX)
				1803	{
				1804	*errorptr = "too many extraction brackets";
				1805	goto FAILED;
				1806	}
				1807	brackets[1] = brackets[0];
				1808	bravalue = OP_BRA + brackets[0]++;
				1809	}
				1810
				1811	/* Process nested bracketed re; at end pointer is on the bracket. We copy
				1812	code into a non-register variable in order to be able to pass its address
				1813	because some compilers complain otherwise. */
				1814
				1815	*code = bravalue;
				1816	{
				1817	uschar *mcode = code;
				1818	if (!compile_regexp(extended, brackets, &mcode, &ptr, errorptr, dictionary))
				1819	goto FAILED;
				1820	code = mcode;
				1821	}
				1822
				1823	if (*ptr != ')')
				1824	{
				1825	*errorptr = "missing )";
				1826	goto FAILED;
				1827	}
				1828	break;
				1829
				1830	/* Check \ for being a real metacharacter; if not, fall through and handle
				1831	it as a data character at the start of a string. Escape items are checked
				1832	for validity in the pre-compiling pass. */
				1833
				1834	case '\\':
				1835	oldptr = ptr;
				1836	c = check_escape(&ptr, errorptr);
				1837
				1838	/* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
				1839	are arranged to be the negation of the corresponding OP_values. For the
				1840	back references, the values are ESC_REF plus the reference number. Only
				1841	back references and those types that consume a character may be repeated.
				1842	We can test for values between ESC_b and ESC_Z for the latter; this may
				1843	have to change if any new ones are ever created. */
				1844
				1845	if (c < 0)
				1846	{
				1847	if (-c >= ESC_REF)
				1848	{
				1849	int refnum = -c -ESC_REF;
				1850	if (brackets[1] < refnum ) {
				1851	*errorptr = "backreference to non-existent group";
				1852	goto FAILED;
				1853	}
				1854	previous = code;
				1855	*code++ = OP_REF;
				1856	*code++ = refnum;
				1857	}
				1858	else
				1859	{
				1860	previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
				1861	*code++ = -c;
				1862	}
				1863	continue;
				1864	}
				1865
				1866	/* Reset and fall through */
				1867
				1868	ptr = oldptr;
				1869	c = '\\';
				1870
				1871	/* Handle a run of data characters until a metacharacter is encountered.
				1872	The first character is guaranteed not to be whitespace or # when the
				1873	extended flag is set. */
				1874
				1875	default:
				1876	previous = code;
				1877	*code = OP_CHARS;
				1878	code += 2;
				1879	length = 0;
				1880
				1881	do
				1882	{
				1883	if (extended)
				1884	{
				1885	if ((pcre_ctypes[c] & ctype_space) != 0) continue;
				1886	if (c == '#')
				1887	{
				1888	while ((c = *(++ptr)) != 0 && c != '\n');
				1889	if (c == 0) break;
				1890	continue;
				1891	}
				1892	}
				1893
				1894	/* Backslash may introduce a data char or a metacharacter. Escaped items
				1895	are checked for validity in the pre-compiling pass. Stop the string
				1896	before a metaitem. */
				1897
				1898	if (c == '\\')
				1899	{
				1900	oldptr = ptr;
				1901	c = check_escape(&ptr, errorptr);
				1902	if (c < 0) { ptr = oldptr; break; }
				1903	}
				1904
				1905	/* Ordinary character or single-char escape */
				1906
				1907	*code++ = c;
				1908	length++;
				1909	}
				1910
				1911	/* This "while" is the end of the "do" above. */
				1912
				1913	while (length < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
				1914
				1915	/* Compute the length and set it in the data vector, and advance to
				1916	the next state. */
				1917
				1918	previous[1] = length;
				1919	ptr--;
				1920	break;
				1921	}
				1922	} /* end of big loop */
				1923
				1924	/* Control never reaches here by falling through, only by a goto for all the
				1925	error states. Pass back the position in the pattern so that it can be displayed
				1926	to the user for diagnosing the error. */
				1927
				1928	FAILED:
				1929	*ptrptr = ptr;
				1930	return FALSE;
				1931	}
				1932
				1933
				1934
				1935
				1936	/*************************************************
				1937	* Compile sequence of alternatives *
				1938	*************************************************/
				1939
				1940	/* On entry, ptr is pointing past the bracket character, but on return
				1941	it points to the closing bracket, or vertical bar, or end of string.
				1942	The code variable is pointing at the byte into which the BRA operator has been
				1943	stored.
				1944
				1945	Argument:
				1946	extended TRUE if PCRE_EXTENDED was set
				1947	brackets -> 2-element vector containing next and top bracket numbers
				1948	codeptr -> the address of the current code pointer
				1949	ptrptr -> the address of the current pattern pointer
				1950	errorptr -> pointer to error message
				1951
				1952	Returns: TRUE on success
				1953	*/
				1954
				1955	static BOOL
				1956	compile_regexp(BOOL extended, int brackets, uschar *codeptr,
				1957	uschar ptrptr, char errorptr, PyObject *dictionary)
				1958	{
				1959	uschar ptr = ptrptr;
				1960	uschar code = codeptr;
				1961	uschar *start_bracket = code;
				1962
				1963	for (;;)
				1964	{
				1965	int length;
				1966	uschar *last_branch = code;
				1967
				1968	code += 3;
				1969	if (!compile_branch(extended, brackets, &code, &ptr, errorptr, dictionary))
				1970	{
				1971	*ptrptr = ptr;
				1972	return FALSE;
				1973	}
				1974
				1975	/* Fill in the length of the last branch */
				1976
				1977	length = code - last_branch;
				1978	last_branch[1] = length >> 8;
				1979	last_branch[2] = length & 255;
				1980
				1981	/* Reached end of expression, either ')' or end of pattern. Insert a
				1982	terminating ket and the length of the whole bracketed item, and return,
				1983	leaving the pointer at the terminating char. */
				1984
				1985	if (*ptr != '\|')
				1986	{
				1987	length = code - start_bracket;
				1988	*code++ = OP_KET;
				1989	*code++ = length >> 8;
				1990	*code++ = length & 255;
				1991	*codeptr = code;
				1992	*ptrptr = ptr;
				1993	return TRUE;
				1994	}
				1995
				1996	/* Another branch follows; insert an "or" node and advance the pointer. */
				1997
				1998	*code = OP_ALT;
				1999	ptr++;
				2000	}
				2001	/* Control never reaches here */
				2002	}
				2003
				2004
				2005
				2006	/*************************************************
				2007	* Check for anchored expression *
				2008	*************************************************/
				2009
				2010	/* Try to find out if this is an anchored regular expression. Consider each
				2011	alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
				2012	all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
				2013	it's anchored. However, if this is a multiline pattern, then only OP_SOD
				2014	counts, since OP_CIRC can match in the middle.
				2015
				2016	A branch is also implicitly anchored if it starts with .* because that will try
				2017	the rest of the pattern at all possible matching points, so there is no point
				2018	trying them again.
				2019
				2020	Argument: points to start of expression (the bracket)
				2021	Returns: TRUE or FALSE
				2022	*/
				2023
				2024	static BOOL
				2025	is_anchored(register uschar *code, BOOL multiline)
				2026	{
				2027	do {
				2028	int op = (int)code[3];
				2029	if (op >= OP_BRA \|\| op == OP_ASSERT)
				2030	{ if (!is_anchored(code+3, multiline)) return FALSE; }
				2031	else if (op == OP_TYPESTAR \|\| op == OP_TYPEMINSTAR)
				2032	{ if (code[4] != OP_ANY) return FALSE; }
				2033	else if (op != OP_SOD && (multiline \|\| op != OP_CIRC)) return FALSE;
				2034	code += (code[1] << 8) + code[2];
				2035	}
				2036	while (*code == OP_ALT);
				2037	return TRUE;
				2038	}
				2039
				2040
				2041
				2042	/*************************************************
				2043	* Check for start with \n line expression *
				2044	*************************************************/
				2045
				2046	/* This is called for multiline expressions to try to find out if every branch
				2047	starts with ^ so that "first char" processing can be done to speed things up.
				2048
				2049	Argument: points to start of expression (the bracket)
				2050	Returns: TRUE or FALSE
				2051	*/
				2052
				2053	static BOOL
				2054	is_startline(uschar *code)
				2055	{
				2056	do {
				2057	if ((int)code[3] >= OP_BRA \|\| code[3] == OP_ASSERT)
				2058	{ if (!is_startline(code+3)) return FALSE; }
				2059	else if (code[3] != OP_CIRC) return FALSE;
				2060	code += (code[1] << 8) + code[2];
				2061	}
				2062	while (*code == OP_ALT);
				2063	return TRUE;
				2064	}
				2065
				2066
				2067
				2068	/*************************************************
				2069	* Check for fixed first char *
				2070	*************************************************/
				2071
				2072	/* Try to find out if there is a fixed first character. This is called for
				2073	unanchored expressions, as it speeds up their processing quite considerably.
				2074	Consider each alternative branch. If they all start with the same char, or with
				2075	a bracket all of whose alternatives start with the same char (recurse ad lib),
				2076	then we return that char, otherwise -1.
				2077
				2078	Argument: points to start of expression (the bracket)
				2079	Returns: -1 or the fixed first char
				2080	*/
				2081
				2082	static int
				2083	find_firstchar(uschar *code)
				2084	{
				2085	register int c = -1;
				2086	do
				2087	{
				2088	register int charoffset = 4;
				2089
				2090	if ((int)code[3] >= OP_BRA \|\| code[3] == OP_ASSERT)
				2091	{
				2092	register int d;
				2093	if ((d = find_firstchar(code+3)) < 0) return -1;
				2094	if (c < 0) c = d; else if (c != d) return -1;
				2095	}
				2096
				2097	else switch(code[3])
				2098	{
				2099	default:
				2100	return -1;
				2101
				2102	case OP_EXACT: /* Fall through */
				2103	charoffset++;
				2104
				2105	case OP_CHARS: /* Fall through */
				2106	charoffset++;
				2107
				2108	case OP_PLUS:
				2109	case OP_MINPLUS:
				2110	if (c < 0) c = code[charoffset]; else if (c != code[charoffset]) return -1;
				2111	break;
				2112	}
				2113	code += (code[1] << 8) + code[2];
				2114	}
				2115	while (*code == OP_ALT);
				2116	return c;
				2117	}
				2118
				2119
				2120
				2121	/*************************************************
				2122	* Compile a Regular Expression *
				2123	*************************************************/
				2124
				2125	/* This function takes a string and returns a pointer to a block of store
				2126	holding a compiled version of the expression.
				2127
				2128	Arguments:
				2129	pattern the regular expression
				2130	options various option bits
				2131	errorptr pointer to pointer to error text
				2132	erroroffset ptr offset in pattern where error was detected
				2133
				2134	Returns: pointer to compiled data block, or NULL on error,
				2135	with errorptr and erroroffset set
				2136	*/
				2137
				2138	pcre *
				2139	pcre_compile(char pattern, int options, char *errorptr, int
				2140	erroroffset, PyObject dictionary)
				2141	{
				2142	real_pcre *re;
				2143	int spaces = 0;
				2144	int length = 3; /* For initial BRA plus length */
				2145	int runlength;
				2146	int c, size;
				2147	int brackets[2];
				2148	int brastack[200];
				2149	int brastackptr = 0;
				2150	BOOL extended = (options & PCRE_EXTENDED) != 0;
				2151	uschar code, ptr;
				2152
				2153	#ifdef DEBUG
				2154	uschar code_base, code_end;
				2155	#endif
				2156
				2157	/* Miscellaneous initialization; the copy the error pointers into static
				2158	variables so all functions can access them. */
				2159
				2160	brackets[0] = 1; /* Next bracket number */
				2161	brackets[1] = 0; /* Highest used bracket number */
				2162
				2163	*errorptr = NULL;
				2164	*erroroffset = 0;
				2165
				2166	if ((options & ~PUBLIC_OPTIONS) != 0)
				2167	{
				2168	*errorptr = "unknown option bit(s) set";
				2169	return NULL;
				2170	}
				2171
				2172	#ifdef DEBUG
				2173	printf("------------------------------------------------------------------\n");
				2174	printf("%s\n", pattern);
				2175	#endif
				2176
				2177	/* The first thing to do is to make a pass over the pattern to compute the
				2178	amount of store required to hold the compiled code. This does not have to be
				2179	perfect as long as errors are overestimates. At the same time we can detect any
				2180	internal flag settings. Make an attempt to correct for any counted white space
				2181	if an "extended" flag setting appears late in the pattern. We can't be so
				2182	clever for #-comments. */
				2183
				2184	ptr = (uschar *)(pattern - 1);
				2185	while ((c = *(++ptr)) != 0)
				2186	{
				2187	int min, max;
				2188
				2189	if ((pcre_ctypes[c] & ctype_space) != 0)
				2190	{
				2191	if (extended) continue;
				2192	spaces++;
				2193	}
				2194
				2195	if (extended && c == '#')
				2196	{
				2197	while ((c = *(++ptr)) != 0 && c != '\n');
				2198	continue;
				2199	}
				2200
				2201	switch(c)
				2202	{
				2203	/* A backslashed item may be an escaped "normal" character or a
				2204	character type. For a "normal" character, put the pointers and
				2205	character back so that tests for whitespace etc. in the input
				2206	are done correctly. */
				2207
				2208	case '\\':
				2209	{
				2210	uschar *save_ptr = ptr;
				2211	c = check_escape(&ptr, errorptr);
				2212	if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
				2213	if (c >= 0)
				2214	{
				2215	ptr = save_ptr;
				2216	c = '\\';
				2217	goto NORMAL_CHAR;
				2218	}
				2219	}
				2220	length++;
				2221
				2222	/* A back reference needs an additional char, plus either one or 5
				2223	bytes for a repeat. */
				2224
				2225	if (c <= -ESC_REF)
				2226	{
				2227	length++; /* For single back reference */
				2228	if (ptr[1] == '{')
				2229	{
				2230	ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
				2231	if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
				2232	if ((min == 0 && (max == 1 \|\| max == -1)) \|\|
				2233	(min == 1 && max == -1))
				2234	length++;
				2235	else length += 5;
				2236	if (ptr[1] == '?') ptr++;
				2237	}
				2238	}
				2239	continue;
				2240
				2241	case '^':
				2242	case '.':
				2243	case '$':
				2244	case '': / These repeats won't be after brackets; */
				2245	case '+': /* those are handled separately */
				2246	case '?':
				2247	length++;
				2248	continue;
				2249
				2250	/* This covers the cases of repeats after a single char, metachar, class,
				2251	or back reference. */
				2252
				2253	case '{':
				2254	ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);
				2255	if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
				2256	if ((min == 0 && (max == 1 \|\| max == -1)) \|\|
				2257	(min == 1 && max == -1))
				2258	length++;
				2259	else
				2260	{
				2261	length--; /* Uncount the original char or metachar */
				2262	if (min == 1) length++; else if (min > 0) length += 4;
				2263	if (max > 0) length += 4; else length += 2;
				2264	}
				2265	if (ptr[1] == '?') ptr++;
				2266	continue;
				2267
				2268	/* An alternation contains an offset to the next branch or ket. */
				2269	case '\|':
				2270	length += 3;
				2271	continue;
				2272
				2273	/* A character class uses 4 characters plus the characters in it. Don't
				2274	worry about character types that aren't allowed in classes - they'll get
				2275	picked up during the compile. */
				2276
				2277	case '[':
				2278	length += 4;
				2279	if (ptr[1] == '^') ptr++;
				2280	do
				2281	{
				2282	if (*(++ptr) == '\\')
				2283	{
				2284	(void)check_escape(&ptr, errorptr);
				2285	if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
				2286	}
				2287	length++;
				2288	}
				2289	while (ptr != 0 && ptr != ']');
				2290
				2291	/* A repeat needs either 1 or 5 bytes. */
				2292
				2293	if (ptr[1] == '{')
				2294	{
				2295	ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
				2296	if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
				2297	if ((min == 0 && (max == 1 \|\| max == -1)) \|\|
				2298	(min == 1 && max == -1))
				2299	length++;
				2300	else length += 5;
				2301	if (ptr[1] == '?') ptr++;
				2302	}
				2303	continue;
				2304
				2305	/* Brackets may be genuine groups or special things */
				2306
				2307	case '(':
				2308
				2309	/* Handle special forms of bracket, which all start (? */
				2310
				2311	if (ptr[1] == '?') switch (c = ptr[2])
				2312	{
				2313	/* Skip over comments entirely */
				2314	case '#':
				2315	ptr += 3;
				2316	while (ptr != 0 && ptr != ')') ptr++;
				2317	if (*ptr == 0)
				2318	{
				2319	*errorptr = "missing ) after comment";
				2320	goto PCRE_ERROR_RETURN;
				2321	}
				2322	continue;
				2323
				2324	/* Non-referencing groups and lookaheads just move the pointer on, and
				2325	then behave like a non-special bracket. */
				2326
				2327	case ':':
				2328	case '=':
				2329	case '!':
				2330	ptr += 2;
				2331	break;
				2332
				2333	/* Else loop setting valid options until ) is met. Anything else is an
				2334	error. */
				2335
				2336	case ('P'):
				2337	{
				2338	int idlen;
				2339	switch (*ptr++) {
				2340	case ('<'):
				2341	idlen = get_group_id(ptr++, '>', errorptr);
				2342	if (*errorptr) goto PCRE_ERROR_RETURN;
				2343	ptr += idlen+1;
				2344	break;
				2345	case ('='):
				2346	idlen = get_group_id(ptr++, ')', errorptr);
				2347	if (*errorptr) goto PCRE_ERROR_RETURN;
				2348	ptr += idlen+1;
				2349	length++;
				2350	break;
				2351	}
				2352	}
				2353	break;
				2354
				2355	default:
				2356	ptr += 2;
				2357	for (;; ptr++)
				2358	{
				2359	if ((c = *ptr) == 'i')
				2360	{
				2361	options \|= PCRE_CASELESS;
				2362	continue;
				2363	}
				2364	else if ((c = *ptr) == 'm')
				2365	{
				2366	options \|= PCRE_MULTILINE;
				2367	continue;
				2368	}
				2369	else if ((c = *ptr) == 's')
				2370	{
				2371	options \|= PCRE_DOTALL;
				2372	continue;
				2373	}
				2374	else if (c == 'x')
				2375	{
				2376	options \|= PCRE_EXTENDED;
				2377	extended = TRUE;
				2378	length -= spaces; /* Already counted spaces */
				2379	continue;
				2380	}
				2381	else if (c == ')') break;
				2382
				2383	*errorptr = "undefined after (?";
				2384	goto PCRE_ERROR_RETURN;
				2385	}
				2386	continue; /* End of this bracket handling */
				2387	}
				2388
				2389	/* Non-special forms of bracket. Save length for computing whole length
				2390	at end if there's a repeat that requires duplication of the group. */
				2391
				2392	if (brastackptr >= sizeof(brastack)/sizeof(int))
				2393	{
				2394	*errorptr = "too many brackets";
				2395	goto PCRE_ERROR_RETURN;
				2396	}
				2397
				2398	brastack[brastackptr++] = length;
				2399	length += 3;
				2400	continue;
				2401
				2402	/* Handle ket. Look for subsequent max/min; for certain sets of values we
				2403	have to replicate this bracket up to that many times. */
				2404
				2405	case ')':
				2406	length += 3;
				2407	{
				2408	int min = 1;
				2409	int max = 1;
				2410	int duplength = length - brastack[--brastackptr];
				2411
				2412	/* Leave ptr at the final char; for read_repeat_counts this happens
				2413	automatically; for the others we need an increment. */
				2414
				2415	if ((c = ptr[1]) == '{')
				2416	{
				2417	ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
				2418	if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
				2419	}
				2420	else if (c == '*') { min = 0; max = -1; ptr++; }
				2421	else if (c == '+') { max = -1; ptr++; }
				2422	else if (c == '?') { min = 0; ptr++; }
				2423
				2424	/* If there is a minimum > 1 we have to replicate up to min-1 times; if
				2425	there is a limited maximum we have to replicate up to max-1 times and
				2426	allow for a BRAZERO item before each optional copy, as we also have to
				2427	do before the first copy if the minimum is zero. */
				2428
				2429	if (min == 0) length++;
				2430	else if (min > 1) length += (min - 1) * duplength;
				2431	if (max > min) length += (max - min) * (duplength + 1);
				2432	}
				2433
				2434	continue;
				2435
				2436	/* Non-special character. For a run of such characters the length required
				2437	is the number of characters + 2, except that the maximum run length is 255.
				2438	We won't get a skipped space or a non-data escape or the start of a #
				2439	comment as the first character, so the length can't be zero. */
				2440
				2441	NORMAL_CHAR:
				2442	default:
				2443	length += 2;
				2444	runlength = 0;
				2445	do
				2446	{
				2447	if ((pcre_ctypes[c] & ctype_space) != 0)
				2448	{
				2449	if (extended) continue;
				2450	spaces++;
				2451	}
				2452
				2453	if (extended && c == '#')
				2454	{
				2455	while ((c = *(++ptr)) != 0 && c != '\n');
				2456	continue;
				2457	}
				2458
				2459	/* Backslash may introduce a data char or a metacharacter; stop the
				2460	string before the latter. */
				2461
				2462	if (c == '\\')
				2463	{
				2464	uschar *saveptr = ptr;
				2465	c = check_escape(&ptr, errorptr);
				2466	if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
				2467	if (c < 0) { ptr = saveptr; break; }
				2468	}
				2469
				2470	/* Ordinary character or single-char escape */
				2471
				2472	runlength++;
				2473	}
				2474
				2475	/* This "while" is the end of the "do" above. */
				2476
				2477	while (runlength < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
				2478
				2479	ptr--;
				2480	length += runlength;
				2481	continue;
				2482	}
				2483	}
				2484
				2485	length += 4; /* For final KET and END */
				2486
				2487	if (length > 65539)
				2488	{
				2489	*errorptr = "regular expression too large";
				2490	return NULL;
				2491	}
				2492
				2493	/* Compute the size of data block needed and get it, either from malloc or
				2494	externally provided function. Put in the magic number and the options. */
				2495
				2496	size = length + sizeof(real_pcre) - sizeof(re->code);
				2497	re = (real_pcre *)(pcre_malloc)(size);
				2498
				2499	if (re == NULL)
				2500	{
				2501	*errorptr = "failed to get memory";
				2502	return NULL;
				2503	}
				2504
				2505	re->magic_number = MAGIC_NUMBER;
				2506	re->options = options;
				2507
				2508	/* Set up a starting, non-extracting bracket, then compile the expression. On
				2509	error, *errorptr will be set non-NULL, so we don't need to look at the result
				2510	of the function here. */
				2511
				2512	ptr = (uschar *)pattern;
				2513	code = re->code;
				2514	*code = OP_BRA;
				2515	(void)compile_regexp(extended, brackets, &code, &ptr, errorptr, dictionary);
				2516	re->top_bracket = brackets[1];
				2517
				2518	/* If not reached end of pattern on success, there's an excess bracket. */
				2519
				2520	if (errorptr == NULL && ptr != 0) *errorptr = "unmatched brackets";
				2521	/* Fill in the terminating state and check for disastrous overflow, but
				2522	if debugging, leave the test till after things are printed out. */
				2523
				2524	*code++ = OP_END;
				2525
				2526	#ifndef DEBUG
				2527	if (code - re->code > length) *errorptr = "internal error: code overflow";
				2528	#endif
				2529
				2530	/* Failed to compile */
				2531
				2532	if (*errorptr != NULL)
				2533	{
				2534	(pcre_free)(re);
				2535	PCRE_ERROR_RETURN:
				2536	erroroffset = ptr - (uschar )pattern;
				2537	return NULL;
				2538	}
				2539
				2540	/* If the anchored option was not passed, set flag if we can determine that it
				2541	is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if
				2542	we can determine what the first character has to be, because that speeds up
				2543	unanchored matches no end. In the case of multiline matches, an alternative is
				2544	to set the PCRE_STARTLINE flag if all branches start with ^. */
				2545
				2546	if ((options & PCRE_ANCHORED) == 0)
				2547	{
				2548	if (is_anchored(re->code, (options & PCRE_MULTILINE) != 0))
				2549	re->options \|= PCRE_ANCHORED;
				2550	else
				2551	{
				2552	int c = find_firstchar(re->code);
				2553	if (c >= 0)
				2554	{
				2555	re->first_char = c;
				2556	re->options \|= PCRE_FIRSTSET;
				2557	}
				2558	else if (is_startline(re->code))
				2559	re->options \|= PCRE_STARTLINE;
				2560	}
				2561	}
				2562
				2563	/* Print out the compiled data for debugging */
				2564
				2565	#ifdef DEBUG
				2566
				2567	printf("Length = %d top_bracket = %d%s%s%s%s\n",
				2568	length, re->top_bracket,
				2569	((re->options & PCRE_ANCHORED) != 0)? " anchored" : "",
				2570	((re->options & PCRE_CASELESS) != 0)? " caseless" : "",
				2571	extended? " extended" : "",
				2572	((re->options & PCRE_MULTILINE) != 0)? " multiline" : "");
				2573
				2574	if ((re->options & PCRE_FIRSTSET) != 0)
				2575	{
				2576	if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
				2577	else printf("First char = \\x%02x\n", re->first_char);
				2578	}
				2579
				2580	code_end = code;
				2581	code_base = code = re->code;
				2582
				2583	while (code < code_end)
				2584	{
				2585	int charlength;
				2586
				2587	printf("%3d ", code - code_base);
				2588
				2589	if (*code >= OP_BRA)
				2590	{
				2591	printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
				2592	code += 2;
				2593	}
				2594
				2595	else switch(*code)
				2596	{
				2597	case OP_CHARS:
				2598	charlength = *(++code);
				2599	printf("%3d ", charlength);
				2600	while (charlength-- > 0)
				2601	if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
				2602	break;
				2603
				2604	case OP_KETRMAX:
				2605	case OP_KETRMIN:
				2606	case OP_ALT:
				2607	case OP_KET:
				2608	case OP_ASSERT:
				2609	case OP_ASSERT_NOT:
				2610	printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
				2611	code += 2;
				2612	break;
				2613
				2614	case OP_STAR:
				2615	case OP_MINSTAR:
				2616	case OP_PLUS:
				2617	case OP_MINPLUS:
				2618	case OP_QUERY:
				2619	case OP_MINQUERY:
				2620	case OP_TYPESTAR:
				2621	case OP_TYPEMINSTAR:
				2622	case OP_TYPEPLUS:
				2623	case OP_TYPEMINPLUS:
				2624	case OP_TYPEQUERY:
				2625	case OP_TYPEMINQUERY:
				2626	if (*code >= OP_TYPESTAR)
				2627	printf(" %s", OP_names[code[1]]);
				2628	else if (isprint(c = code[1])) printf(" %c", c);
				2629	else printf(" \\x%02x", c);
				2630	printf("%s", OP_names[*code++]);
				2631	break;
				2632
				2633	case OP_EXACT:
				2634	case OP_UPTO:
				2635	case OP_MINUPTO:
				2636	if (isprint(c = code[3])) printf(" %c{", c);
				2637	else printf(" \\x%02x{", c);
				2638	if (*code != OP_EXACT) printf(",");
				2639	printf("%d}", (code[1] << 8) + code[2]);
				2640	if (*code == OP_MINUPTO) printf("?");
				2641	code += 3;
				2642	break;
				2643
				2644	case OP_TYPEEXACT:
				2645	case OP_TYPEUPTO:
				2646	case OP_TYPEMINUPTO:
				2647	printf(" %s{", OP_names[code[3]]);
				2648	if (*code != OP_TYPEEXACT) printf(",");
				2649	printf("%d}", (code[1] << 8) + code[2]);
				2650	if (*code == OP_TYPEMINUPTO) printf("?");
				2651	code += 3;
				2652	break;
				2653
				2654	case OP_REF:
				2655	printf(" \\%d", *(++code));
				2656	break;
				2657
				2658	case OP_CLASS:
				2659	case OP_NEGCLASS:
				2660	{
				2661	int i, min, max;
				2662	int flags = code[1];
				2663	int rangecount = code[2];
				2664	int charcount = code[3];
				2665
				2666	printf(" [%s", (*code == OP_CLASS)? "" : "^");
				2667	code += 3;
				2668
				2669	for (i = 0; i < 8; i++)
				2670	if ((flags & (1 << i)) != 0) printf("%s", class_names[i]);
				2671
				2672	for (i = 0; i < rangecount; i++)
				2673	{
				2674	if (isprint((++code))) printf("%c-", code); else printf("\\x%02x-", *code);
				2675	if (isprint((++code))) printf("%c", code); else printf("\\x%02x", *code);
				2676	}
				2677
				2678	for (i = 0; i < charcount; i++)
				2679	{
				2680	if (!isprint((++code))) printf("\\x%02x", code);
				2681	else if (strchr("-\\]", code) != NULL) printf("\\%c", code);
				2682	else printf("%c", *code);
				2683	}
				2684	printf("]");
				2685
				2686	switch(*(++code))
				2687	{
				2688	case OP_CRSTAR:
				2689	case OP_CRMINSTAR:
				2690	case OP_CRPLUS:
				2691	case OP_CRMINPLUS:
				2692	case OP_CRQUERY:
				2693	case OP_CRMINQUERY:
				2694	printf("%s", OP_names[*code]);
				2695	break;
				2696
				2697	case OP_CRRANGE:
				2698	case OP_CRMINRANGE:
				2699	min = (code[1] << 8) + code[2];
				2700	max = (code[3] << 8) + code[4];
				2701	if (max == 0) printf("{%d,}", min);
				2702	else printf("{%d,%d}", min, max);
				2703	if (*code == OP_CRMINRANGE) printf("?");
				2704	code += 4;
				2705	break;
				2706
				2707	default:
				2708	code--;
				2709	}
				2710	}
				2711	break;
				2712
				2713	/* Anything else is just a one-node item */
				2714
				2715	default:
				2716	printf(" %s", OP_names[*code]);
				2717	break;
				2718	}
				2719
				2720	code++;
				2721	printf("\n");
				2722	}
				2723	printf("------------------------------------------------------------------\n");
				2724
				2725	/* This check is done here in the debugging case so that the code that
				2726	was compiled can be seen. */
				2727
				2728	if (code - re->code > length)
				2729	{
				2730	*errorptr = "internal error: code overflow";
				2731	(pcre_free)(re);
				2732	erroroffset = ptr - (uschar )pattern;
				2733	return NULL;
				2734	}
				2735	#endif
				2736
				2737	return (pcre *)re;
				2738	}
				2739
				2740
				2741
				2742	/*************************************************
				2743	* Match a character type *
				2744	*************************************************/
				2745
				2746	/* Not used in all the places it might be as it's sometimes faster
				2747	to put the code inline.
				2748
				2749	Arguments:
				2750	type the character type
				2751	c the character
				2752	multiline the multiline flag
				2753
				2754	Returns: TRUE if character is of the type
				2755	*/
				2756
				2757	static BOOL
				2758	match_type(int type, int c, BOOL dotall)
				2759	{
				2760
				2761	#ifdef DEBUG
				2762	if (isprint(c)) printf("matching subject %c against ", c);
				2763	else printf("matching subject \\x%02x against ", c);
				2764	printf("%s\n", OP_names[type]);
				2765	#endif
				2766
				2767	switch(type)
				2768	{
				2769	case OP_ANY: return dotall \|\| c != '\n';
				2770	case OP_NOT_DIGIT: return (pcre_ctypes[c] & ctype_digit) == 0;
				2771	case OP_DIGIT: return (pcre_ctypes[c] & ctype_digit) != 0;
				2772	case OP_NOT_WHITESPACE: return (pcre_ctypes[c] & ctype_space) == 0;
				2773	case OP_WHITESPACE: return (pcre_ctypes[c] & ctype_space) != 0;
				2774	case OP_NOT_WORDCHAR: return (pcre_ctypes[c] & ctype_word) == 0;
				2775	case OP_WORDCHAR: return (pcre_ctypes[c] & ctype_word) != 0;
				2776	}
				2777	return FALSE;
				2778	}
				2779
				2780
				2781	/*************************************************
				2782	* Match a character class *
				2783	*************************************************/
				2784
				2785	/* Return "result" if char is in the class and "!result" otherwise.
				2786
				2787	Arguments:
				2788	data points to the class item
				2789	c the subject character
				2790	result value to return if in class
				2791	md matching "static" data
				2792
				2793	Returns: result or !result
				2794	*/
				2795
				2796	static BOOL
				2797	match_class(register uschar data, register int c, BOOL result, match_data md)
				2798	{
				2799	int flags = data[1];
				2800	int i;
				2801	uschar *base = data;
				2802	uschar *end;
				2803
				2804	#ifdef DEBUG
				2805	{
				2806	uschar *d = base + 3;
				2807
				2808	if (isprint(c))
				2809	printf("match %c against [%s", c, result? "" : "^");
				2810	else
				2811	printf("match \\x%02x against [%s", c, result? "" : "^");
				2812
				2813	for (i = 0; i < 8; i++)
				2814	if ((flags & (1 << i)) != 0) printf("%s", class_names[i]);
				2815
				2816	for (i = 0; i < data[2]; i++)
				2817	{
				2818	if (isprint((++d))) printf("%c-", d); else printf("\\x%02x-", *d);
				2819	if (isprint((++d))) printf("%c", d); else printf("\\x%02x", *d);
				2820	}
				2821
				2822	for (i = 0; i < data[3]; i++)
				2823	{
				2824	if (!isprint((++d))) printf("\\x%02x", d);
				2825	else if (strchr("-\\]", d) != NULL) printf("\\%c", d);
				2826	else printf("%c", *d);
				2827	}
				2828	printf("]\n");
				2829	}
				2830	#endif
				2831
				2832	/* Test for any character types */
				2833
				2834	for (i = 0; flags != 0; i++)
				2835	{
				2836	if ((flags & 1) != 0 && match_type(class_ops[i], c, md->dotall))
				2837	return result;
				2838	flags >>= 1;
				2839	}
				2840
				2841	/* Advance pointer to the specific chars and do the caseless or caseful testing
				2842	of the ranges and individual characters as necessary. */
				2843
				2844	data += 4;
				2845	end = data + base[2] * 2;
				2846
				2847	/* Caseless character ranges are slightly tricky, because of cases like [W-c].
				2848	What we do is to uppercase the subject char if it is beyond the end of the
				2849	range, or lowercase it if it is before the start of the range and try again if
				2850	a caseful comparison has failed. This works because upper case letters come
				2851	before lower case in ASCII code. It would not work in EBCDIC, for example,
				2852	where they are the other way round, but then ranges like [W-c] would be illegal
				2853	in EBCDIC. */
				2854
				2855	if (md->caseless)
				2856	{
				2857	while (data < end)
				2858	{
				2859	register int d;
				2860	if (c >= (int)*data && c <= (int)data[1]) return result;
				2861	d = (c < (int)*data)? pcre_lcc[c] : pcre_ucc[c];
				2862	if (d >= (int)*data && d <= (int)data[1]) return result;
				2863	data += 2;
				2864	}
				2865	end += base[3];
				2866	c = pcre_lcc[c];
				2867	while (data < end) if (c == pcre_lcc[*data++]) return result;
				2868	}
				2869
				2870	/* Caseful is easy */
				2871
				2872	else
				2873	{
				2874	while (data < end)
				2875	{
				2876	if (c >= (int)*data && c <= (int)data[1]) return result;
				2877	data += 2;
				2878	}
				2879	end += base[3];
				2880	while (data < end) if (c == *data++) return result;
				2881	}
				2882
				2883	/* Character is not in the class */
				2884
				2885	return !result;
				2886	}
				2887
				2888
				2889
				2890	/*************************************************
				2891	* Match a back-reference *
				2892	*************************************************/
				2893
				2894	/* If a back reference hasn't been set, the match fails.
				2895
				2896	Arguments:
				2897	number reference number
				2898	eptr points into the subject
				2899	length length to be matched
				2900	md points to match data block
				2901
				2902	Returns: TRUE if matched
				2903	*/
				2904
				2905	static BOOL
				2906	match_ref(int number, register uschar eptr, int length, match_data md)
				2907	{
				2908	uschar *p = md->start_subject + md->offset_vector[number];
				2909
				2910	#ifdef DEBUG
				2911	if (eptr >= md->end_subject)
				2912	printf("matching subject <null>");
				2913	else
				2914	{
				2915	printf("matching subject ");
				2916	pchars(eptr, length, TRUE, md);
				2917	}
				2918	printf(" against backref ");
				2919	pchars(p, length, FALSE, md);
				2920	printf("\n");
				2921	#endif
				2922
				2923	/* Always fail if not enough characters left */
				2924
				2925	if (length > md->end_subject - p) return FALSE;
				2926
				2927	/* Separate the caselesss case for speed */
				2928
				2929	if (md->caseless)
				2930	{ while (length-- > 0) if (pcre_lcc[p++] != pcre_lcc[eptr++]) return FALSE; }
				2931	else
				2932	{ while (length-- > 0) if (p++ != eptr++) return FALSE; }
				2933
				2934	return TRUE;
				2935	}
				2936
				2937	static int free_stack(match_data *md)
				2938	{
				2939	/* Free any stack space that was allocated by the call to match(). */
				2940	if (md->off_num) free(md->off_num);
				2941	if (md->offset_top) free(md->offset_top);
				2942	if (md->r1) free(md->r1);
				2943	if (md->r2) free(md->r2);
				2944	if (md->eptr) free(md->eptr);
				2945	if (md->ecode) free(md->ecode);
				2946	}
				2947
				2948	static int grow_stack(match_data *md)
				2949	{
				2950	md->length = md->length ? md->length+md->length/2 : 200;
				2951	md->offset_top=realloc(md->offset_top, md->length*sizeof(int));
				2952	md->eptr=realloc(md->eptr, md->lengthsizeof(void ));
				2953	md->ecode=realloc(md->ecode, md->lengthsizeof(void ));
				2954	md->off_num=realloc(md->off_num, md->length*sizeof(int));
				2955	md->r1=realloc(md->r1, md->length*sizeof(int));
				2956	md->r2=realloc(md->r2, md->length*sizeof(int));
				2957	return 0;
				2958	}
				2959
				2960	/*************************************************
				2961	* Match from current position *
				2962	*************************************************/
				2963
				2964	/* On entry ecode points to the first opcode, and eptr to the first character.
				2965
				2966	Arguments:
				2967	eptr pointer in subject
				2968	ecode position in code
				2969	offset_top current top pointer
				2970	md pointer to "static" info for the match
				2971
				2972	Returns: TRUE if matched
				2973	*/
				2974
				2975	static BOOL
				2976	match(register uschar eptr, register uschar ecode, int offset_top,
				2977	match_data *md)
				2978	{
				2979	int save_stack_position = md->point;
				2980	match_loop:
				2981
				2982	#define SUCCEED goto succeed
				2983	#define FAIL goto fail
				2984
				2985	for (;;)
				2986	{
				2987	int min, max, ctype;
				2988	register int i;
				2989	register int c;
				2990	BOOL minimize;
				2991
				2992	/* Opening bracket. Check the alternative branches in turn, failing if none
				2993	match. We have to set the start offset if required and there is space
				2994	in the offset vector so that it is available for subsequent back references
				2995	if the bracket matches. However, if the bracket fails, we must put back the
				2996	previous value of both offsets in case they were set by a previous copy of
				2997	the same bracket. Don't worry about setting the flag for the error case here;
				2998	that is handled in the code for KET. */
				2999
				3000	if ((int)*ecode >= OP_BRA)
				3001	{
				3002	int number = (*ecode - OP_BRA) << 1;
				3003	int save_offset1, save_offset2;
				3004
				3005	#ifdef DEBUG
				3006	printf("start bracket %d\n", number/2);
				3007	#endif
				3008
				3009	if (number > 0 && number < md->offset_end)
				3010	{
				3011	save_offset1 = md->offset_vector[number];
				3012	save_offset2 = md->offset_vector[number+1];
				3013	md->offset_vector[number] = eptr - md->start_subject;
				3014
				3015	#ifdef DEBUG
				3016	printf("saving %d %d\n", save_offset1, save_offset2);
				3017	#endif
				3018	}
				3019
				3020	/* Recurse for all the alternatives. */
				3021
				3022	do
				3023	{
				3024	if (match(eptr, ecode+3, offset_top, md)) SUCCEED;
				3025	ecode += (ecode[1] << 8) + ecode[2];
				3026	}
				3027	while (*ecode == OP_ALT);
				3028
				3029	#ifdef DEBUG
				3030	printf("bracket %d failed\n", number/2);
				3031	#endif
				3032
				3033	if (number > 0 && number < md->offset_end)
				3034	{
				3035	md->offset_vector[number] = save_offset1;
				3036	md->offset_vector[number+1] = save_offset2;
				3037	}
				3038
				3039	FAIL;
				3040	}
				3041
				3042	/* Other types of node can be handled by a switch */
				3043
				3044	switch(*ecode)
				3045	{
				3046	case OP_END:
				3047	md->end_match_ptr = eptr; /* Record where we ended */
				3048	md->end_offset_top = offset_top; /* and how many extracts were taken */
				3049	SUCCEED;
				3050
				3051	/* Assertion brackets. Check the alternative branches in turn - the
				3052	matching won't pass the KET for an assertion. If any one branch matches,
				3053	the assertion is true. */
				3054
				3055	case OP_ASSERT:
				3056	do
				3057	{
				3058	if (match(eptr, ecode+3, offset_top, md)) break;
				3059	ecode += (ecode[1] << 8) + ecode[2];
				3060	}
				3061	while (*ecode == OP_ALT);
				3062	if (*ecode == OP_KET) FAIL;
				3063
				3064	/* Continue from after the assertion, updating the offsets high water
				3065	mark, since extracts may have been taken during the assertion. */
				3066
				3067	do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
				3068	ecode += 3;
				3069	offset_top = md->end_offset_top;
				3070	continue;
				3071
				3072	/* Negative assertion: all branches must fail to match */
				3073
				3074	case OP_ASSERT_NOT:
				3075	do
				3076	{
				3077	if (match(eptr, ecode+3, offset_top, md)) FAIL;
				3078	ecode += (ecode[1] << 8) + ecode[2];
				3079	}
				3080	while (*ecode == OP_ALT);
				3081	ecode += 3;
				3082	continue;
				3083
				3084	/* An alternation is the end of a branch; scan along to find the end of the
				3085	bracketed group and go to there. */
				3086
				3087	case OP_ALT:
				3088	do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
				3089	break;
				3090
				3091	/* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
				3092	that it may occur zero times. It may repeat infinitely, or not at all -
				3093	i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
				3094	repeat limits are compiled as a number of copies, with the optional ones
				3095	preceded by BRAZERO or BRAMINZERO. */
				3096
				3097	case OP_BRAZERO:
				3098	{
				3099	uschar *next = ecode+1;
				3100	if (match(eptr, next, offset_top, md)) SUCCEED;
				3101	do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
				3102	ecode = next + 3;
				3103	}
				3104	break;
				3105
				3106	case OP_BRAMINZERO:
				3107	{
				3108	uschar *next = ecode+1;
				3109	do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
				3110	if (match(eptr, next+3, offset_top, md)) SUCCEED;
				3111	ecode++;
				3112	}
				3113	break;;
				3114
				3115	/* End of a group, repeated or non-repeating. If we are at the end of
				3116	an assertion "group", stop matching and SUCCEED, but record the
				3117	current high water mark for use by positive assertions. */
				3118
				3119	case OP_KET:
				3120	case OP_KETRMIN:
				3121	case OP_KETRMAX:
				3122	{
				3123	int number, start, end;
				3124	uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
				3125
				3126	if (prev == OP_ASSERT \|\| prev == OP_ASSERT_NOT)
				3127	{
				3128	md->end_offset_top = offset_top;
				3129	SUCCEED;
				3130	}
				3131
				3132	/* In all other cases we have to check the group number back at the
				3133	start and if necessary complete handling an extraction by setting the
				3134	final offset and bumping the high water mark. */
				3135
				3136	number = (*prev - OP_BRA) << 1;
				3137
				3138	#ifdef DEBUG
				3139	printf("end bracket %d\n", number/2);
				3140	#endif
				3141
				3142	if (number > 0)
				3143	{
				3144	if (number >= md->offset_end) md->offset_overflow = TRUE; else
				3145	{
				3146	start=md->offset_vector[number] ; end =md->offset_vector[number+1];
				3147	md->offset_vector[number+1] = eptr - md->start_subject;
				3148	if (offset_top <= number) offset_top = number + 2;
				3149	}
				3150	}
				3151
				3152	/* For a non-repeating ket, just advance to the next node and continue at
				3153	this level. */
				3154
				3155	if (*ecode == OP_KET)
				3156	{
				3157	ecode += 3;
				3158	break;
				3159	}
				3160
				3161	/* The repeating kets try the rest of the pattern or restart from the
				3162	preceding bracket, in the appropriate order. */
				3163
				3164	if (*ecode == OP_KETRMIN)
				3165	{
				3166	uschar *ptr;
				3167	if (match(eptr, ecode+3, offset_top, md)) goto succeed;
				3168	/* Handle alternation inside the BRA...KET; push the additional
				3169	alternatives onto the stack
				3170	XXX this tries the alternatives backwards! */
				3171	ptr=prev;
				3172	do {
				3173	ptr += (ptr[1]<<8)+ ptr[2];
				3174	if (*ptr==OP_ALT)
				3175	{
				3176	if (md->length == md->point) grow_stack(md);
				3177	md->offset_top[md->point] = offset_top;
				3178	md->eptr[md->point] = eptr;
				3179	md->ecode[md->point] = ptr+3;
				3180	md->r1[md->point] = 0;
				3181	md->r2[md->point] = 0;
				3182	md->off_num[md->point] = 0;
				3183	md->point++;
				3184	}
				3185	} while (*ptr==OP_ALT);
				3186	ecode=prev+3; goto match_loop;
				3187	}
				3188	else /* OP_KETRMAX */
				3189	{
				3190	uschar *ptr;
				3191	int points_pushed=0;
				3192
				3193	/* Push one failure point, that will resume matching at the code after
				3194	the KETRMAX opcode. */
				3195	if (md->length == md->point) grow_stack(md);
				3196	md->offset_top[md->point] = offset_top;
				3197	md->eptr[md->point] = eptr;
				3198	md->ecode[md->point] = ecode+3;
				3199	md->r1[md->point] = md->offset_vector[number];
				3200	md->r2[md->point] = md->offset_vector[number+1];
				3201	md->off_num[md->point] = number;
				3202	md->point++;
				3203
				3204	md->offset_vector[number] = eptr - md->start_subject;
				3205	/* Handle alternation inside the BRA...KET; push each of the
				3206	additional alternatives onto the stack
				3207	XXX this tries the alternatives backwards! */
				3208	ptr=prev;
				3209	do {
				3210	ptr += (ptr[1]<<8)+ ptr[2];
				3211	if (*ptr==OP_ALT)
				3212	{
				3213	if (md->length == md->point) grow_stack(md);
				3214	md->offset_top[md->point] = offset_top;
				3215	md->eptr[md->point] = eptr;
				3216	md->ecode[md->point] = ptr+3;
				3217	md->r1[md->point] = 0;
				3218	md->r2[md->point] = 0;
				3219	md->off_num[md->point] = 0;
				3220	md->point++;
				3221	points_pushed++;
				3222	}
				3223	} while (*ptr==OP_ALT);
				3224	/* Jump to the first (or only) alternative and resume trying to match */
				3225	ecode=prev+3; goto match_loop;
				3226	}
				3227	}
				3228	FAIL;
				3229
				3230	/* Start of subject, or after internal newline if multiline */
				3231
				3232	case OP_CIRC:
				3233	if (md->multiline)
				3234	{
				3235	if (eptr != md->start_subject && eptr[-1] != '\n') FAIL;
				3236	ecode++;
				3237	break;
				3238	}
				3239	/* ... else fall through */
				3240
				3241	/* Start of subject assertion */
				3242
				3243	case OP_SOD:
				3244	if (eptr != md->start_subject) FAIL;
				3245	ecode++;
				3246	break;
				3247
				3248	/* End of subject, or before internal newline if multiline */
				3249
				3250	case OP_DOLL:
				3251	if (md->multiline)
				3252	{
				3253	if (eptr < md->end_subject && *eptr != '\n') FAIL;
				3254	ecode++;
				3255	break;
				3256	}
				3257	/* ... else fall through */
				3258
				3259	/* End of subject assertion */
				3260
				3261	case OP_EOD:
				3262	if (eptr < md->end_subject) FAIL;
				3263	ecode++;
				3264	break;
				3265
				3266	/* Word boundary assertions */
				3267
				3268	case OP_NOT_WORD_BOUNDARY:
				3269	case OP_WORD_BOUNDARY:
				3270	{
				3271	BOOL prev_is_word = (eptr != md->start_subject) &&
				3272	((pcre_ctypes[eptr[-1]] & ctype_word) != 0);
				3273	BOOL cur_is_word = (eptr < md->end_subject) &&
				3274	((pcre_ctypes[*eptr] & ctype_word) != 0);
				3275	if ((*ecode++ == OP_WORD_BOUNDARY)?
				3276	cur_is_word == prev_is_word : cur_is_word != prev_is_word)
				3277	FAIL;
				3278	}
				3279	break;
				3280
				3281	/* Match a single character type; inline for speed */
				3282
				3283	case OP_ANY:
				3284	if (!md->dotall && eptr < md->end_subject && *eptr == '\n') FAIL;
				3285	if (eptr++ >= md->end_subject) FAIL;
				3286	ecode++;
				3287	break;
				3288
				3289	case OP_NOT_DIGIT:
				3290	if (eptr >= md->end_subject \|\| (pcre_ctypes[*eptr++] & ctype_digit) != 0)
				3291	FAIL;
				3292	ecode++;
				3293	break;
				3294
				3295	case OP_DIGIT:
				3296	if (eptr >= md->end_subject \|\| (pcre_ctypes[*eptr++] & ctype_digit) == 0)
				3297	FAIL;
				3298	ecode++;
				3299	break;
				3300
				3301	case OP_NOT_WHITESPACE:
				3302	if (eptr >= md->end_subject \|\| (pcre_ctypes[*eptr++] & ctype_space) != 0)
				3303	FAIL;
				3304	ecode++;
				3305	break;
				3306
				3307	case OP_WHITESPACE:
				3308	if (eptr >= md->end_subject \|\| (pcre_ctypes[*eptr++] & ctype_space) == 0)
				3309	FAIL;
				3310	ecode++;
				3311	break;
				3312
				3313	case OP_NOT_WORDCHAR:
				3314	if (eptr >= md->end_subject \|\| (pcre_ctypes[*eptr++] & ctype_word) != 0)
				3315	FAIL;
				3316	ecode++;
				3317	break;
				3318
				3319	case OP_WORDCHAR:
				3320	if (eptr >= md->end_subject \|\| (pcre_ctypes[*eptr++] & ctype_word) == 0)
				3321	FAIL;
				3322	ecode++;
				3323	break;
				3324
				3325	/* Match a back reference, possibly repeatedly. Look past the end of the
				3326	item to see if there is repeat information following. The code is similar
				3327	to that for character classes, but repeated for efficiency. Then obey
				3328	similar code to character type repeats - written out again for speed.
				3329	However, if the referenced string is the empty string, always treat
				3330	it as matched, any number of times (otherwise there could be infinite
				3331	loops). */
				3332
				3333	case OP_REF:
				3334	{
				3335	int length;
				3336	int number = ecode[1] << 1; /* Doubled reference number */
				3337	ecode += 2; /* Advance past the item */
				3338
				3339	if (number >= offset_top \|\| md->offset_vector[number] < 0)
				3340	{
				3341	md->errorcode = PCRE_ERROR_BADREF;
				3342	FAIL;
				3343	}
				3344
				3345	length = md->offset_vector[number+1] - md->offset_vector[number];
				3346
				3347	switch (*ecode)
				3348	{
				3349	case OP_CRSTAR:
				3350	case OP_CRMINSTAR:
				3351	case OP_CRPLUS:
				3352	case OP_CRMINPLUS:
				3353	case OP_CRQUERY:
				3354	case OP_CRMINQUERY:
				3355	c = *ecode++ - OP_CRSTAR;
				3356	minimize = (c & 1) != 0;
				3357	min = rep_min[c]; /* Pick up values from tables; */
				3358	max = rep_max[c]; /* zero for max => infinity */
				3359	if (max == 0) max = INT_MAX;
				3360	break;
				3361
				3362	case OP_CRRANGE:
				3363	case OP_CRMINRANGE:
				3364	minimize = (*ecode == OP_CRMINRANGE);
				3365	min = (ecode[1] << 8) + ecode[2];
				3366	max = (ecode[3] << 8) + ecode[4];
				3367	if (max == 0) max = INT_MAX;
				3368	ecode += 5;
				3369	break;
				3370
				3371	default: /* No repeat follows */
				3372	if (!match_ref(number, eptr, length, md)) FAIL;
				3373	eptr += length;
				3374	continue; /* With the main loop */
				3375	}
				3376
				3377	/* If the length of the reference is zero, just continue with the
				3378	main loop. */
				3379
				3380	if (length == 0) continue;
				3381
				3382	/* First, ensure the minimum number of matches are present. We get back
				3383	the length of the reference string explicitly rather than passing the
				3384	address of eptr, so that eptr can be a register variable. */
				3385
				3386	for (i = 1; i <= min; i++)
				3387	{
				3388	if (!match_ref(number, eptr, length, md)) FAIL;
				3389	eptr += length;
				3390	}
				3391
				3392	/* If min = max, continue at the same level without recursion.
				3393	They are not both allowed to be zero. */
				3394
				3395	if (min == max) continue;
				3396
				3397	/* If minimizing, keep trying and advancing the pointer */
				3398
				3399	if (minimize)
				3400	{
				3401	for (i = min;; i++)
				3402	{
				3403	if (match(eptr, ecode, offset_top, md)) SUCCEED;
				3404	if (i >= max \|\| !match_ref(number, eptr, length, md))
				3405	FAIL;
				3406	eptr += length;
				3407	}
				3408	/* Control never gets here */
				3409	}
				3410
				3411	/* If maximizing, find the longest string and work backwards */
				3412
				3413	else
				3414	{
				3415	uschar *pp = eptr;
				3416	for (i = min; i < max; i++)
				3417	{
				3418	if (!match_ref(number, eptr, length, md)) break;
				3419	eptr += length;
				3420	}
				3421	while (eptr >= pp)
				3422	{
				3423	if (match(eptr, ecode, offset_top, md)) SUCCEED;
				3424	eptr -= length;
				3425	}
				3426	FAIL;
				3427	}
				3428	}
				3429	/* Control never gets here */
				3430
				3431	/* Match a character class, possibly repeatedly. Look past the end of the
				3432	item to see if there is repeat information following. Then obey similar
				3433	code to character type repeats - written out again for speed. */
				3434
				3435	case OP_CLASS:
				3436	case OP_NEGCLASS:
				3437	{
				3438	BOOL result = *ecode == OP_CLASS;
				3439	uschar data = ecode; / Save for matching */
				3440
				3441	ecode += 4 + 2 * ecode[2] + ecode[3]; /* Advance past the item */
				3442
				3443	switch (*ecode)
				3444	{
				3445	case OP_CRSTAR:
				3446	case OP_CRMINSTAR:
				3447	case OP_CRPLUS:
				3448	case OP_CRMINPLUS:
				3449	case OP_CRQUERY:
				3450	case OP_CRMINQUERY:
				3451	c = *ecode++ - OP_CRSTAR;
				3452	minimize = (c & 1) != 0;
				3453	min = rep_min[c]; /* Pick up values from tables; */
				3454	max = rep_max[c]; /* zero for max => infinity */
				3455	if (max == 0) max = INT_MAX;
				3456	break;
				3457
				3458	case OP_CRRANGE:
				3459	case OP_CRMINRANGE:
				3460	minimize = (*ecode == OP_CRMINRANGE);
				3461	min = (ecode[1] << 8) + ecode[2];
				3462	max = (ecode[3] << 8) + ecode[4];
				3463	if (max == 0) max = INT_MAX;
				3464	ecode += 5;
				3465	break;
				3466
				3467	default: /* No repeat follows */
				3468	if (eptr >= md->end_subject \|\| !match_class(data, *eptr++, result, md))
				3469	FAIL;
				3470	continue; /* With the main loop */
				3471	}
				3472
				3473	/* First, ensure the minimum number of matches are present. */
				3474
				3475	for (i = 1; i <= min; i++)
				3476	if (eptr >= md->end_subject \|\| !match_class(data, *eptr++, result, md))
				3477	FAIL;
				3478
				3479	/* If max == min we can continue with the main loop without the
				3480	need to recurse. */
				3481
				3482	if (min == max) continue;
				3483
				3484	/* If minimizing, keep testing the rest of the expression and advancing
				3485	the pointer while it matches the class. */
				3486
				3487	if (minimize)
				3488	{
				3489	for (i = min;; i++)
				3490	{
				3491	if (match(eptr, ecode, offset_top, md)) SUCCEED;
				3492	if (i >= max \|\| eptr >= md->end_subject \|\|
				3493	!match_class(data, *eptr++, result, md)) FAIL;
				3494	}
				3495	/* Control never gets here */
				3496	}
				3497
				3498	/* If maximizing, find the longest possible run, then work backwards. */
				3499
				3500	else
				3501	{
				3502	uschar *pp = eptr;
				3503	for (i = min; i < max; i++)
				3504	{
				3505	if (eptr >= md->end_subject \|\| !match_class(data, *eptr, result, md))
				3506	break;
				3507	eptr++;
				3508	}
				3509	while (eptr >= pp)
				3510	if (match(eptr--, ecode, offset_top, md)) SUCCEED;
				3511	FAIL;
				3512	}
				3513	}
				3514	/* Control never gets here */
				3515
				3516	/* Match a run of characters */
				3517
				3518	case OP_CHARS:
				3519	{
				3520	register int length = ecode[1];
				3521	ecode += 2;
				3522
				3523	#ifdef DEBUG
				3524	if (eptr >= md->end_subject)
				3525	printf("matching subject <null> against pattern ");
				3526	else
				3527	{
				3528	printf("matching subject ");
				3529	pchars(eptr, length, TRUE, md);
				3530	printf(" against pattern ");
				3531	}
				3532	pchars(ecode, length, FALSE, md);
				3533	printf("\n");
				3534	#endif
				3535
				3536	if (length > md->end_subject - eptr) FAIL;
				3537	if (md->caseless)
				3538	{
				3539	while (length-- > 0) if (pcre_lcc[ecode++] != pcre_lcc[eptr++]) FAIL;
				3540	}
				3541	else
				3542	{
				3543	while (length-- > 0) if (ecode++ != eptr++) FAIL;
				3544	}
				3545	}
				3546	break;
				3547
				3548	/* Match a single character repeatedly; different opcodes share code. */
				3549
				3550	case OP_EXACT:
				3551	min = max = (ecode[1] << 8) + ecode[2];
				3552	ecode += 3;
				3553	goto REPEATCHAR;
				3554
				3555	case OP_UPTO:
				3556	case OP_MINUPTO:
				3557	min = 0;
				3558	max = (ecode[1] << 8) + ecode[2];
				3559	minimize = *ecode == OP_MINUPTO;
				3560	ecode += 3;
				3561	goto REPEATCHAR;
				3562
				3563	case OP_STAR:
				3564	case OP_MINSTAR:
				3565	case OP_PLUS:
				3566	case OP_MINPLUS:
				3567	case OP_QUERY:
				3568	case OP_MINQUERY:
				3569	c = *ecode++ - OP_STAR;
				3570	minimize = (c & 1) != 0;
				3571	min = rep_min[c]; /* Pick up values from tables; */
				3572	max = rep_max[c]; /* zero for max => infinity */
				3573	if (max == 0) max = INT_MAX;
				3574
				3575	/* Common code for all repeated single-character matches. We can give
				3576	up quickly if there are fewer than the minimum number of characters left in
				3577	the subject. */
				3578
				3579	REPEATCHAR:
				3580	if (min > md->end_subject - eptr) FAIL;
				3581	c = *ecode++;
				3582
				3583	/* The code is duplicated for the caseless and caseful cases, for speed,
				3584	since matching characters is likely to be quite common. First, ensure the
				3585	minimum number of matches are present. If min = max, continue at the same
				3586	level without recursing. Otherwise, if minimizing, keep trying the rest of
				3587	the expression and advancing one matching character if failing, up to the
				3588	maximum. Alternatively, if maximizing, find the maximum number of
				3589	characters and work backwards. */
				3590
				3591	#ifdef DEBUG
				3592	printf("matching %c{%d,%d} against subject %.*s\n", c, min, max,
				3593	max, eptr);
				3594	#endif
				3595
				3596	if (md->caseless)
				3597	{
				3598	c = pcre_lcc[c];
				3599	for (i = 1; i <= min; i++) if (c != pcre_lcc[*eptr++]) FAIL;
				3600	if (min == max) continue;
				3601	if (minimize)
				3602	{
				3603	for (i = min;; i++)
				3604	{
				3605	if (match(eptr, ecode, offset_top, md)) SUCCEED;
				3606	if (i >= max \|\| eptr >= md->end_subject \|\| c != pcre_lcc[*eptr++])
				3607	FAIL;
				3608	}
				3609	/* Control never gets here */
				3610	}
				3611	else
				3612	{
				3613	uschar *pp = eptr;
				3614	for (i = min; i < max; i++)
				3615	{
				3616	if (eptr >= md->end_subject \|\| c != pcre_lcc[*eptr]) break;
				3617	eptr++;
				3618	}
				3619	while (eptr >= pp)
				3620	if (match(eptr--, ecode, offset_top, md)) SUCCEED;
				3621	FAIL;
				3622	}
				3623	}
				3624
				3625	/* Caseful comparisons */
				3626
				3627	else
				3628	{
				3629	for (i = 1; i <= min; i++) if (c != *eptr++) FAIL;
				3630	if (min == max) continue;
				3631	if (minimize)
				3632	{
				3633	for (i = min;; i++)
				3634	{
				3635	if (match(eptr, ecode, offset_top, md)) SUCCEED;
				3636	if (i >= max \|\| eptr >= md->end_subject \|\| c != *eptr++) FAIL;
				3637	}
				3638	/* Control never gets here */
				3639	}
				3640	else
				3641	{
				3642	uschar *pp = eptr;
				3643	for (i = min; i < max; i++)
				3644	{
				3645	if (eptr >= md->end_subject \|\| c != *eptr) break;
				3646	eptr++;
				3647	}
				3648	while (eptr >= pp)
				3649	if (match(eptr--, ecode, offset_top, md)) SUCCEED;
				3650	FAIL;
				3651	}
				3652	}
				3653	/* Control never gets here */
				3654
				3655	/* Match a single character type repeatedly; several different opcodes
				3656	share code. This is very similar to the code for single characters, but we
				3657	repeat it in the interests of efficiency. */
				3658
				3659	case OP_TYPEEXACT:
				3660	min = max = (ecode[1] << 8) + ecode[2];
				3661	minimize = TRUE;
				3662	ecode += 3;
				3663	goto REPEATTYPE;
				3664
				3665	case OP_TYPEUPTO:
				3666	case OP_TYPEMINUPTO:
				3667	min = 0;
				3668	max = (ecode[1] << 8) + ecode[2];
				3669	minimize = *ecode == OP_TYPEMINUPTO;
				3670	ecode += 3;
				3671	goto REPEATTYPE;
				3672
				3673	case OP_TYPESTAR:
				3674	case OP_TYPEMINSTAR:
				3675	case OP_TYPEPLUS:
				3676	case OP_TYPEMINPLUS:
				3677	case OP_TYPEQUERY:
				3678	case OP_TYPEMINQUERY:
				3679	c = *ecode++ - OP_TYPESTAR;
				3680	minimize = (c & 1) != 0;
				3681	min = rep_min[c]; /* Pick up values from tables; */
				3682	max = rep_max[c]; /* zero for max => infinity */
				3683	if (max == 0) max = INT_MAX;
				3684
				3685	/* Common code for all repeated single character type matches */
				3686
				3687	REPEATTYPE:
				3688	ctype = ecode++; / Code for the character type */
				3689
				3690	/* First, ensure the minimum number of matches are present. Use inline
				3691	code for maximizing the speed, and do the type test once at the start
				3692	(i.e. keep it out of the loop). Also test that there are at least the
				3693	minimum number of characters before we start. */
				3694
				3695	if (min > md->end_subject - eptr) FAIL;
				3696	if (min > 0) switch(ctype)
				3697	{
				3698	case OP_ANY:
				3699	if (!md->dotall)
				3700	{ for (i = 1; i <= min; i++) if (*eptr++ == '\n') FAIL; }
				3701	else eptr += min;
				3702	break;
				3703
				3704	case OP_NOT_DIGIT:
				3705	for (i = 1; i <= min; i++)
				3706	if ((pcre_ctypes[*eptr++] & ctype_digit) != 0) FAIL;
				3707	break;
				3708
				3709	case OP_DIGIT:
				3710	for (i = 1; i <= min; i++)
				3711	if ((pcre_ctypes[*eptr++] & ctype_digit) == 0) FAIL;
				3712	break;
				3713
				3714	case OP_NOT_WHITESPACE:
				3715	for (i = 1; i <= min; i++)
				3716	if ((pcre_ctypes[*eptr++] & ctype_space) != 0) FAIL;
				3717	break;
				3718
				3719	case OP_WHITESPACE:
				3720	for (i = 1; i <= min; i++)
				3721	if ((pcre_ctypes[*eptr++] & ctype_space) == 0) FAIL;
				3722	break;
				3723
				3724	case OP_NOT_WORDCHAR:
				3725	for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) != 0)
				3726	FAIL;
				3727	break;
				3728
				3729	case OP_WORDCHAR:
				3730	for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) == 0)
				3731	FAIL;
				3732	break;
				3733	}
				3734
				3735	/* If min = max, continue at the same level without recursing */
				3736
				3737	if (min == max) continue;
				3738
				3739	/* If minimizing, we have to test the rest of the pattern before each
				3740	subsequent match, so inlining isn't much help; just use the function. */
				3741
				3742	if (minimize)
				3743	{
				3744	for (i = min;; i++)
				3745	{
				3746	if (match(eptr, ecode, offset_top, md)) SUCCEED;
				3747	if (i >= max \|\| eptr >= md->end_subject \|\|
				3748	!match_type(ctype, *eptr++, md->dotall))
				3749	FAIL;
				3750	}
				3751	/* Control never gets here */
				3752	}
				3753
				3754	/* If maximizing it is worth using inline code for speed, doing the type
				3755	test once at the start (i.e. keep it out of the loop). */
				3756
				3757	else
				3758	{
				3759	uschar *pp = eptr;
				3760	switch(ctype)
				3761	{
				3762	case OP_ANY:
				3763	if (!md->dotall)
				3764	{
				3765	for (i = min; i < max; i++)
				3766	{
				3767	if (eptr >= md->end_subject \|\| *eptr == '\n') break;
				3768	eptr++;
				3769	}
				3770	}
				3771	else
				3772	{
				3773	c = max - min;
				3774	if (c > md->end_subject - eptr) c = md->end_subject - eptr;
				3775	eptr += c;
				3776	}
				3777	break;
				3778
				3779	case OP_NOT_DIGIT:
				3780	for (i = min; i < max; i++)
				3781	{
				3782	if (eptr >= md->end_subject \|\| (pcre_ctypes[*eptr] & ctype_digit) != 0)
				3783	break;
				3784	eptr++;
				3785	}
				3786	break;
				3787
				3788	case OP_DIGIT:
				3789	for (i = min; i < max; i++)
				3790	{
				3791	if (eptr >= md->end_subject \|\| (pcre_ctypes[*eptr] & ctype_digit) == 0)
				3792	break;
				3793	eptr++;
				3794	}
				3795	break;
				3796
				3797	case OP_NOT_WHITESPACE:
				3798	for (i = min; i < max; i++)
				3799	{
				3800	if (eptr >= md->end_subject \|\| (pcre_ctypes[*eptr] & ctype_space) != 0)
				3801	break;
				3802	eptr++;
				3803	}
				3804	break;
				3805
				3806	case OP_WHITESPACE:
				3807	for (i = min; i < max; i++)
				3808	{
				3809	if (eptr >= md->end_subject \|\| (pcre_ctypes[*eptr] & ctype_space) == 0)
				3810	break;
				3811	eptr++;
				3812	}
				3813	break;
				3814
				3815	case OP_NOT_WORDCHAR:
				3816	for (i = min; i < max; i++)
				3817	{
				3818	if (eptr >= md->end_subject \|\| (pcre_ctypes[*eptr] & ctype_word) != 0)
				3819	break;
				3820	eptr++;
				3821	}
				3822	break;
				3823
				3824	case OP_WORDCHAR:
				3825	for (i = min; i < max; i++)
				3826	{
				3827	if (eptr >= md->end_subject \|\| (pcre_ctypes[*eptr] & ctype_word) == 0)
				3828	break;
				3829	eptr++;
				3830	}
				3831	break;
				3832	}
				3833
				3834	while (eptr >= pp)
				3835	if (match(eptr--, ecode, offset_top, md)) SUCCEED;
				3836	FAIL;
				3837	}
				3838	/* Control never gets here */
				3839
				3840	/* There's been some horrible disaster. */
				3841
				3842	default:
				3843	#ifdef DEBUG
				3844	printf("Unknown opcode %d\n", *ecode);
				3845	#endif
				3846	md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
				3847	FAIL;
				3848	}
				3849
				3850	/* Do not stick any code in here without much thought; it is assumed
				3851	that "continue" in the code above comes out to here to repeat the main
				3852	loop. */
				3853
				3854	} /* End of main loop */
				3855	/* Control never reaches here */
				3856
				3857	fail:
				3858	if (md->point > save_stack_position)
				3859	{
				3860	/* If there are still points remaining on the stack, pop the next one off */
				3861	int start, end, off_num;
				3862
				3863	md->point--;
				3864	offset_top = md->offset_top[md->point];
				3865	eptr = md->eptr[md->point];
				3866	ecode = md->ecode[md->point];
				3867	off_num = md->off_num[md->point];
				3868	md->offset_vector[off_num] = md->r1[md->point];
				3869	md->offset_vector[off_num+1] = md->r2[md->point];
				3870	goto match_loop;
				3871	}
				3872	/* Failure, and nothing left on the stack, so end this function call */
				3873
				3874	/* Restore the top of the stack to where it was before this function
				3875	call. This lets us use one stack for everything; recursive calls
				3876	can push and pop information, and may increase the stack. When
				3877	the call returns, the parent function can resume pushing and
				3878	popping wherever it was. */
				3879
				3880	md->point = save_stack_position;
				3881	return FALSE;
				3882
				3883	succeed:
				3884	return TRUE;
				3885	}
				3886
				3887
				3888	/*************************************************
				3889	* Execute a Regular Expression *
				3890	*************************************************/
				3891
				3892	/* This function applies a compiled re to a subject string and picks out
				3893	portions of the string if it matches. Two elements in the vector are set for
				3894	each substring: the offsets to the start and end of the substring.
				3895
				3896	Arguments:
				3897	re points to the compiled expression
				3898	extra points to "hints" from pcre_study() or is NULL
				3899	subject points to the subject string
				3900	length length of subject string (may contain binary zeros)
				3901	options option bits
				3902	offsets points to a vector of ints to be filled in with offsets
				3903	offsetcount the number of elements in the vector
				3904
				3905	Returns: > 0 => success; value is the number of elements filled in
				3906	= 0 => success, but offsets is not big enough
				3907	-1 => failed to match
				3908	< -1 => some kind of unexpected problem
				3909	*/
				3910
				3911	int
				3912	pcre_exec(pcre external_re, pcre_extra external_extra, char *subject,
				3913	int length, int options, int *offsets, int offsetcount)
				3914	{
				3915	int resetcount;
				3916	int first_char = -1;
				3917	match_data match_block;
				3918	uschar *start_bits = NULL;
				3919	uschar start_match = (uschar )subject;
				3920	uschar *end_subject;
				3921	real_pcre re = (real_pcre )external_re;
				3922	real_pcre_extra extra = (real_pcre_extra )external_extra;
				3923	BOOL anchored = ((re->options \| options) & PCRE_ANCHORED) != 0;
				3924	BOOL startline = (re->options & PCRE_STARTLINE) != 0;
				3925
				3926	if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
				3927
				3928	if (re == NULL \|\| subject == NULL \|\|
				3929	(offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
				3930	if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
				3931
				3932	match_block.start_subject = (uschar *)subject;
				3933	match_block.end_subject = match_block.start_subject + length;
				3934	end_subject = match_block.end_subject;
				3935
				3936	match_block.caseless = ((re->options \| options) & PCRE_CASELESS) != 0;
				3937	match_block.multiline = ((re->options \|options) & PCRE_MULTILINE) != 0;
				3938	match_block.dotall = ((re->options \|options) & PCRE_DOTALL) != 0;
				3939
				3940	match_block.offset_vector = offsets; /* Where offsets go */
				3941	match_block.offset_end = (offsetcount & (-2)); /* Past max permitted (even) */
				3942	match_block.offset_overflow = FALSE;
				3943
				3944	match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
				3945
				3946	/* Set the stack state to empty */
				3947	match_block.off_num = match_block.offset_top = NULL;
				3948	match_block.r1 = match_block.r2 = NULL;
				3949	match_block.eptr = match_block.ecode = NULL;
				3950	match_block.point = match_block.length = 0;
				3951
				3952	/* Compute the minimum number of offsets that we need to reset each time. Doing
				3953	this makes a huge difference to execution time when there aren't many brackets
				3954	in the pattern. */
				3955
				3956	resetcount = 2 + re->top_bracket * 2;
				3957	if (resetcount > offsetcount) resetcount = offsetcount;
				3958
				3959	/* If MULTILINE is set at exec time but was not set at compile time, and the
				3960	anchored flag is set, we must re-check because a setting provoked by ^ in the
				3961	pattern is not right in multi-line mode. Calling is_anchored() again here does
				3962	the right check, because multiline is now set. If it now yields FALSE, the
				3963	expression must have had ^ starting some of its branches. Check to see if
				3964	that is true for all branches, and if so, set the startline flag. */
				3965
				3966	if (match_block. multiline && anchored && (re->options & PCRE_MULTILINE) == 0 &&
				3967	!is_anchored(re->code, match_block.multiline))
				3968	{
				3969	anchored = FALSE;
				3970	if (is_startline(re->code)) startline = TRUE;
				3971	}
				3972
				3973	/* Set up the first character to match, if available. The first_char value is
				3974	never set for an anchored regular expression, but the anchoring may be forced
				3975	at run time, so we have to test for anchoring. The first char may be unset for
				3976	an unanchored pattern, of course. If there's no first char and the pattern was
				3977	studied, the may be a bitmap of possible first characters. However, we can
				3978	use this only if the caseless state of the studying was correct. */
				3979
				3980	if (!anchored)
				3981	{
				3982	if ((re->options & PCRE_FIRSTSET) != 0)
				3983	{
				3984	first_char = re->first_char;
				3985	if (match_block.caseless) first_char = pcre_lcc[first_char];
				3986	}
				3987	else
				3988	if (!startline && extra != NULL &&
				3989	(extra->options & PCRE_STUDY_MAPPED) != 0 &&
				3990	((extra->options & PCRE_STUDY_CASELESS) != 0) == match_block.caseless)
				3991	start_bits = extra->start_bits;
				3992	}
				3993
				3994	/* Loop for unanchored matches; for anchored regexps the loop runs just once. */
				3995
				3996	do
				3997	{
				3998	register int *iptr = offsets;
				3999	register int *iend = offsets + resetcount;
				4000
				4001	/* Reset the maximum number of extractions we might see. */
				4002
				4003	while (iptr < iend) *iptr++ = -1;
				4004
				4005	/* Advance to a unique first char if possible */
				4006
				4007	if (first_char >= 0)
				4008	{
				4009	if (match_block.caseless)
				4010	while (start_match < end_subject && pcre_lcc[*start_match] != first_char)
				4011	start_match++;
				4012	else
				4013	while (start_match < end_subject && *start_match != first_char)
				4014	start_match++;
				4015	}
				4016
				4017	/* Or to just after \n for a multiline match if possible */
				4018
				4019	else if (startline)
				4020	{
				4021	if (start_match > match_block.start_subject)
				4022	{
				4023	while (start_match < end_subject && start_match[-1] != '\n')
				4024	start_match++;
				4025	}
				4026	}
				4027
				4028	/* Or to a non-unique first char */
				4029
				4030	else if (start_bits != NULL)
				4031	{
				4032	while (start_match < end_subject)
				4033	{
				4034	register int c = *start_match;
				4035	if ((start_bits[c/8] & (1<<(c%8))) == 0) start_match++; else break;
				4036	}
				4037	}
				4038
				4039	#ifdef DEBUG
				4040	printf(">>>> Match against: ");
				4041	pchars(start_match, end_subject - start_match, TRUE, &match_block);
				4042	printf("\n");
				4043	#endif
				4044
				4045	/* When a match occurs, substrings will be set for all internal extractions;
				4046	we just need to set up the whole thing as substring 0 before returning. If
				4047	there were too many extractions, set the return code to zero. */
				4048
				4049	if (match(start_match, re->code, 2, &match_block))
				4050	{
				4051	int rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
				4052	if (match_block.offset_end < 2) rc = 0; else
				4053	{
				4054	offsets[0] = start_match - match_block.start_subject;
				4055	offsets[1] = match_block.end_match_ptr - match_block.start_subject;
				4056	}
				4057	#ifdef DEBUG
				4058	printf(">>>> returning %d\n", rc);
				4059	#endif
				4060	free_stack(&match_block);
				4061	return rc;
				4062	}
				4063	}
				4064	while (!anchored &&
				4065	match_block.errorcode == PCRE_ERROR_NOMATCH &&
				4066	start_match++ < end_subject);
				4067
				4068	#ifdef DEBUG
				4069	printf(">>>> returning %d\n", match_block.errorcode);
				4070	#endif
				4071	free_stack(&match_block);
				4072	return match_block.errorcode;
				4073	}
				4074
				4075	/* End of pcre.c */