blob: 468751d2c81e2f3c43ac8249aaa6a8a255fef44e [file] [log] [blame]
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001
2/*************************************************
3* Perl-Compatible Regular Expressions *
4*************************************************/
5
6/* DO NOT EDIT THIS FILE! */
7
8/* This file is automatically written by the merge-files.py script
9included with the PCRE distribution for Python; it's produced from
10several C files, and code is removed in the process. If you want to
11modify the code or track down bugs, it will be much easier to work
12with the code in its original, multiple-file form. Don't edit this
13file by hand, or submit patches to it.
14
15The Python-specific PCRE distribution can be retrieved from
16 http://starship.skyport.net/crew/amk/regex/
17
Guido van Rossum58132c61997-12-17 00:24:13 +000018The unmodified original PCRE distribution is available at
19ftp://ftp.cus.cam.ac.uk/pub/software/programs/pcre/, and is originally
20written by: Philip Hazel <ph10@cam.ac.uk>
Guido van Rossum51b3aa31997-10-06 14:43:11 +000021
22Extensively modified by the Python String-SIG: <string-sig@python.org>
23Send bug reports to: <string-sig@python.org>
24(They'll figure out if it's a bug in PCRE or in the Python-specific
25changes.)
26
27 Copyright (c) 1997 University of Cambridge
28
29-----------------------------------------------------------------------------
30Permission is granted to anyone to use this software for any purpose on any
31computer system, and to redistribute it freely, subject to the following
32restrictions:
33
341. This software is distributed in the hope that it will be useful,
35 but WITHOUT ANY WARRANTY; without even the implied warranty of
36 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
37
382. The origin of this software must not be misrepresented, either by
39 explicit claim or by omission.
40
413. Altered versions must be plainly marked as such, and must not be
42 misrepresented as being the original software.
43-----------------------------------------------------------------------------
44*/
45
46
47#define FOR_PYTHON
Guido van Rossum58132c61997-12-17 00:24:13 +000048#include "pcre-int.h"
Guido van Rossum51b3aa31997-10-06 14:43:11 +000049#include "Python.h"
Guido van Rossum50700601997-12-08 17:15:20 +000050#include "mymalloc.h"
51#include <ctype.h>
Guido van Rossum51b3aa31997-10-06 14:43:11 +000052#include "graminit.h"
53
54/*************************************************
55* Perl-Compatible Regular Expressions *
56*************************************************/
57
58/* This file is automatically written by the makechartables auxiliary
59program. If you edit it by hand, you might like to edit the Makefile to
60prevent its ever being regenerated. */
61
62/* This table is a lower casing table. */
63
64unsigned char pcre_lcc[] = {
65 0, 1, 2, 3, 4, 5, 6, 7,
66 8, 9, 10, 11, 12, 13, 14, 15,
67 16, 17, 18, 19, 20, 21, 22, 23,
68 24, 25, 26, 27, 28, 29, 30, 31,
69 32, 33, 34, 35, 36, 37, 38, 39,
70 40, 41, 42, 43, 44, 45, 46, 47,
71 48, 49, 50, 51, 52, 53, 54, 55,
72 56, 57, 58, 59, 60, 61, 62, 63,
73 64, 97, 98, 99,100,101,102,103,
74 104,105,106,107,108,109,110,111,
75 112,113,114,115,116,117,118,119,
76 120,121,122, 91, 92, 93, 94, 95,
77 96, 97, 98, 99,100,101,102,103,
78 104,105,106,107,108,109,110,111,
79 112,113,114,115,116,117,118,119,
80 120,121,122,123,124,125,126,127,
81 128,129,130,131,132,133,134,135,
82 136,137,138,139,140,141,142,143,
83 144,145,146,147,148,149,150,151,
84 152,153,154,155,156,157,158,159,
85 160,161,162,163,164,165,166,167,
86 168,169,170,171,172,173,174,175,
87 176,177,178,179,180,181,182,183,
88 184,185,186,187,188,189,190,191,
89 192,193,194,195,196,197,198,199,
90 200,201,202,203,204,205,206,207,
91 208,209,210,211,212,213,214,215,
92 216,217,218,219,220,221,222,223,
93 224,225,226,227,228,229,230,231,
94 232,233,234,235,236,237,238,239,
95 240,241,242,243,244,245,246,247,
96 248,249,250,251,252,253,254,255 };
97
Guido van Rossum50700601997-12-08 17:15:20 +000098/* This table is a case flipping table. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +000099
Guido van Rossum50700601997-12-08 17:15:20 +0000100unsigned char pcre_fcc[] = {
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000101 0, 1, 2, 3, 4, 5, 6, 7,
102 8, 9, 10, 11, 12, 13, 14, 15,
103 16, 17, 18, 19, 20, 21, 22, 23,
104 24, 25, 26, 27, 28, 29, 30, 31,
105 32, 33, 34, 35, 36, 37, 38, 39,
106 40, 41, 42, 43, 44, 45, 46, 47,
107 48, 49, 50, 51, 52, 53, 54, 55,
108 56, 57, 58, 59, 60, 61, 62, 63,
Guido van Rossum50700601997-12-08 17:15:20 +0000109 64, 97, 98, 99,100,101,102,103,
110 104,105,106,107,108,109,110,111,
111 112,113,114,115,116,117,118,119,
112 120,121,122, 91, 92, 93, 94, 95,
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000113 96, 65, 66, 67, 68, 69, 70, 71,
114 72, 73, 74, 75, 76, 77, 78, 79,
115 80, 81, 82, 83, 84, 85, 86, 87,
116 88, 89, 90,123,124,125,126,127,
117 128,129,130,131,132,133,134,135,
118 136,137,138,139,140,141,142,143,
119 144,145,146,147,148,149,150,151,
120 152,153,154,155,156,157,158,159,
121 160,161,162,163,164,165,166,167,
122 168,169,170,171,172,173,174,175,
123 176,177,178,179,180,181,182,183,
124 184,185,186,187,188,189,190,191,
125 192,193,194,195,196,197,198,199,
126 200,201,202,203,204,205,206,207,
127 208,209,210,211,212,213,214,215,
128 216,217,218,219,220,221,222,223,
129 224,225,226,227,228,229,230,231,
130 232,233,234,235,236,237,238,239,
131 240,241,242,243,244,245,246,247,
132 248,249,250,251,252,253,254,255 };
133
Guido van Rossum50700601997-12-08 17:15:20 +0000134/* This table contains bit maps for digits, letters, 'word' chars, and
135white space. Each map is 32 bytes long and the bits run from the least
136significant end of each byte. */
137
138unsigned char pcre_cbits[] = {
139 0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
140 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
141 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
142 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
143
144 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
145 0xfe,0xff,0xff,0x07,0xfe,0xff,0xff,0x07,
146 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
147 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
148
149 0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
150 0xfe,0xff,0xff,0x87,0xfe,0xff,0xff,0x07,
151 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
152 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
153
154 0x00,0x3e,0x00,0x00,0x01,0x00,0x00,0x00,
155 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
156 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
157 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 };
158
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000159/* This table identifies various classes of character by individual bits:
Guido van Rossum50700601997-12-08 17:15:20 +0000160 0x01 white space character
161 0x02 letter
162 0x04 decimal digit
163 0x08 hexadecimal digit
164 0x10 alphanumeric or '_'
165 0x80 regular expression metacharacter or binary zero
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000166*/
167
168unsigned char pcre_ctypes[] = {
169 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
170 0x00,0x01,0x01,0x01,0x01,0x01,0x00,0x00, /* 8- 15 */
171 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
172 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
173 0x01,0x00,0x00,0x00,0x80,0x00,0x00,0x00, /* - ' */
174 0x80,0x80,0x80,0x80,0x00,0x00,0x80,0x00, /* ( - / */
Guido van Rossum50700601997-12-08 17:15:20 +0000175 0x3c,0x3c,0x3c,0x3c,0x3c,0x3c,0x3c,0x3c, /* 0 - 7 */
176 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x80, /* 8 - ? */
177 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* @ - G */
178 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* H - O */
179 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* P - W */
180 0x12,0x12,0x12,0x80,0x00,0x00,0x80,0x10, /* X - _ */
181 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* ` - g */
182 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* h - o */
183 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* p - w */
184 0x12,0x12,0x12,0x80,0x80,0x00,0x00,0x00, /* x -127 */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000185 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
186 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
187 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
188 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
189 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
190 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
191 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
192 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
193 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
194 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
195 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
196 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
197 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
198 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
199 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
200 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
201
Guido van Rossum50700601997-12-08 17:15:20 +0000202/* End of chartables.c */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000203/*************************************************
204* Perl-Compatible Regular Expressions *
205*************************************************/
206
207/*
208This is a library of functions to support regular expressions whose syntax
209and semantics are as close as possible to those of the Perl 5 language. See
210the file Tech.Notes for some information on the internals.
211
212Written by: Philip Hazel <ph10@cam.ac.uk>
213
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000214 Copyright (c) 1998 University of Cambridge
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000215
216-----------------------------------------------------------------------------
217Permission is granted to anyone to use this software for any purpose on any
218computer system, and to redistribute it freely, subject to the following
219restrictions:
220
2211. This software is distributed in the hope that it will be useful,
222 but WITHOUT ANY WARRANTY; without even the implied warranty of
223 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
224
2252. The origin of this software must not be misrepresented, either by
226 explicit claim or by omission.
227
2283. Altered versions must be plainly marked as such, and must not be
229 misrepresented as being the original software.
230-----------------------------------------------------------------------------
231*/
232
233
234/* Include the internals header, which itself includes Standard C headers plus
235the external pcre header. */
236
237
238
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000239
240/*************************************************
241* Create bitmap of starting chars *
242*************************************************/
243
244/* This function scans a compiled unanchored expression and attempts to build a
245bitmap of the set of initial characters. If it can't, it returns FALSE. As time
246goes by, we may be able to get more clever at doing this.
247
248Arguments:
249 code points to an expression
250 start_bits points to a 32-byte table, initialized to 0
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000251
252Returns: TRUE if table built, FALSE otherwise
253*/
254
255static BOOL
Guido van Rossum58132c61997-12-17 00:24:13 +0000256set_start_bits(const uschar *code, uschar *start_bits)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000257{
Guido van Rossum50700601997-12-08 17:15:20 +0000258register int c;
Guido van Rossum95864d31998-12-21 18:35:49 +0000259volatile int dummy;
Guido van Rossum50700601997-12-08 17:15:20 +0000260
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000261do
262 {
Guido van Rossum58132c61997-12-17 00:24:13 +0000263 const uschar *tcode = code + 3;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000264 BOOL try_next = TRUE;
265
266 while (try_next)
267 {
268 try_next = FALSE;
269
270 if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
271 {
Guido van Rossum50700601997-12-08 17:15:20 +0000272 if (!set_start_bits(tcode, start_bits)) return FALSE;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000273 }
274
275 else switch(*tcode)
276 {
277 default:
278 return FALSE;
279
280 /* BRAZERO does the bracket, but carries on. */
281
282 case OP_BRAZERO:
283 case OP_BRAMINZERO:
Guido van Rossum50700601997-12-08 17:15:20 +0000284 if (!set_start_bits(++tcode, start_bits)) return FALSE;
Guido van Rossum95864d31998-12-21 18:35:49 +0000285 dummy = 1;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000286 do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
287 tcode += 3;
288 try_next = TRUE;
289 break;
290
291 /* Single-char * or ? sets the bit and tries the next item */
292
293 case OP_STAR:
294 case OP_MINSTAR:
295 case OP_QUERY:
296 case OP_MINQUERY:
Guido van Rossum50700601997-12-08 17:15:20 +0000297 start_bits[tcode[1]/8] |= (1 << (tcode[1]&7));
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000298 tcode += 2;
299 try_next = TRUE;
300 break;
301
302 /* Single-char upto sets the bit and tries the next */
303
304 case OP_UPTO:
305 case OP_MINUPTO:
Guido van Rossum50700601997-12-08 17:15:20 +0000306 start_bits[tcode[3]/8] |= (1 << (tcode[3]&7));
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000307 tcode += 4;
308 try_next = TRUE;
309 break;
310
311 /* At least one single char sets the bit and stops */
312
313 case OP_EXACT: /* Fall through */
314 tcode++;
315
316 case OP_CHARS: /* Fall through */
317 tcode++;
318
319 case OP_PLUS:
320 case OP_MINPLUS:
Guido van Rossum50700601997-12-08 17:15:20 +0000321 start_bits[tcode[1]/8] |= (1 << (tcode[1]&7));
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000322 break;
323
324 /* Single character type sets the bits and stops */
325
326 case OP_NOT_DIGIT:
Guido van Rossum50700601997-12-08 17:15:20 +0000327 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_digit];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000328 break;
329
330 case OP_DIGIT:
Guido van Rossum50700601997-12-08 17:15:20 +0000331 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_digit];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000332 break;
333
334 case OP_NOT_WHITESPACE:
Guido van Rossum50700601997-12-08 17:15:20 +0000335 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_space];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000336 break;
337
338 case OP_WHITESPACE:
Guido van Rossum50700601997-12-08 17:15:20 +0000339 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_space];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000340 break;
341
342 case OP_NOT_WORDCHAR:
Guido van Rossum50700601997-12-08 17:15:20 +0000343 for (c = 0; c < 32; c++)
344 start_bits[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000345 break;
346
347 case OP_WORDCHAR:
Guido van Rossum50700601997-12-08 17:15:20 +0000348 for (c = 0; c < 32; c++)
349 start_bits[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000350 break;
351
352 /* One or more character type fudges the pointer and restarts, knowing
353 it will hit a single character type and stop there. */
354
355 case OP_TYPEPLUS:
356 case OP_TYPEMINPLUS:
357 tcode++;
358 try_next = TRUE;
359 break;
360
361 case OP_TYPEEXACT:
362 tcode += 3;
363 try_next = TRUE;
364 break;
365
366 /* Zero or more repeats of character types set the bits and then
367 try again. */
368
369 case OP_TYPEUPTO:
370 case OP_TYPEMINUPTO:
371 tcode += 2; /* Fall through */
372
373 case OP_TYPESTAR:
374 case OP_TYPEMINSTAR:
375 case OP_TYPEQUERY:
376 case OP_TYPEMINQUERY:
377 switch(tcode[1])
378 {
379 case OP_NOT_DIGIT:
Guido van Rossum50700601997-12-08 17:15:20 +0000380 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_digit];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000381 break;
382
383 case OP_DIGIT:
Guido van Rossum50700601997-12-08 17:15:20 +0000384 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_digit];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000385 break;
386
387 case OP_NOT_WHITESPACE:
Guido van Rossum50700601997-12-08 17:15:20 +0000388 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_space];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000389 break;
390
391 case OP_WHITESPACE:
Guido van Rossum50700601997-12-08 17:15:20 +0000392 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_space];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000393 break;
394
395 case OP_NOT_WORDCHAR:
Guido van Rossum50700601997-12-08 17:15:20 +0000396 for (c = 0; c < 32; c++)
397 start_bits[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000398 break;
399
400 case OP_WORDCHAR:
Guido van Rossum50700601997-12-08 17:15:20 +0000401 for (c = 0; c < 32; c++)
402 start_bits[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000403 break;
404 }
405
406 tcode += 2;
407 try_next = TRUE;
408 break;
409
410 /* Character class: set the bits and either carry on or not,
411 according to the repeat count. */
412
413 case OP_CLASS:
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000414 case OP_NEGCLASS:
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000415 {
Guido van Rossum50700601997-12-08 17:15:20 +0000416 tcode++;
417 for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
418 tcode += 32;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000419 switch (*tcode)
420 {
421 case OP_CRSTAR:
422 case OP_CRMINSTAR:
423 case OP_CRQUERY:
424 case OP_CRMINQUERY:
425 tcode++;
426 try_next = TRUE;
427 break;
428
429 case OP_CRRANGE:
430 case OP_CRMINRANGE:
431 if (((tcode[1] << 8) + tcode[2]) == 0)
432 {
433 tcode += 5;
434 try_next = TRUE;
435 }
436 break;
437 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000438 }
439 break; /* End of class handling */
440
441 } /* End of switch */
442 } /* End of try_next loop */
443
444 code += (code[1] << 8) + code[2]; /* Advance to next branch */
445 }
446while (*code == OP_ALT);
447return TRUE;
448}
449
450
451
452/*************************************************
453* Study a compiled expression *
454*************************************************/
455
456/* This function is handed a compiled expression that it must study to produce
457information that will speed up the matching. It returns a pcre_extra block
458which then gets handed back to pcre_exec().
459
460Arguments:
461 re points to the compiled expression
462 options contains option bits
463 errorptr points to where to place error messages;
464 set NULL unless error
465
466Returns: pointer to a pcre_extra block,
467 NULL on error or if no optimization possible
468*/
469
470pcre_extra *
Guido van Rossum58132c61997-12-17 00:24:13 +0000471pcre_study(const pcre *external_re, int options, const char **errorptr)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000472{
473BOOL caseless;
474uschar start_bits[32];
475real_pcre_extra *extra;
Guido van Rossum58132c61997-12-17 00:24:13 +0000476const real_pcre *re = (const real_pcre *)external_re;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000477
478*errorptr = NULL;
479
480if (re == NULL || re->magic_number != MAGIC_NUMBER)
481 {
482 *errorptr = "argument is not a compiled regular expression";
483 return NULL;
484 }
485
486if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
487 {
488 *errorptr = "unknown or incorrect option bit(s) set";
489 return NULL;
490 }
491
Guido van Rossum50700601997-12-08 17:15:20 +0000492/* Caseless can either be from the compiled regex or from options. */
493
494caseless = ((re->options | options) & PCRE_CASELESS) != 0;
495
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000496/* For an anchored pattern, or an unchored pattern that has a first char, or a
497multiline pattern that matches only at "line starts", no further processing at
498present. */
499
500if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
501 return NULL;
502
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000503/* See if we can find a fixed set of initial characters for the pattern. */
504
Guido van Rossum50700601997-12-08 17:15:20 +0000505memset(start_bits, 0, 32 * sizeof(uschar));
506if (!set_start_bits(re->code, start_bits)) return NULL;
507
508/* If this studying is caseless, scan the created bit map and duplicate the
509bits for any letters. */
510
511if (caseless)
512 {
513 register int c;
514 for (c = 0; c < 256; c++)
515 {
516 if ((start_bits[c/8] & (1 << (c&7))) != 0 &&
517 (pcre_ctypes[c] & ctype_letter) != 0)
518 {
519 int d = pcre_fcc[c];
520 start_bits[d/8] |= (1 << (d&7));
521 }
522 }
523 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000524
525/* Get an "extra" block and put the information therein. */
526
527extra = (real_pcre_extra *)(pcre_malloc)(sizeof(real_pcre_extra));
528
529if (extra == NULL)
530 {
531 *errorptr = "failed to get memory";
532 return NULL;
533 }
Guido van Rossum50700601997-12-08 17:15:20 +0000534
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000535extra->options = PCRE_STUDY_MAPPED | (caseless? PCRE_STUDY_CASELESS : 0);
Guido van Rossum50700601997-12-08 17:15:20 +0000536memcpy(extra->start_bits, start_bits, sizeof(start_bits));
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000537
538return (pcre_extra *)extra;
539}
540
Guido van Rossum50700601997-12-08 17:15:20 +0000541/* End of study.c */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000542/*************************************************
543* Perl-Compatible Regular Expressions *
544*************************************************/
545
546/*
547This is a library of functions to support regular expressions whose syntax
548and semantics are as close as possible to those of the Perl 5 language. See
549the file Tech.Notes for some information on the internals.
550
551Written by: Philip Hazel <ph10@cam.ac.uk>
552
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000553 Copyright (c) 1998 University of Cambridge
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000554
555-----------------------------------------------------------------------------
556Permission is granted to anyone to use this software for any purpose on any
557computer system, and to redistribute it freely, subject to the following
558restrictions:
559
5601. This software is distributed in the hope that it will be useful,
561 but WITHOUT ANY WARRANTY; without even the implied warranty of
562 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
563
5642. The origin of this software must not be misrepresented, either by
565 explicit claim or by omission.
566
5673. Altered versions must be plainly marked as such, and must not be
568 misrepresented as being the original software.
569-----------------------------------------------------------------------------
570*/
571
572
573/* Define DEBUG to get debugging output on stdout. */
574
575/* #define DEBUG */
576
Guido van Rossum557dea11997-12-22 22:46:52 +0000577/* Use a macro for debugging printing, 'cause that eliminates the the use
578of #ifdef inline, and there are *still* stupid compilers about that don't like
579indented pre-processor statements. I suppose it's only been 10 years... */
580
581#ifdef DEBUG
582#define DPRINTF(p) printf p
583#else
584#define DPRINTF(p) /*nothing*/
585#endif
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000586
587/* Include the internals header, which itself includes Standard C headers plus
588the external pcre header. */
589
590
Guido van Rossum50700601997-12-08 17:15:20 +0000591
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000592
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000593#ifndef Py_eval_input
594/* For Python 1.4, graminit.h has to be explicitly included */
595#define Py_eval_input eval_input
Guido van Rossum50700601997-12-08 17:15:20 +0000596
597#endif /* FOR_PYTHON */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000598
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000599/* Allow compilation as C++ source code, should anybody want to do that. */
600
601#ifdef __cplusplus
602#define class pcre_class
603#endif
604
605
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000606/* Min and max values for the common repeats; for the maxima, 0 => infinity */
607
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000608static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
609static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000610
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000611/* Text forms of OP_ values and things, for debugging (not all used) */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000612
613#ifdef DEBUG
Guido van Rossum58132c61997-12-17 00:24:13 +0000614static const char *OP_names[] = {
615 "End", "\\A", "\\B", "\\b", "\\D", "\\d",
Guido van Rossum50700601997-12-08 17:15:20 +0000616 "\\S", "\\s", "\\W", "\\w", "Cut", "\\Z",
617 "localized \\B", "localized \\b", "localized \\W", "localized \\w",
618 "^", "$", "Any", "chars",
619 "not",
620 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000621 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
622 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
623 "*", "*?", "+", "+?", "?", "??", "{", "{",
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000624 "class", "negclass", "classL", "Ref",
Guido van Rossum50700601997-12-08 17:15:20 +0000625 "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", "Once",
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000626 "Brazero", "Braminzero", "Bra"
627};
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000628#endif
629
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000630/* Table for handling escaped characters in the range '0'-'z'. Positive returns
631are simple data values; negative values are for special things like \d and so
632on. Zero means further processing is needed (for things like \x), or the escape
633is invalid. */
634
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000635static const short int escapes[] = {
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000636 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
637 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
638 '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
639 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
640 0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
641 0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
642 '`', 7, -ESC_b, 0, -ESC_d, 0, '\f', 0, /* ` - g */
643 0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
644 0, 0, '\r', -ESC_s, '\t', 0, '\v', -ESC_w, /* p - w */
645 0, 0, 0 /* x - z */
646};
647
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000648/* Definition to allow mutual recursion */
649
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000650static BOOL
651compile_regex(int, int *, uschar **, const uschar **, const char **,
652 PyObject *);
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000653
654/* Structure for passing "static" information around between the functions
655doing the matching, so that they are thread-safe. */
656
657typedef struct match_data {
658 int errorcode; /* As it says */
659 int *offset_vector; /* Offset vector */
660 int offset_end; /* One past the end */
661 BOOL offset_overflow; /* Set if too many extractions */
662 BOOL caseless; /* Case-independent flag */
Guido van Rossum50700601997-12-08 17:15:20 +0000663 BOOL runtime_caseless; /* Caseless forced at run time */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000664 BOOL multiline; /* Multiline flag */
Guido van Rossum50700601997-12-08 17:15:20 +0000665 BOOL notbol; /* NOTBOL flag */
666 BOOL noteol; /* NOTEOL flag */
667 BOOL dotall; /* Dot matches any char */
668 BOOL endonly; /* Dollar not before final \n */
Guido van Rossum58132c61997-12-17 00:24:13 +0000669 const uschar *start_subject; /* Start of the subject string */
670 const uschar *end_subject; /* End of the subject string */
Guido van Rossum50700601997-12-08 17:15:20 +0000671 jmp_buf fail_env; /* Environment for longjump() break out */
Guido van Rossum58132c61997-12-17 00:24:13 +0000672 const uschar *end_match_ptr; /* Subject position at end match */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000673 int end_offset_top; /* Highwater mark at end of match */
Guido van Rossum50700601997-12-08 17:15:20 +0000674 jmp_buf error_env; /* For longjmp() if an error occurs deep inside a
675 matching operation */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000676 int length; /* Length of the allocated stacks */
677 int point; /* Point to add next item pushed onto stacks */
678 /* Pointers to the 6 stacks */
679 int *off_num, *offset_top, *r1, *r2;
Guido van Rossum58132c61997-12-17 00:24:13 +0000680 const uschar **eptr, **ecode;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000681} match_data;
682
683
684
685/*************************************************
Guido van Rossum50700601997-12-08 17:15:20 +0000686* Global variables *
687*************************************************/
688
689/* PCRE is thread-clean and doesn't use any global variables in the normal
690sense. However, it calls memory allocation and free functions via the two
691indirections below, which are can be changed by the caller, but are shared
692between all threads. */
693
694void *(*pcre_malloc)(size_t) = malloc;
695void (*pcre_free)(void *) = free;
696
697
698
699
700/*************************************************
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000701* Return version string *
702*************************************************/
703
Guido van Rossum58132c61997-12-17 00:24:13 +0000704const char *
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000705pcre_version(void)
706{
707return PCRE_VERSION;
708}
709
710
711
712
713/*************************************************
714* Return info about a compiled pattern *
715*************************************************/
716
717/* This function picks potentially useful data out of the private
718structure.
719
720Arguments:
721 external_re points to compiled code
722 optptr where to pass back the options
723 first_char where to pass back the first character,
724 or -1 if multiline and all branches start ^,
725 or -2 otherwise
726
727Returns: number of identifying extraction brackets
728 or negative values on error
729*/
730
731int
Guido van Rossum50700601997-12-08 17:15:20 +0000732pcre_info(const pcre *external_re, int *optptr, int *first_char)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000733{
Guido van Rossum58132c61997-12-17 00:24:13 +0000734const real_pcre *re = (real_pcre *)external_re;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000735if (re == NULL) return PCRE_ERROR_NULL;
736if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
737if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);
738if (first_char != NULL)
739 *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
740 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
741return re->top_bracket;
742}
743
744
745
746
747#ifdef DEBUG
748/*************************************************
749* Debugging function to print chars *
750*************************************************/
751
752/* Print a sequence of chars in printable format, stopping at the end of the
753subject if the requested.
754
755Arguments:
756 p points to characters
757 length number to print
758 is_subject TRUE if printing from within md->start_subject
759 md pointer to matching data block, if is_subject is TRUE
760
761Returns: nothing
762*/
763
Guido van Rossum557dea11997-12-22 22:46:52 +0000764static void
765pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000766{
767int c;
768if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
769while (length-- > 0)
770 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
771}
772#endif
773
774
775
776
777/*************************************************
778* Check subpattern for empty operand *
779*************************************************/
780
781/* This function checks a bracketed subpattern to see if any of the paths
782through it could match an empty string. This is used to diagnose an error if
783such a subpattern is followed by a quantifier with an unlimited upper bound.
784
785Argument:
786 code points to the opening bracket
787
788Returns: TRUE or FALSE
789*/
790
791static BOOL
792could_be_empty(uschar *code)
793{
794do {
795 uschar *cc = code + 3;
796
797 /* Scan along the opcodes for this branch; as soon as we find something
798 that matches a non-empty string, break out and advance to test the next
799 branch. If we get to the end of the branch, return TRUE for the whole
800 sub-expression. */
801
802 for (;;)
803 {
804 /* Test an embedded subpattern; if it could not be empty, break the
805 loop. Otherwise carry on in the branch. */
806
Guido van Rossum50700601997-12-08 17:15:20 +0000807 if ((int)(*cc) >= OP_BRA || (int)(*cc) == OP_ONCE)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000808 {
809 if (!could_be_empty(cc)) break;
810 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
811 cc += 3;
812 }
813
814 else switch (*cc)
815 {
816 /* Reached end of a branch: the subpattern may match the empty string */
817
818 case OP_ALT:
819 case OP_KET:
820 case OP_KETRMAX:
821 case OP_KETRMIN:
822 return TRUE;
823
Guido van Rossumd0f432b1998-02-20 21:45:14 +0000824 /* Skip over entire bracket groups with zero lower bound */
825
826 case OP_BRAZERO:
827 case OP_BRAMINZERO:
828 cc++;
829 /* Fall through */
830
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000831 /* Skip over assertive subpatterns */
832
833 case OP_ASSERT:
834 case OP_ASSERT_NOT:
835 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
836 cc += 3;
837 break;
838
839 /* Skip over things that don't match chars */
840
841 case OP_SOD:
842 case OP_EOD:
843 case OP_CIRC:
844 case OP_DOLL:
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000845 case OP_NOT_WORD_BOUNDARY:
846 case OP_WORD_BOUNDARY:
Guido van Rossum50700601997-12-08 17:15:20 +0000847 case OP_NOT_WORD_BOUNDARY_L:
848 case OP_WORD_BOUNDARY_L:
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000849 cc++;
850 break;
851
852 /* Skip over simple repeats with zero lower bound */
853
854 case OP_STAR:
855 case OP_MINSTAR:
856 case OP_QUERY:
857 case OP_MINQUERY:
Guido van Rossum50700601997-12-08 17:15:20 +0000858 case OP_NOTSTAR:
859 case OP_NOTMINSTAR:
860 case OP_NOTQUERY:
861 case OP_NOTMINQUERY:
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000862 case OP_TYPESTAR:
863 case OP_TYPEMINSTAR:
864 case OP_TYPEQUERY:
865 case OP_TYPEMINQUERY:
866 cc += 2;
867 break;
868
869 /* Skip over UPTOs (lower bound is zero) */
870
871 case OP_UPTO:
872 case OP_MINUPTO:
873 case OP_TYPEUPTO:
874 case OP_TYPEMINUPTO:
875 cc += 4;
876 break;
877
878 /* Check a class or a back reference for a zero minimum */
879
880 case OP_CLASS:
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000881 case OP_NEGCLASS:
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000882 case OP_REF:
Guido van Rossum50700601997-12-08 17:15:20 +0000883 case OP_CLASS_L:
884 switch(*cc)
885 {
886 case (OP_REF): cc += 2; break;
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000887 case (OP_CLASS): case (OP_NEGCLASS): cc += 1+32; break;
Guido van Rossum50700601997-12-08 17:15:20 +0000888 case (OP_CLASS_L): cc += 1+1+32; break;
889 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000890
891 switch (*cc)
892 {
893 case OP_CRSTAR:
894 case OP_CRMINSTAR:
895 case OP_CRQUERY:
896 case OP_CRMINQUERY:
897 cc++;
898 break;
899
900 case OP_CRRANGE:
901 case OP_CRMINRANGE:
902 if ((cc[1] << 8) + cc[2] != 0) goto NEXT_BRANCH;
903 cc += 3;
904 break;
905
906 default:
907 goto NEXT_BRANCH;
908 }
909 break;
910
911 /* Anything else matches at least one character */
912
913 default:
914 goto NEXT_BRANCH;
915 }
916 }
917
918 NEXT_BRANCH:
919 code += (code[1] << 8) + code[2];
920 }
921while (*code == OP_ALT);
922
923/* No branches match the empty string */
924
925return FALSE;
926}
927
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000928/* Determine the length of a group ID in an expression like
929 (?P<foo_123>...)
930Arguments:
931 ptr pattern position pointer (say that 3 times fast)
932 finalchar the character that will mark the end of the ID
933 errorptr points to the pointer to the error message
934*/
935
936static int
Guido van Rossum58132c61997-12-17 00:24:13 +0000937get_group_id(const uschar *ptr, char finalchar, const char **errorptr)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000938{
Guido van Rossum58132c61997-12-17 00:24:13 +0000939 const uschar *start = ptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000940
941 /* If the first character is not in \w, or is in \w but is a digit,
942 report an error */
943 if (!(pcre_ctypes[*ptr] & ctype_word) ||
944 (pcre_ctypes[*ptr++] & ctype_digit))
945 {
946 *errorptr = "(?P identifier must start with a letter or underscore";
947 return 0;
948 }
949
950 /* Increment ptr until we either hit a null byte, the desired
951 final character, or a non-word character */
952 for(; (*ptr != 0) && (*ptr != finalchar) &&
953 (pcre_ctypes[*ptr] & ctype_word); ptr++)
954 {
Guido van Rossumc3861071997-10-08 02:07:40 +0000955 /* Empty loop body */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000956 }
957 if (*ptr==finalchar)
958 return ptr-start;
959 if (*ptr==0)
960 {
961 *errorptr = "unterminated (?P identifier";
962 return 0;
963 }
964 *errorptr = "illegal character in (?P identifier";
965 return 0;
966}
967
968/*************************************************
969* Handle escapes *
970*************************************************/
971
972/* This function is called when a \ has been encountered. It either returns a
973positive value for a simple escape such as \n, or a negative value which
974encodes one of the more complicated things such as \d. On entry, ptr is
975pointing at the \. On exit, it is on the final character of the escape
976sequence.
977
978Arguments:
979 ptrptr points to the pattern position pointer
980 errorptr points to the pointer to the error message
Guido van Rossum50700601997-12-08 17:15:20 +0000981 bracount number of previous extracting brackets
982 options the options bits
983 isclass TRUE if inside a character class
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000984
985Returns: zero or positive => a data character
986 negative => a special escape sequence
987 on error, errorptr is set
988*/
989
990static int
Guido van Rossum58132c61997-12-17 00:24:13 +0000991check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
992 int options, BOOL isclass)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000993{
Guido van Rossum58132c61997-12-17 00:24:13 +0000994const uschar *ptr = *ptrptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000995int c = *(++ptr) & 255; /* Ensure > 0 on signed-char systems */
996int i;
997
Guido van Rossum50700601997-12-08 17:15:20 +0000998if (c == 0) *errorptr = ERR1;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000999
1000/* Digits or letters may have special meaning; all others are literals. */
1001
1002else if (c < '0' || c > 'z') {}
1003
1004/* Do an initial lookup in a table. A non-zero result is something that can be
1005returned immediately. Otherwise further processing may be required. */
1006
1007else if ((i = escapes[c - '0']) != 0) c = i;
1008
1009/* Escapes that need further processing, or are illegal. */
1010
Guido van Rossum50700601997-12-08 17:15:20 +00001011else
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001012 {
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001013
Guido van Rossum50700601997-12-08 17:15:20 +00001014 switch (c)
1015 {
1016 /* The handling of escape sequences consisting of a string of digits
1017 starting with one that is not zero is not straightforward. By experiment,
1018 the way Perl works seems to be as follows:
1019
1020 Outside a character class, the digits are read as a decimal number. If the
1021 number is less than 10, or if there are that many previous extracting
1022 left brackets, then it is a back reference. Otherwise, up to three octal
1023 digits are read to form an escaped byte. Thus \123 is likely to be octal
1024 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
1025 value is greater than 377, the least significant 8 bits are taken. Inside a
1026 character class, \ followed by a digit is always an octal number. */
1027
1028 case '1': case '2': case '3': case '4': case '5':
1029 case '6': case '7': case '8': case '9':
1030
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001031 {
1032 /* PYTHON: Try to compute an octal value for a character */
Guido van Rossum042ff9e1998-04-03 21:13:31 +00001033 for(c=0, i=0; ptr[i]!=0 && i<3; i++)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001034 {
1035 if (( pcre_ctypes[ ptr[i] ] & ctype_odigit) != 0)
1036 c = c * 8 + ptr[i]-'0';
1037 else
Guido van Rossum042ff9e1998-04-03 21:13:31 +00001038 break; /* Non-octal character--break out of the loop */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001039 }
Guido van Rossum042ff9e1998-04-03 21:13:31 +00001040 /* It's a character if there were exactly 3 octal digits, or if
1041 we're inside a character class and there was at least one
1042 octal digit. */
1043 if ( (i == 3) || (isclass && i!=0) )
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001044 {
1045 ptr += i-1;
1046 break;
1047 }
1048 c = ptr[0]; /* Restore the first character after the \ */
1049 c -= '0'; i = 1;
1050 while (i<2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0)
1051 {
1052 c = c * 10 + ptr[1] - '0';
1053 ptr++; i++;
1054 }
1055 if (c > 255 - ESC_REF) *errorptr = "back reference too big";
1056 c = -(ESC_REF + c);
1057 }
1058 break;
1059
Guido van Rossum50700601997-12-08 17:15:20 +00001060 /* \0 always starts an octal number, but we may drop through to here with a
1061 larger first octal digit */
1062
1063 case '0':
1064 c -= '0';
1065 while(i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0 &&
1066 ptr[1] != '8' && ptr[1] != '9')
1067 c = c * 8 + *(++ptr) - '0';
1068 break;
1069
1070 /* Special escapes not starting with a digit are straightforward */
1071
1072 case 'x':
1073 c = 0;
1074 while ( (pcre_ctypes[ptr[1]] & ctype_xdigit) != 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001075 {
Guido van Rossum50700601997-12-08 17:15:20 +00001076 ptr++;
1077 c = c * 16 + pcre_lcc[*ptr] -
1078 (((pcre_ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
1079 c &= 255;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001080 }
1081 break;
1082
1083
Guido van Rossum50700601997-12-08 17:15:20 +00001084 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1085 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
1086 for Perl compatibility, it is a literal. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001087
Guido van Rossum50700601997-12-08 17:15:20 +00001088 default:
1089 if ((options & PCRE_EXTRA) != 0) switch(c)
1090 {
1091 case 'X':
1092 c = -ESC_X; /* This could be a lookup if it ever got into Perl */
1093 break;
1094
1095 default:
1096 *errorptr = ERR3;
1097 break;
1098 }
1099 break;
1100 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001101 }
1102
1103*ptrptr = ptr;
1104return c;
1105}
1106
1107
1108
1109/*************************************************
Guido van Rossum50700601997-12-08 17:15:20 +00001110* Check for counted repeat *
1111*************************************************/
1112
1113/* This function is called when a '{' is encountered in a place where it might
1114start a quantifier. It looks ahead to see if it really is a quantifier or not.
1115It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
1116where the ddds are digits.
1117
1118Arguments:
1119 p pointer to the first char after '{'
1120
1121Returns: TRUE or FALSE
1122*/
1123
1124static BOOL
Guido van Rossum58132c61997-12-17 00:24:13 +00001125is_counted_repeat(const uschar *p)
Guido van Rossum50700601997-12-08 17:15:20 +00001126{
1127if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
1128while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
1129if (*p == '}') return TRUE;
1130
1131if (*p++ != ',') return FALSE;
1132if (*p == '}') return TRUE;
1133
1134if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
1135while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
1136return (*p == '}');
1137}
1138
1139
1140
1141/*************************************************
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001142* Read repeat counts *
1143*************************************************/
1144
Guido van Rossum50700601997-12-08 17:15:20 +00001145/* Read an item of the form {n,m} and return the values. This is called only
1146after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1147so the syntax is guaranteed to be correct, but we need to check the values.
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001148
1149Arguments:
1150 p pointer to first char after '{'
1151 minp pointer to int for min
1152 maxp pointer to int for max
1153 returned as -1 if no max
1154 errorptr points to pointer to error message
1155
1156Returns: pointer to '}' on success;
1157 current ptr on error, with errorptr set
1158*/
1159
Guido van Rossum58132c61997-12-17 00:24:13 +00001160static const uschar *
1161read_repeat_counts(const uschar *p, int *minp, int *maxp, const char **errorptr)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001162{
1163int min = 0;
1164int max = -1;
1165
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001166while ((pcre_ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
1167
1168if (*p == '}') max = min; else
1169 {
Guido van Rossum50700601997-12-08 17:15:20 +00001170 if (*(++p) != '}')
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001171 {
1172 max = 0;
1173 while((pcre_ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001174 if (max < min)
1175 {
Guido van Rossum50700601997-12-08 17:15:20 +00001176 *errorptr = ERR4;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001177 return p;
1178 }
1179 }
1180 }
1181
1182/* Do paranoid checks, then fill in the required variables, and pass back the
1183pointer to the terminating '}'. */
1184
Guido van Rossum50700601997-12-08 17:15:20 +00001185if (min > 65535 || max > 65535)
1186 *errorptr = ERR5;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001187else
1188 {
1189 *minp = min;
1190 *maxp = max;
1191 }
1192return p;
1193}
1194
1195
1196
1197/*************************************************
1198* Compile one branch *
1199*************************************************/
1200
1201/* Scan the pattern, compiling it into the code vector.
1202
1203Arguments:
Guido van Rossum50700601997-12-08 17:15:20 +00001204 options the option bits
1205 bracket points to number of brackets used
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001206 code points to the pointer to the current code point
1207 ptrptr points to the current pattern pointer
1208 errorptr points to pointer to error message
1209
1210Returns: TRUE on success
1211 FALSE, with *errorptr set on error
1212*/
1213
1214static BOOL
Guido van Rossum50700601997-12-08 17:15:20 +00001215compile_branch(int options, int *brackets, uschar **codeptr,
Guido van Rossum58132c61997-12-17 00:24:13 +00001216 const uschar **ptrptr, const char **errorptr, PyObject *dictionary)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001217{
1218int repeat_type, op_type;
1219int repeat_min, repeat_max;
1220int bravalue, length;
Guido van Rossumdda66961998-05-07 15:32:44 +00001221int greedy_default, greedy_non_default;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001222register int c;
1223register uschar *code = *codeptr;
Guido van Rossum58132c61997-12-17 00:24:13 +00001224const uschar *ptr = *ptrptr;
1225const uschar *oldptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001226uschar *previous = NULL;
Guido van Rossum50700601997-12-08 17:15:20 +00001227uschar class[32];
1228uschar *class_flag; /* Pointer to the single-byte flag for OP_CLASS_L */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001229
Guido van Rossumdda66961998-05-07 15:32:44 +00001230/* Set up the default and non-default settings for greediness */
1231
1232greedy_default = ((options & PCRE_UNGREEDY) != 0);
1233greedy_non_default = greedy_default ^ 1;
1234
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001235/* Switch on next character until the end of the branch */
1236
1237for (;; ptr++)
1238 {
Guido van Rossum50700601997-12-08 17:15:20 +00001239 BOOL negate_class;
1240 int class_charcount;
1241 int class_lastchar;
1242
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001243 c = *ptr;
Guido van Rossum50700601997-12-08 17:15:20 +00001244 if ((options & PCRE_EXTENDED) != 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001245 {
1246 if ((pcre_ctypes[c] & ctype_space) != 0) continue;
1247 if (c == '#')
1248 {
1249 while ((c = *(++ptr)) != 0 && c != '\n');
1250 continue;
1251 }
1252 }
1253
1254 switch(c)
1255 {
1256 /* The branch terminates at end of string, |, or ). */
1257
1258 case 0:
1259 case '|':
1260 case ')':
1261 *codeptr = code;
1262 *ptrptr = ptr;
1263 return TRUE;
1264
1265 /* Handle single-character metacharacters */
1266
1267 case '^':
1268 previous = NULL;
1269 *code++ = OP_CIRC;
1270 break;
1271
1272 case '$':
1273 previous = NULL;
1274 *code++ = OP_DOLL;
1275 break;
1276
1277 case '.':
1278 previous = code;
1279 *code++ = OP_ANY;
1280 break;
1281
Guido van Rossum50700601997-12-08 17:15:20 +00001282 /* Character classes. These always build a 32-byte bitmap of the permitted
1283 characters, except in the special case where there is only one character.
1284 For negated classes, we build the map as usual, then invert it at the end.
1285 */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001286
1287 case '[':
Guido van Rossum50700601997-12-08 17:15:20 +00001288 previous = code;
1289 if (options & PCRE_LOCALE)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001290 {
Guido van Rossum50700601997-12-08 17:15:20 +00001291 *code++ = OP_CLASS_L;
1292 /* Set the flag for localized classes (like \w) to 0 */
1293 class_flag = code;
1294 *class_flag = 0;
1295 }
1296 else
1297 {
1298 *code++ = OP_CLASS;
1299 class_flag = NULL;
1300 }
1301
Guido van Rossum042ff9e1998-04-03 21:13:31 +00001302 /* If the first character is '^', set the negation flag, and use a
1303 different opcode. This only matters if caseless matching is specified at
1304 runtime. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001305
Guido van Rossum50700601997-12-08 17:15:20 +00001306 if ((c = *(++ptr)) == '^')
1307 {
1308 negate_class = TRUE;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00001309 if (*(code-1)==OP_CLASS) *(code-1) = OP_NEGCLASS;
Guido van Rossum50700601997-12-08 17:15:20 +00001310 c = *(++ptr);
1311 }
1312 else negate_class = FALSE;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001313
Guido van Rossum50700601997-12-08 17:15:20 +00001314 /* Keep a count of chars so that we can optimize the case of just a single
1315 character. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001316
Guido van Rossum50700601997-12-08 17:15:20 +00001317 class_charcount = 0;
1318 class_lastchar = -1;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001319
Guido van Rossum50700601997-12-08 17:15:20 +00001320 /* Initialize the 32-char bit map to all zeros. We have to build the
1321 map in a temporary bit of store, in case the class contains only 1
1322 character, because in that case the compiled code doesn't use the
1323 bit map. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001324
Guido van Rossum50700601997-12-08 17:15:20 +00001325 memset(class, 0, 32 * sizeof(uschar));
1326
1327 /* Process characters until ] is reached. By writing this as a "do" it
1328 means that an initial ] is taken as a data character. */
1329
1330 do
1331 {
1332 if (c == 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001333 {
Guido van Rossum50700601997-12-08 17:15:20 +00001334 *errorptr = ERR6;
1335 goto FAILED;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001336 }
1337
Guido van Rossum50700601997-12-08 17:15:20 +00001338 /* Backslash may introduce a single character, or it may introduce one
1339 of the specials, which just set a flag. Escaped items are checked for
1340 validity in the pre-compiling pass. The sequence \b is a special case.
Guido van Rossum58132c61997-12-17 00:24:13 +00001341 Inside a class (and only there) it is treated as backspace. Elsewhere
Guido van Rossum50700601997-12-08 17:15:20 +00001342 it marks a word boundary. Other escapes have preset maps ready to
1343 or into the one we are building. We assume they have more than one
1344 character in them, so set class_count bigger than one. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001345
Guido van Rossum50700601997-12-08 17:15:20 +00001346 if (c == '\\')
1347 {
1348 c = check_escape(&ptr, errorptr, *brackets, options, TRUE);
1349 if (-c == ESC_b) c = '\b';
1350 else if (c < 0)
1351 {
1352 class_charcount = 10;
1353 switch (-c)
1354 {
1355 case ESC_d:
Guido van Rossum50700601997-12-08 17:15:20 +00001356 {
1357 for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_digit];
1358 }
1359 continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001360
Guido van Rossum50700601997-12-08 17:15:20 +00001361 case ESC_D:
Guido van Rossum50700601997-12-08 17:15:20 +00001362 {
1363 for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_digit];
1364 }
1365 continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001366
Guido van Rossum50700601997-12-08 17:15:20 +00001367 case ESC_w:
1368 if (options & PCRE_LOCALE)
1369 {
1370 *class_flag |= 1;
1371 }
1372 else
1373 {
1374 for (c = 0; c < 32; c++)
1375 class[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
1376 }
1377 continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001378
Guido van Rossum50700601997-12-08 17:15:20 +00001379 case ESC_W:
1380 if (options & PCRE_LOCALE)
1381 {
1382 *class_flag |= 2;
1383 }
1384 else
1385 {
1386 for (c = 0; c < 32; c++)
1387 class[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
1388 }
1389 continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001390
Guido van Rossum50700601997-12-08 17:15:20 +00001391 case ESC_s:
Guido van Rossum50700601997-12-08 17:15:20 +00001392 {
1393 for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_space];
1394 }
1395 continue;
1396
1397 case ESC_S:
Guido van Rossum50700601997-12-08 17:15:20 +00001398 {
1399 for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_space];
1400 }
1401 continue;
1402
1403 default:
1404 *errorptr = ERR7;
1405 goto FAILED;
1406 }
1407 }
1408 /* Fall through if single character */
1409 }
1410
1411 /* A single character may be followed by '-' to form a range. However,
1412 Perl does not permit ']' to be the end of the range. A '-' character
1413 here is treated as a literal. */
1414
1415 if (ptr[1] == '-' && ptr[2] != ']')
1416 {
1417 int d;
1418 ptr += 2;
1419 d = *ptr;
1420
1421 if (d == 0)
1422 {
1423 *errorptr = ERR6;
1424 goto FAILED;
1425 }
1426
1427 /* The second part of a range can be a single-character escape, but
1428 not any of the other escapes. */
1429
1430 if (d == '\\')
1431 {
1432 d = check_escape(&ptr, errorptr, *brackets, options, TRUE);
1433 if (d < 0)
1434 {
1435 if (d == -ESC_b) d = '\b'; else
1436 {
1437 *errorptr = ERR7;
1438 goto FAILED;
1439 }
1440 }
1441 }
1442
1443 if (d < c)
1444 {
1445 *errorptr = ERR8;
1446 goto FAILED;
1447 }
1448
1449 for (; c <= d; c++)
1450 {
1451 class[c/8] |= (1 << (c&7));
1452 if ((options & PCRE_CASELESS) != 0)
1453 {
1454 int uc = pcre_fcc[c]; /* flip case */
1455 class[uc/8] |= (1 << (uc&7));
1456 }
1457 class_charcount++; /* in case a one-char range */
1458 class_lastchar = c;
1459 }
1460 continue; /* Go get the next char in the class */
1461 }
1462
1463 /* Handle a lone single character - we can get here for a normal
1464 non-escape char, or after \ that introduces a single character. */
1465
1466 class [c/8] |= (1 << (c&7));
1467 if ((options & PCRE_CASELESS) != 0)
1468 {
1469 c = pcre_fcc[c]; /* flip case */
1470 class[c/8] |= (1 << (c&7));
1471 }
1472 class_charcount++;
1473 class_lastchar = c;
1474 }
1475
1476 /* Loop until ']' reached; the check for end of string happens inside the
1477 loop. This "while" is the end of the "do" above. */
1478
1479 while ((c = *(++ptr)) != ']');
1480
1481 /* If class_charcount is 1 and class_lastchar is not negative, we saw
1482 precisely one character. This doesn't need the whole 32-byte bit map.
1483 We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
1484 it's negative. */
1485
1486 if (class_charcount == 1 && class_lastchar >= 0)
1487 {
1488 if (negate_class)
1489 {
1490 code[-1] = OP_NOT;
1491 }
1492 else
1493 {
1494 code[-1] = OP_CHARS;
1495 *code++ = 1;
1496 }
1497 *code++ = class_lastchar;
1498 }
1499
1500 /* Otherwise, negate the 32-byte map if necessary, and copy it into
1501 the code vector. */
1502
1503 else
1504 {
1505 /* If this is a localized opcode, bump the code pointer up */
1506 if (class_flag) code++;
1507 if (negate_class)
1508 {
1509 if (class_flag) *class_flag = (*class_flag) ^ 63;
1510 for (c = 0; c < 32; c++) code[c] = ~class[c];
1511 }
1512 else
1513 memcpy(code, class, 32);
1514 code += 32;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001515 }
1516 break;
1517
1518 /* Various kinds of repeat */
1519
1520 case '{':
Guido van Rossum50700601997-12-08 17:15:20 +00001521 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001522 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
1523 if (*errorptr != NULL) goto FAILED;
1524 goto REPEAT;
1525
1526 case '*':
1527 repeat_min = 0;
1528 repeat_max = -1;
1529 goto REPEAT;
1530
1531 case '+':
1532 repeat_min = 1;
1533 repeat_max = -1;
1534 goto REPEAT;
1535
1536 case '?':
1537 repeat_min = 0;
1538 repeat_max = 1;
1539
1540 REPEAT:
1541 if (previous == NULL)
1542 {
Guido van Rossum50700601997-12-08 17:15:20 +00001543 *errorptr = ERR9;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001544 goto FAILED;
1545 }
1546
Guido van Rossumdda66961998-05-07 15:32:44 +00001547 /* If the next character is '?' this is a minimizing repeat, by default,
1548 but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001549 next character. */
1550
Guido van Rossumdda66961998-05-07 15:32:44 +00001551 if (ptr[1] == '?')
1552 { repeat_type = greedy_non_default; ptr++; }
1553 else repeat_type = greedy_default;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001554
Guido van Rossum50700601997-12-08 17:15:20 +00001555 /* If the maximum is zero then the minimum must also be zero; Perl allows
1556 this case, so we do too - by simply omitting the item altogether. */
1557
1558 if (repeat_max == 0) code = previous;
1559
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001560 /* If previous was a string of characters, chop off the last one and use it
1561 as the subject of the repeat. If there was only one character, we can
1562 abolish the previous item altogether. */
1563
Guido van Rossum50700601997-12-08 17:15:20 +00001564 else if (*previous == OP_CHARS)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001565 {
1566 int len = previous[1];
1567 if (len == 1)
1568 {
1569 c = previous[2];
1570 code = previous;
1571 }
1572 else
1573 {
1574 c = previous[len+1];
1575 previous[1]--;
1576 code--;
1577 }
1578 op_type = 0; /* Use single-char op codes */
1579 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
1580 }
1581
Guido van Rossum50700601997-12-08 17:15:20 +00001582 /* If previous was a single negated character ([^a] or similar), we use
1583 one of the special opcodes, replacing it. The code is shared with single-
1584 character repeats by adding a suitable offset into repeat_type. */
1585
1586 else if ((int)*previous == OP_NOT)
1587 {
1588 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
1589 c = previous[1];
1590 code = previous;
1591 goto OUTPUT_SINGLE_REPEAT;
1592 }
1593
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001594 /* If previous was a character type match (\d or similar), abolish it and
1595 create a suitable repeat item. The code is shared with single-character
1596 repeats by adding a suitable offset into repeat_type. */
1597
Guido van Rossum50700601997-12-08 17:15:20 +00001598 else if ((int)*previous < OP_CIRC || *previous == OP_ANY)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001599 {
1600 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
1601 c = *previous;
1602 code = previous;
1603
1604 OUTPUT_SINGLE_REPEAT:
1605 repeat_type += op_type; /* Combine both values for many cases */
1606
1607 /* A minimum of zero is handled either as the special case * or ?, or as
1608 an UPTO, with the maximum given. */
1609
1610 if (repeat_min == 0)
1611 {
1612 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
1613 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
1614 else
1615 {
1616 *code++ = OP_UPTO + repeat_type;
1617 *code++ = repeat_max >> 8;
1618 *code++ = (repeat_max & 255);
1619 }
1620 }
1621
1622 /* The case {1,} is handled as the special case + */
1623
1624 else if (repeat_min == 1 && repeat_max == -1)
1625 *code++ = OP_PLUS + repeat_type;
1626
1627 /* The case {n,n} is just an EXACT, while the general case {n,m} is
1628 handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
1629
1630 else
1631 {
1632 if (repeat_min != 1)
1633 {
1634 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
1635 *code++ = repeat_min >> 8;
1636 *code++ = (repeat_min & 255);
1637 }
1638
1639 /* If the mininum is 1 and the previous item was a character string,
1640 we either have to put back the item that got cancelled if the string
1641 length was 1, or add the character back onto the end of a longer
Guido van Rossumdda66961998-05-07 15:32:44 +00001642 string. For a character type nothing need be done; it will just get
1643 put back naturally. Note that the final character is always going to
1644 get added below. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001645
1646 else if (*previous == OP_CHARS)
1647 {
1648 if (code == previous) code += 2; else previous[1]++;
1649 }
1650
Guido van Rossumdda66961998-05-07 15:32:44 +00001651 /* For a single negated character we also have to put back the
1652 item that got cancelled. */
1653
1654 else if (*previous == OP_NOT) code++;
1655
Guido van Rossum557dea11997-12-22 22:46:52 +00001656 /* If the maximum is unlimited, insert an OP_STAR. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001657
Guido van Rossum557dea11997-12-22 22:46:52 +00001658 if (repeat_max < 0)
1659 {
1660 *code++ = c;
1661 *code++ = OP_STAR + repeat_type;
1662 }
1663
1664 /* Else insert an UPTO if the max is greater than the min. */
1665
1666 else if (repeat_max != repeat_min)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001667 {
1668 *code++ = c;
1669 repeat_max -= repeat_min;
1670 *code++ = OP_UPTO + repeat_type;
1671 *code++ = repeat_max >> 8;
1672 *code++ = (repeat_max & 255);
1673 }
1674 }
1675
1676 /* The character or character type itself comes last in all cases. */
1677
1678 *code++ = c;
1679 }
1680
1681 /* If previous was a character class or a back reference, we put the repeat
1682 stuff after it. */
1683
Guido van Rossum042ff9e1998-04-03 21:13:31 +00001684 else if (*previous == OP_CLASS || *previous == OP_NEGCLASS ||
1685 *previous==OP_CLASS_L || *previous == OP_REF)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001686 {
1687 if (repeat_min == 0 && repeat_max == -1)
1688 *code++ = OP_CRSTAR + repeat_type;
1689 else if (repeat_min == 1 && repeat_max == -1)
1690 *code++ = OP_CRPLUS + repeat_type;
1691 else if (repeat_min == 0 && repeat_max == 1)
1692 *code++ = OP_CRQUERY + repeat_type;
1693 else
1694 {
1695 *code++ = OP_CRRANGE + repeat_type;
1696 *code++ = repeat_min >> 8;
1697 *code++ = repeat_min & 255;
1698 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
1699 *code++ = repeat_max >> 8;
1700 *code++ = repeat_max & 255;
1701 }
1702 }
1703
1704 /* If previous was a bracket group, we may have to replicate it in certain
1705 cases. If the maximum repeat count is unlimited, check that the bracket
1706 group cannot match the empty string, and diagnose an error if it can. */
1707
1708 else if ((int)*previous >= OP_BRA)
1709 {
1710 int i;
Guido van Rossum557dea11997-12-22 22:46:52 +00001711 int len = code - previous;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001712
1713 if (repeat_max == -1 && could_be_empty(previous))
1714 {
Guido van Rossum50700601997-12-08 17:15:20 +00001715 *errorptr = ERR10;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001716 goto FAILED;
1717 }
1718
1719 /* If the minimum is greater than zero, and the maximum is unlimited or
1720 equal to the minimum, the first copy remains where it is, and is
1721 replicated up to the minimum number of times. This case includes the +
1722 repeat, but of course no replication is needed in that case. */
1723
1724 if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))
1725 {
1726 for (i = 1; i < repeat_min; i++)
1727 {
Guido van Rossum557dea11997-12-22 22:46:52 +00001728 memcpy(code, previous, len);
1729 code += len;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001730 }
1731 }
1732
1733 /* If the minimum is zero, stick BRAZERO in front of the first copy.
1734 Then, if there is a fixed upper limit, replicated up to that many times,
1735 sticking BRAZERO in front of all the optional ones. */
1736
1737 else
1738 {
1739 if (repeat_min == 0)
1740 {
Guido van Rossum557dea11997-12-22 22:46:52 +00001741 memmove(previous+1, previous, len);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001742 code++;
1743 *previous++ = OP_BRAZERO + repeat_type;
1744 }
1745
1746 for (i = 1; i < repeat_min; i++)
1747 {
Guido van Rossum557dea11997-12-22 22:46:52 +00001748 memcpy(code, previous, len);
1749 code += len;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001750 }
1751
1752 for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)
1753 {
1754 *code++ = OP_BRAZERO + repeat_type;
Guido van Rossum557dea11997-12-22 22:46:52 +00001755 memcpy(code, previous, len);
1756 code += len;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001757 }
1758 }
1759
1760 /* If the maximum is unlimited, set a repeater in the final copy. */
1761
1762 if (repeat_max == -1) code[-3] = OP_KETRMAX + repeat_type;
1763 }
1764
1765 /* Else there's some kind of shambles */
1766
1767 else
1768 {
Guido van Rossum50700601997-12-08 17:15:20 +00001769 *errorptr = ERR11;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001770 goto FAILED;
1771 }
1772
1773 /* In all case we no longer have a previous item. */
1774
1775 previous = NULL;
1776 break;
1777
1778
1779 /* Start of nested bracket sub-expression, or comment or lookahead.
1780 First deal with special things that can come after a bracket; all are
1781 introduced by ?, and the appearance of any of them means that this is not a
1782 referencing group. They were checked for validity in the first pass over
1783 the string, so we don't have to check for syntax errors here. */
1784
1785 case '(':
1786 previous = code; /* Only real brackets can be repeated */
1787 if (*(++ptr) == '?')
1788 {
1789 bravalue = OP_BRA;
1790
1791 switch (*(++ptr))
1792 {
1793 case '#':
1794 case 'i':
Guido van Rossumbd49ac41997-12-10 23:05:53 +00001795 case 'L':
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001796 case 'm':
1797 case 's':
1798 case 'x':
1799 ptr++;
1800 while (*ptr != ')') ptr++;
1801 previous = NULL;
1802 continue;
1803
1804 case ':': /* Non-extracting bracket */
1805 ptr++;
1806 break;
1807
1808 case '=': /* Assertions can't be repeated */
1809 bravalue = OP_ASSERT;
1810 ptr++;
1811 previous = NULL;
1812 break;
1813
1814 case '!':
1815 bravalue = OP_ASSERT_NOT;
1816 ptr++;
1817 previous = NULL;
1818 break;
1819
1820 case ('P'):
1821 ptr++;
1822 if (*ptr=='<')
1823 {
1824 /* (?P<groupname>...) */
1825 int idlen;
1826 PyObject *string, *intobj;
1827
1828 ptr++;
1829 idlen = get_group_id(ptr, '>', errorptr);
1830 if (*errorptr) {
1831 goto FAILED;
1832 }
Guido van Rossum57ba4f31997-12-02 20:40:28 +00001833 string = PyString_FromStringAndSize((char*)ptr, idlen);
Guido van Rossum50700601997-12-08 17:15:20 +00001834 intobj = PyInt_FromLong( brackets[0] + 1 );
Guido van Rossum58132c61997-12-17 00:24:13 +00001835 if (intobj == NULL || string == NULL)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001836 {
1837 Py_XDECREF(string);
1838 Py_XDECREF(intobj);
1839 *errorptr = "exception raised";
1840 goto FAILED;
1841 }
1842 PyDict_SetItem(dictionary, string, intobj);
Guido van Rossum58132c61997-12-17 00:24:13 +00001843 Py_DECREF(string); Py_DECREF(intobj); /* XXX DECREF commented out! */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001844 ptr += idlen+1; /* Point to rest of expression */
1845 goto do_grouping_bracket;
1846 }
1847 if (*ptr=='=')
1848 {
1849 /* (?P=groupname) */
1850 int idlen, refnum;
1851 PyObject *string, *intobj;
1852
1853 ptr++;
1854 idlen = get_group_id(ptr, ')', errorptr);
1855 if (*errorptr) {
1856 goto FAILED;
1857 }
Guido van Rossum50700601997-12-08 17:15:20 +00001858 string = PyString_FromStringAndSize((char *)ptr, idlen);
Guido van Rossumc3861071997-10-08 02:07:40 +00001859 if (string==NULL) {
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001860 *errorptr = "exception raised";
1861 goto FAILED;
1862 }
1863 intobj = PyDict_GetItem(dictionary, string);
1864 if (intobj==NULL) {
Guido van Rossumc3861071997-10-08 02:07:40 +00001865 Py_DECREF(string);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001866 *errorptr = "?P= group identifier isn't defined";
1867 goto FAILED;
1868 }
1869
1870 refnum = PyInt_AsLong(intobj);
Guido van Rossum1eadb411997-12-15 17:33:24 +00001871 Py_DECREF(string);
Guido van Rossum58132c61997-12-17 00:24:13 +00001872 /* The caller doesn't own the reference to the value
1873 returned from PyDict_GetItem, so intobj is not
1874 DECREF'ed. */
1875
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001876 *code++ = OP_REF;
1877 *code++ = refnum;
1878 /* The continue will cause the top-level for() loop to
1879 be resumed, so ptr will be immediately incremented.
1880 Therefore, the following line adds just idlen, not
1881 idlen+1 */
1882 ptr += idlen;
1883 continue;
1884 }
1885 /* The character after ?P is neither < nor =, so
1886 report an error. Add more Python-extensions here. */
1887 *errorptr="unknown after (?P";
1888 goto FAILED;
Guido van Rossum50700601997-12-08 17:15:20 +00001889
1890 case '>': /* "Match once" brackets */
1891 if ((options & PCRE_EXTRA) != 0) /* Not yet standard */
1892 {
1893 bravalue = OP_ONCE;
1894 ptr++;
1895 previous = NULL;
1896 break;
1897 }
1898 /* Else fall through */
1899
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001900 default:
Guido van Rossum50700601997-12-08 17:15:20 +00001901 *errorptr = ERR12;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001902 goto FAILED;
1903 }
1904 }
1905
1906 /* Else we have a referencing group */
1907
1908 else
1909 {
1910 do_grouping_bracket:
Guido van Rossum50700601997-12-08 17:15:20 +00001911 if (++(*brackets) > EXTRACT_MAX)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001912 {
Guido van Rossum50700601997-12-08 17:15:20 +00001913 *errorptr = ERR13;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001914 goto FAILED;
1915 }
Guido van Rossum50700601997-12-08 17:15:20 +00001916 bravalue = OP_BRA + *brackets;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001917 }
1918
1919 /* Process nested bracketed re; at end pointer is on the bracket. We copy
1920 code into a non-register variable in order to be able to pass its address
1921 because some compilers complain otherwise. */
1922
1923 *code = bravalue;
1924 {
1925 uschar *mcode = code;
Guido van Rossum50700601997-12-08 17:15:20 +00001926 if (!compile_regex(options, brackets, &mcode, &ptr, errorptr, dictionary))
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001927 goto FAILED;
1928 code = mcode;
1929 }
1930
1931 if (*ptr != ')')
1932 {
Guido van Rossum50700601997-12-08 17:15:20 +00001933 *errorptr = ERR14;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001934 goto FAILED;
1935 }
1936 break;
1937
1938 /* Check \ for being a real metacharacter; if not, fall through and handle
1939 it as a data character at the start of a string. Escape items are checked
1940 for validity in the pre-compiling pass. */
1941
1942 case '\\':
1943 oldptr = ptr;
Guido van Rossum50700601997-12-08 17:15:20 +00001944 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001945
1946 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1947 are arranged to be the negation of the corresponding OP_values. For the
1948 back references, the values are ESC_REF plus the reference number. Only
1949 back references and those types that consume a character may be repeated.
1950 We can test for values between ESC_b and ESC_Z for the latter; this may
1951 have to change if any new ones are ever created. */
1952
1953 if (c < 0)
1954 {
1955 if (-c >= ESC_REF)
1956 {
Guido van Rossum50700601997-12-08 17:15:20 +00001957 int refnum = -c - ESC_REF;
1958 if (*brackets < refnum)
1959 {
1960 *errorptr = ERR15;
1961 goto FAILED;
1962 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001963 previous = code;
1964 *code++ = OP_REF;
1965 *code++ = refnum;
1966 }
1967 else
1968 {
Guido van Rossum50700601997-12-08 17:15:20 +00001969 previous = (-c > ESC_b && -c < ESC_X)? code : NULL;
1970 if ( (options & PCRE_LOCALE) != 0)
1971 {
1972 switch (c)
1973 {
1974 case (-ESC_b): c = -OP_WORD_BOUNDARY_L; break;
1975 case (-ESC_B): c = -OP_NOT_WORD_BOUNDARY_L; break;
1976 case (-ESC_w): c = -OP_WORDCHAR_L; break;
1977 case (-ESC_W): c = -OP_NOT_WORDCHAR_L; break;
1978 }
1979 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001980 *code++ = -c;
1981 }
1982 continue;
1983 }
1984
Guido van Rossum58132c61997-12-17 00:24:13 +00001985 /* Data character: Reset and fall through */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001986
1987 ptr = oldptr;
1988 c = '\\';
1989
1990 /* Handle a run of data characters until a metacharacter is encountered.
1991 The first character is guaranteed not to be whitespace or # when the
1992 extended flag is set. */
1993
Guido van Rossum50700601997-12-08 17:15:20 +00001994 NORMAL_CHAR:
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001995 default:
1996 previous = code;
1997 *code = OP_CHARS;
1998 code += 2;
1999 length = 0;
2000
2001 do
2002 {
Guido van Rossum50700601997-12-08 17:15:20 +00002003 if ((options & PCRE_EXTENDED) != 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002004 {
2005 if ((pcre_ctypes[c] & ctype_space) != 0) continue;
2006 if (c == '#')
2007 {
2008 while ((c = *(++ptr)) != 0 && c != '\n');
2009 if (c == 0) break;
2010 continue;
2011 }
2012 }
2013
2014 /* Backslash may introduce a data char or a metacharacter. Escaped items
2015 are checked for validity in the pre-compiling pass. Stop the string
2016 before a metaitem. */
2017
2018 if (c == '\\')
2019 {
2020 oldptr = ptr;
Guido van Rossum50700601997-12-08 17:15:20 +00002021 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002022 if (c < 0) { ptr = oldptr; break; }
2023 }
2024
2025 /* Ordinary character or single-char escape */
2026
2027 *code++ = c;
2028 length++;
2029 }
2030
2031 /* This "while" is the end of the "do" above. */
2032
2033 while (length < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
2034
2035 /* Compute the length and set it in the data vector, and advance to
2036 the next state. */
2037
2038 previous[1] = length;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00002039 if (length < 255) ptr--;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002040 break;
2041 }
2042 } /* end of big loop */
2043
2044/* Control never reaches here by falling through, only by a goto for all the
2045error states. Pass back the position in the pattern so that it can be displayed
2046to the user for diagnosing the error. */
2047
2048FAILED:
2049*ptrptr = ptr;
2050return FALSE;
2051}
2052
2053
2054
2055
2056/*************************************************
2057* Compile sequence of alternatives *
2058*************************************************/
2059
2060/* On entry, ptr is pointing past the bracket character, but on return
2061it points to the closing bracket, or vertical bar, or end of string.
2062The code variable is pointing at the byte into which the BRA operator has been
2063stored.
2064
2065Argument:
Guido van Rossum50700601997-12-08 17:15:20 +00002066 options the option bits
2067 brackets -> int containing the number of extracting brackets used
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002068 codeptr -> the address of the current code pointer
2069 ptrptr -> the address of the current pattern pointer
2070 errorptr -> pointer to error message
2071
2072Returns: TRUE on success
2073*/
2074
2075static BOOL
Guido van Rossum50700601997-12-08 17:15:20 +00002076compile_regex(int options, int *brackets, uschar **codeptr,
Guido van Rossum58132c61997-12-17 00:24:13 +00002077 const uschar **ptrptr, const char **errorptr, PyObject *dictionary)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002078{
Guido van Rossum58132c61997-12-17 00:24:13 +00002079const uschar *ptr = *ptrptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002080uschar *code = *codeptr;
2081uschar *start_bracket = code;
2082
2083for (;;)
2084 {
2085 int length;
2086 uschar *last_branch = code;
2087
2088 code += 3;
Guido van Rossum50700601997-12-08 17:15:20 +00002089 if (!compile_branch(options, brackets, &code, &ptr, errorptr, dictionary))
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002090 {
2091 *ptrptr = ptr;
2092 return FALSE;
2093 }
2094
2095 /* Fill in the length of the last branch */
2096
2097 length = code - last_branch;
2098 last_branch[1] = length >> 8;
2099 last_branch[2] = length & 255;
2100
2101 /* Reached end of expression, either ')' or end of pattern. Insert a
2102 terminating ket and the length of the whole bracketed item, and return,
2103 leaving the pointer at the terminating char. */
2104
2105 if (*ptr != '|')
2106 {
2107 length = code - start_bracket;
2108 *code++ = OP_KET;
2109 *code++ = length >> 8;
2110 *code++ = length & 255;
2111 *codeptr = code;
2112 *ptrptr = ptr;
2113 return TRUE;
2114 }
2115
2116 /* Another branch follows; insert an "or" node and advance the pointer. */
2117
2118 *code = OP_ALT;
2119 ptr++;
2120 }
2121/* Control never reaches here */
2122}
2123
2124
2125
2126/*************************************************
2127* Check for anchored expression *
2128*************************************************/
2129
2130/* Try to find out if this is an anchored regular expression. Consider each
2131alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
2132all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
2133it's anchored. However, if this is a multiline pattern, then only OP_SOD
2134counts, since OP_CIRC can match in the middle.
2135
2136A branch is also implicitly anchored if it starts with .* because that will try
2137the rest of the pattern at all possible matching points, so there is no point
2138trying them again.
2139
2140Argument: points to start of expression (the bracket)
2141Returns: TRUE or FALSE
2142*/
2143
2144static BOOL
Guido van Rossum58132c61997-12-17 00:24:13 +00002145is_anchored(register const uschar *code, BOOL multiline)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002146{
2147do {
2148 int op = (int)code[3];
Guido van Rossum50700601997-12-08 17:15:20 +00002149 if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002150 { if (!is_anchored(code+3, multiline)) return FALSE; }
2151 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
2152 { if (code[4] != OP_ANY) return FALSE; }
2153 else if (op != OP_SOD && (multiline || op != OP_CIRC)) return FALSE;
2154 code += (code[1] << 8) + code[2];
2155 }
2156while (*code == OP_ALT);
2157return TRUE;
2158}
2159
2160
2161
2162/*************************************************
2163* Check for start with \n line expression *
2164*************************************************/
2165
2166/* This is called for multiline expressions to try to find out if every branch
2167starts with ^ so that "first char" processing can be done to speed things up.
2168
2169Argument: points to start of expression (the bracket)
2170Returns: TRUE or FALSE
2171*/
2172
2173static BOOL
Guido van Rossum58132c61997-12-17 00:24:13 +00002174is_startline(const uschar *code)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002175{
2176do {
2177 if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
2178 { if (!is_startline(code+3)) return FALSE; }
2179 else if (code[3] != OP_CIRC) return FALSE;
2180 code += (code[1] << 8) + code[2];
2181 }
2182while (*code == OP_ALT);
2183return TRUE;
2184}
2185
2186
2187
2188/*************************************************
2189* Check for fixed first char *
2190*************************************************/
2191
2192/* Try to find out if there is a fixed first character. This is called for
2193unanchored expressions, as it speeds up their processing quite considerably.
2194Consider each alternative branch. If they all start with the same char, or with
2195a bracket all of whose alternatives start with the same char (recurse ad lib),
2196then we return that char, otherwise -1.
2197
2198Argument: points to start of expression (the bracket)
2199Returns: -1 or the fixed first char
2200*/
2201
2202static int
2203find_firstchar(uschar *code)
2204{
2205register int c = -1;
2206do
2207 {
2208 register int charoffset = 4;
2209
2210 if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
2211 {
2212 register int d;
2213 if ((d = find_firstchar(code+3)) < 0) return -1;
2214 if (c < 0) c = d; else if (c != d) return -1;
2215 }
2216
2217 else switch(code[3])
2218 {
2219 default:
2220 return -1;
2221
2222 case OP_EXACT: /* Fall through */
2223 charoffset++;
2224
2225 case OP_CHARS: /* Fall through */
2226 charoffset++;
2227
2228 case OP_PLUS:
2229 case OP_MINPLUS:
2230 if (c < 0) c = code[charoffset]; else if (c != code[charoffset]) return -1;
2231 break;
2232 }
2233 code += (code[1] << 8) + code[2];
2234 }
2235while (*code == OP_ALT);
2236return c;
2237}
2238
2239
2240
2241/*************************************************
2242* Compile a Regular Expression *
2243*************************************************/
2244
2245/* This function takes a string and returns a pointer to a block of store
2246holding a compiled version of the expression.
2247
2248Arguments:
2249 pattern the regular expression
2250 options various option bits
2251 errorptr pointer to pointer to error text
2252 erroroffset ptr offset in pattern where error was detected
2253
2254Returns: pointer to compiled data block, or NULL on error,
2255 with errorptr and erroroffset set
2256*/
2257
2258pcre *
Guido van Rossum58132c61997-12-17 00:24:13 +00002259pcre_compile(const char *pattern, int options, const char **errorptr,
Guido van Rossum50700601997-12-08 17:15:20 +00002260 int *erroroffset, PyObject *dictionary)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002261{
2262real_pcre *re;
2263int spaces = 0;
2264int length = 3; /* For initial BRA plus length */
2265int runlength;
2266int c, size;
Guido van Rossum50700601997-12-08 17:15:20 +00002267int bracount = 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002268int brastack[200];
Guido van Rossum50700601997-12-08 17:15:20 +00002269int top_backref = 0;
Guido van Rossum58132c61997-12-17 00:24:13 +00002270unsigned int brastackptr = 0;
2271uschar *code;
2272const uschar *ptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002273
2274#ifdef DEBUG
2275uschar *code_base, *code_end;
2276#endif
2277
Guido van Rossum50700601997-12-08 17:15:20 +00002278/* We can't pass back an error message if errorptr is NULL; I guess the best we
2279can do is just return NULL. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002280
Guido van Rossum50700601997-12-08 17:15:20 +00002281if (errorptr == NULL) return NULL;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002282*errorptr = NULL;
Guido van Rossum50700601997-12-08 17:15:20 +00002283
2284/* However, we can give a message for this error */
2285
2286if (erroroffset == NULL)
2287 {
2288 *errorptr = ERR16;
2289 return NULL;
2290 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002291*erroroffset = 0;
2292
2293if ((options & ~PUBLIC_OPTIONS) != 0)
2294 {
Guido van Rossum50700601997-12-08 17:15:20 +00002295 *errorptr = ERR17;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002296 return NULL;
2297 }
2298
Guido van Rossum557dea11997-12-22 22:46:52 +00002299DPRINTF(("------------------------------------------------------------------\n"));
2300DPRINTF(("%s\n", pattern));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002301
2302/* The first thing to do is to make a pass over the pattern to compute the
2303amount of store required to hold the compiled code. This does not have to be
2304perfect as long as errors are overestimates. At the same time we can detect any
2305internal flag settings. Make an attempt to correct for any counted white space
2306if an "extended" flag setting appears late in the pattern. We can't be so
2307clever for #-comments. */
2308
Guido van Rossum58132c61997-12-17 00:24:13 +00002309ptr = (const uschar *)(pattern - 1);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002310while ((c = *(++ptr)) != 0)
2311 {
Guido van Rossum50700601997-12-08 17:15:20 +00002312 int min, max;
2313 int class_charcount;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002314
2315 if ((pcre_ctypes[c] & ctype_space) != 0)
2316 {
Guido van Rossum50700601997-12-08 17:15:20 +00002317 if ((options & PCRE_EXTENDED) != 0) continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002318 spaces++;
2319 }
2320
Guido van Rossum50700601997-12-08 17:15:20 +00002321 if (c == '#' && (options & PCRE_EXTENDED) != 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002322 {
2323 while ((c = *(++ptr)) != 0 && c != '\n');
2324 continue;
2325 }
2326
2327 switch(c)
2328 {
2329 /* A backslashed item may be an escaped "normal" character or a
2330 character type. For a "normal" character, put the pointers and
2331 character back so that tests for whitespace etc. in the input
2332 are done correctly. */
2333
2334 case '\\':
2335 {
Guido van Rossum58132c61997-12-17 00:24:13 +00002336 const uschar *save_ptr = ptr;
Guido van Rossum50700601997-12-08 17:15:20 +00002337 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002338 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2339 if (c >= 0)
2340 {
2341 ptr = save_ptr;
2342 c = '\\';
2343 goto NORMAL_CHAR;
2344 }
2345 }
2346 length++;
2347
2348 /* A back reference needs an additional char, plus either one or 5
Guido van Rossum50700601997-12-08 17:15:20 +00002349 bytes for a repeat. We also need to keep the value of the highest
2350 back reference. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002351
2352 if (c <= -ESC_REF)
2353 {
Guido van Rossum50700601997-12-08 17:15:20 +00002354 int refnum = -c - ESC_REF;
2355 if (refnum > top_backref) top_backref = refnum;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002356 length++; /* For single back reference */
Guido van Rossum50700601997-12-08 17:15:20 +00002357 if (ptr[1] == '{' && is_counted_repeat(ptr+2))
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002358 {
2359 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
2360 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2361 if ((min == 0 && (max == 1 || max == -1)) ||
2362 (min == 1 && max == -1))
2363 length++;
2364 else length += 5;
2365 if (ptr[1] == '?') ptr++;
2366 }
2367 }
2368 continue;
2369
2370 case '^':
2371 case '.':
2372 case '$':
2373 case '*': /* These repeats won't be after brackets; */
2374 case '+': /* those are handled separately */
2375 case '?':
2376 length++;
2377 continue;
2378
2379 /* This covers the cases of repeats after a single char, metachar, class,
2380 or back reference. */
2381
2382 case '{':
Guido van Rossum50700601997-12-08 17:15:20 +00002383 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002384 ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);
2385 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2386 if ((min == 0 && (max == 1 || max == -1)) ||
2387 (min == 1 && max == -1))
2388 length++;
2389 else
2390 {
2391 length--; /* Uncount the original char or metachar */
2392 if (min == 1) length++; else if (min > 0) length += 4;
2393 if (max > 0) length += 4; else length += 2;
2394 }
2395 if (ptr[1] == '?') ptr++;
2396 continue;
2397
2398 /* An alternation contains an offset to the next branch or ket. */
2399 case '|':
2400 length += 3;
2401 continue;
2402
Guido van Rossum50700601997-12-08 17:15:20 +00002403 /* A character class uses 33 characters. Don't worry about character types
2404 that aren't allowed in classes - they'll get picked up during the compile.
2405 A character class that contains only one character uses 2 or 3 bytes,
2406 depending on whether it is negated or not. Notice this where we can. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002407
2408 case '[':
Guido van Rossum50700601997-12-08 17:15:20 +00002409 class_charcount = 0;
2410 if (*(++ptr) == '^') ptr++;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002411 do
2412 {
Guido van Rossum50700601997-12-08 17:15:20 +00002413 if (*ptr == '\\')
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002414 {
Guido van Rossum557dea11997-12-22 22:46:52 +00002415 int ch = check_escape(&ptr, errorptr, bracount, options, TRUE);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002416 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
Guido van Rossum557dea11997-12-22 22:46:52 +00002417 if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002418 }
Guido van Rossum50700601997-12-08 17:15:20 +00002419 else class_charcount++;
2420 ptr++;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002421 }
2422 while (*ptr != 0 && *ptr != ']');
2423
Guido van Rossum50700601997-12-08 17:15:20 +00002424 /* Repeats for negated single chars are handled by the general code */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002425
Guido van Rossum50700601997-12-08 17:15:20 +00002426 if (class_charcount == 1) length += 3; else
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002427 {
Guido van Rossum50700601997-12-08 17:15:20 +00002428 length += 33;
2429 if (options & PCRE_LOCALE) length++; /* Add a byte for the localization flag */
2430
2431 /* A repeat needs either 1 or 5 bytes. */
2432
Guido van Rossum557dea11997-12-22 22:46:52 +00002433 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))
Guido van Rossum50700601997-12-08 17:15:20 +00002434 {
2435 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
2436 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2437 if ((min == 0 && (max == 1 || max == -1)) ||
2438 (min == 1 && max == -1))
2439 length++;
2440 else length += 5;
2441 if (ptr[1] == '?') ptr++;
2442 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002443 }
2444 continue;
2445
2446 /* Brackets may be genuine groups or special things */
2447
2448 case '(':
2449
2450 /* Handle special forms of bracket, which all start (? */
2451
2452 if (ptr[1] == '?') switch (c = ptr[2])
2453 {
2454 /* Skip over comments entirely */
2455 case '#':
2456 ptr += 3;
2457 while (*ptr != 0 && *ptr != ')') ptr++;
2458 if (*ptr == 0)
2459 {
Guido van Rossum50700601997-12-08 17:15:20 +00002460 *errorptr = ERR18;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002461 goto PCRE_ERROR_RETURN;
2462 }
2463 continue;
2464
2465 /* Non-referencing groups and lookaheads just move the pointer on, and
Guido van Rossum50700601997-12-08 17:15:20 +00002466 then behave like a non-special bracket, except that they don't increment
2467 the count of extracting brackets. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002468
2469 case ':':
2470 case '=':
2471 case '!':
2472 ptr += 2;
2473 break;
2474
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002475 case ('P'):
2476 {
2477 int idlen;
2478 switch (*ptr++) {
2479 case ('<'):
2480 idlen = get_group_id(ptr++, '>', errorptr);
2481 if (*errorptr) goto PCRE_ERROR_RETURN;
2482 ptr += idlen+1;
2483 break;
2484 case ('='):
2485 idlen = get_group_id(ptr++, ')', errorptr);
2486 if (*errorptr) goto PCRE_ERROR_RETURN;
2487 ptr += idlen+1;
2488 length++;
2489 break;
2490 }
2491 }
2492 break;
2493
Guido van Rossum50700601997-12-08 17:15:20 +00002494 /* Ditto for the "once only" bracket, allowed only if the extra bit
2495 is set. */
2496
2497 case '>':
2498 if ((options & PCRE_EXTRA) != 0)
2499 {
2500 ptr += 2;
2501 break;
2502 }
Guido van Rossumdda66961998-05-07 15:32:44 +00002503 /* Else fall through */
Guido van Rossum50700601997-12-08 17:15:20 +00002504
2505 /* Else loop setting valid options until ) is met. Anything else is an
2506 error. */
2507
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002508 default:
2509 ptr += 2;
2510 for (;; ptr++)
2511 {
2512 if ((c = *ptr) == 'i')
2513 {
2514 options |= PCRE_CASELESS;
2515 continue;
2516 }
Guido van Rossumbd49ac41997-12-10 23:05:53 +00002517 else if ((c = *ptr) == 'L')
Guido van Rossum50700601997-12-08 17:15:20 +00002518 {
2519 options |= PCRE_LOCALE;
2520 continue;
2521 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002522 else if ((c = *ptr) == 'm')
2523 {
2524 options |= PCRE_MULTILINE;
2525 continue;
2526 }
Guido van Rossum50700601997-12-08 17:15:20 +00002527 else if (c == 's')
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002528 {
2529 options |= PCRE_DOTALL;
2530 continue;
2531 }
2532 else if (c == 'x')
2533 {
2534 options |= PCRE_EXTENDED;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002535 length -= spaces; /* Already counted spaces */
2536 continue;
2537 }
2538 else if (c == ')') break;
2539
Guido van Rossum50700601997-12-08 17:15:20 +00002540 *errorptr = ERR12;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002541 goto PCRE_ERROR_RETURN;
2542 }
2543 continue; /* End of this bracket handling */
2544 }
2545
Guido van Rossum50700601997-12-08 17:15:20 +00002546 /* Extracting brackets must be counted so we can process escapes in a
2547 Perlish way. */
2548
2549 else bracount++;
2550
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002551 /* Non-special forms of bracket. Save length for computing whole length
2552 at end if there's a repeat that requires duplication of the group. */
2553
2554 if (brastackptr >= sizeof(brastack)/sizeof(int))
2555 {
Guido van Rossum50700601997-12-08 17:15:20 +00002556 *errorptr = ERR19;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002557 goto PCRE_ERROR_RETURN;
2558 }
2559
2560 brastack[brastackptr++] = length;
2561 length += 3;
2562 continue;
2563
2564 /* Handle ket. Look for subsequent max/min; for certain sets of values we
Guido van Rossum557dea11997-12-22 22:46:52 +00002565 have to replicate this bracket up to that many times. If brastackptr is
2566 0 this is an unmatched bracket which will generate an error, but take care
2567 not to try to access brastack[-1]. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002568
2569 case ')':
2570 length += 3;
2571 {
Guido van Rossum557dea11997-12-22 22:46:52 +00002572 int minval = 1;
2573 int maxval = 1;
2574 int duplength = (brastackptr > 0)? length - brastack[--brastackptr] : 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002575
2576 /* Leave ptr at the final char; for read_repeat_counts this happens
2577 automatically; for the others we need an increment. */
2578
Guido van Rossum50700601997-12-08 17:15:20 +00002579 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002580 {
Guido van Rossum557dea11997-12-22 22:46:52 +00002581 ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002582 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2583 }
Guido van Rossum557dea11997-12-22 22:46:52 +00002584 else if (c == '*') { minval = 0; maxval = -1; ptr++; }
2585 else if (c == '+') { maxval = -1; ptr++; }
2586 else if (c == '?') { minval = 0; ptr++; }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002587
Guido van Rossum557dea11997-12-22 22:46:52 +00002588 /* If there is a minimum > 1 we have to replicate up to minval-1 times;
2589 if there is a limited maximum we have to replicate up to maxval-1 times
2590 and allow for a BRAZERO item before each optional copy, as we also have
2591 to do before the first copy if the minimum is zero. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002592
Guido van Rossum557dea11997-12-22 22:46:52 +00002593 if (minval == 0) length++;
2594 else if (minval > 1) length += (minval - 1) * duplength;
2595 if (maxval > minval) length += (maxval - minval) * (duplength + 1);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002596 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002597 continue;
2598
2599 /* Non-special character. For a run of such characters the length required
2600 is the number of characters + 2, except that the maximum run length is 255.
2601 We won't get a skipped space or a non-data escape or the start of a #
2602 comment as the first character, so the length can't be zero. */
2603
2604 NORMAL_CHAR:
2605 default:
2606 length += 2;
2607 runlength = 0;
2608 do
2609 {
2610 if ((pcre_ctypes[c] & ctype_space) != 0)
2611 {
Guido van Rossum50700601997-12-08 17:15:20 +00002612 if ((options & PCRE_EXTENDED) != 0) continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002613 spaces++;
2614 }
2615
Guido van Rossum50700601997-12-08 17:15:20 +00002616 if (c == '#' && (options & PCRE_EXTENDED) != 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002617 {
2618 while ((c = *(++ptr)) != 0 && c != '\n');
2619 continue;
2620 }
2621
2622 /* Backslash may introduce a data char or a metacharacter; stop the
2623 string before the latter. */
2624
2625 if (c == '\\')
2626 {
Guido van Rossum58132c61997-12-17 00:24:13 +00002627 const uschar *saveptr = ptr;
Guido van Rossum50700601997-12-08 17:15:20 +00002628 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002629 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2630 if (c < 0) { ptr = saveptr; break; }
2631 }
2632
2633 /* Ordinary character or single-char escape */
2634
2635 runlength++;
2636 }
2637
2638 /* This "while" is the end of the "do" above. */
2639
2640 while (runlength < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
2641
2642 ptr--;
2643 length += runlength;
2644 continue;
2645 }
2646 }
2647
2648length += 4; /* For final KET and END */
2649
2650if (length > 65539)
2651 {
Guido van Rossum50700601997-12-08 17:15:20 +00002652 *errorptr = ERR20;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002653 return NULL;
2654 }
2655
2656/* Compute the size of data block needed and get it, either from malloc or
Guido van Rossum557dea11997-12-22 22:46:52 +00002657externally provided function. We specify "code[0]" in the offsetof() expression
2658rather than just "code", because it has been reported that one broken compiler
2659fails on "code" because it is also an independent variable. It should make no
2660difference to the value of the offsetof(). */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002661
Guido van Rossum557dea11997-12-22 22:46:52 +00002662size = length + offsetof(real_pcre, code[0]);
Guido van Rossum50700601997-12-08 17:15:20 +00002663re = (real_pcre *)(pcre_malloc)(size+50);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002664
2665if (re == NULL)
2666 {
Guido van Rossum50700601997-12-08 17:15:20 +00002667 *errorptr = ERR21;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002668 return NULL;
2669 }
2670
Guido van Rossum557dea11997-12-22 22:46:52 +00002671/* Put in the magic number and the options. */
2672
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002673re->magic_number = MAGIC_NUMBER;
2674re->options = options;
2675
2676/* Set up a starting, non-extracting bracket, then compile the expression. On
2677error, *errorptr will be set non-NULL, so we don't need to look at the result
2678of the function here. */
2679
Guido van Rossum58132c61997-12-17 00:24:13 +00002680ptr = (const uschar *)pattern;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002681code = re->code;
2682*code = OP_BRA;
Guido van Rossum50700601997-12-08 17:15:20 +00002683bracount = 0;
2684(void)compile_regex(options, &bracount, &code, &ptr, errorptr, dictionary);
2685re->top_bracket = bracount;
2686re->top_backref = top_backref;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002687
2688/* If not reached end of pattern on success, there's an excess bracket. */
2689
Guido van Rossum50700601997-12-08 17:15:20 +00002690if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
2691
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002692/* Fill in the terminating state and check for disastrous overflow, but
2693if debugging, leave the test till after things are printed out. */
2694
2695*code++ = OP_END;
2696
Guido van Rossum50700601997-12-08 17:15:20 +00002697
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002698#ifndef DEBUG
Guido van Rossum50700601997-12-08 17:15:20 +00002699if (code - re->code > length) *errorptr = ERR23;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002700#endif
2701
2702/* Failed to compile */
2703
2704if (*errorptr != NULL)
2705 {
2706 (pcre_free)(re);
2707 PCRE_ERROR_RETURN:
Guido van Rossum58132c61997-12-17 00:24:13 +00002708 *erroroffset = ptr - (const uschar *)pattern;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002709 return NULL;
2710 }
2711
2712/* If the anchored option was not passed, set flag if we can determine that it
2713is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if
2714we can determine what the first character has to be, because that speeds up
2715unanchored matches no end. In the case of multiline matches, an alternative is
2716to set the PCRE_STARTLINE flag if all branches start with ^. */
2717
2718if ((options & PCRE_ANCHORED) == 0)
2719 {
2720 if (is_anchored(re->code, (options & PCRE_MULTILINE) != 0))
2721 re->options |= PCRE_ANCHORED;
2722 else
2723 {
Guido van Rossum557dea11997-12-22 22:46:52 +00002724 int ch = find_firstchar(re->code);
2725 if (ch >= 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002726 {
Guido van Rossum557dea11997-12-22 22:46:52 +00002727 re->first_char = ch;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002728 re->options |= PCRE_FIRSTSET;
2729 }
2730 else if (is_startline(re->code))
2731 re->options |= PCRE_STARTLINE;
2732 }
2733 }
2734
2735/* Print out the compiled data for debugging */
2736
2737#ifdef DEBUG
2738
Guido van Rossum50700601997-12-08 17:15:20 +00002739printf("Length = %d top_bracket = %d top_backref=%d\n",
2740 length, re->top_bracket, re->top_backref);
2741
2742if (re->options != 0)
2743 {
Guido van Rossumdda66961998-05-07 15:32:44 +00002744 printf("%s%s%s%s%s%s%s%s\n",
Guido van Rossum50700601997-12-08 17:15:20 +00002745 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
2746 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
2747 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
2748 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
2749 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
2750 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
Guido van Rossumdda66961998-05-07 15:32:44 +00002751 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
2752 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
Guido van Rossum50700601997-12-08 17:15:20 +00002753 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002754
2755if ((re->options & PCRE_FIRSTSET) != 0)
2756 {
2757 if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
2758 else printf("First char = \\x%02x\n", re->first_char);
2759 }
2760
2761code_end = code;
2762code_base = code = re->code;
2763
2764while (code < code_end)
2765 {
2766 int charlength;
2767
2768 printf("%3d ", code - code_base);
2769
2770 if (*code >= OP_BRA)
2771 {
2772 printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
2773 code += 2;
2774 }
2775
2776 else switch(*code)
2777 {
2778 case OP_CHARS:
2779 charlength = *(++code);
2780 printf("%3d ", charlength);
2781 while (charlength-- > 0)
2782 if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
2783 break;
2784
2785 case OP_KETRMAX:
2786 case OP_KETRMIN:
2787 case OP_ALT:
2788 case OP_KET:
2789 case OP_ASSERT:
2790 case OP_ASSERT_NOT:
Guido van Rossum50700601997-12-08 17:15:20 +00002791 case OP_ONCE:
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002792 printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2793 code += 2;
2794 break;
2795
2796 case OP_STAR:
2797 case OP_MINSTAR:
2798 case OP_PLUS:
2799 case OP_MINPLUS:
2800 case OP_QUERY:
2801 case OP_MINQUERY:
2802 case OP_TYPESTAR:
2803 case OP_TYPEMINSTAR:
2804 case OP_TYPEPLUS:
2805 case OP_TYPEMINPLUS:
2806 case OP_TYPEQUERY:
2807 case OP_TYPEMINQUERY:
2808 if (*code >= OP_TYPESTAR)
2809 printf(" %s", OP_names[code[1]]);
2810 else if (isprint(c = code[1])) printf(" %c", c);
2811 else printf(" \\x%02x", c);
2812 printf("%s", OP_names[*code++]);
2813 break;
2814
2815 case OP_EXACT:
2816 case OP_UPTO:
2817 case OP_MINUPTO:
2818 if (isprint(c = code[3])) printf(" %c{", c);
2819 else printf(" \\x%02x{", c);
Guido van Rossum557dea11997-12-22 22:46:52 +00002820 if (*code != OP_EXACT) printf("0,");
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002821 printf("%d}", (code[1] << 8) + code[2]);
2822 if (*code == OP_MINUPTO) printf("?");
2823 code += 3;
2824 break;
2825
2826 case OP_TYPEEXACT:
2827 case OP_TYPEUPTO:
2828 case OP_TYPEMINUPTO:
2829 printf(" %s{", OP_names[code[3]]);
2830 if (*code != OP_TYPEEXACT) printf(",");
2831 printf("%d}", (code[1] << 8) + code[2]);
2832 if (*code == OP_TYPEMINUPTO) printf("?");
2833 code += 3;
2834 break;
2835
Guido van Rossum50700601997-12-08 17:15:20 +00002836 case OP_NOT:
2837 if (isprint(c = *(++code))) printf(" [^%c]", c);
2838 else printf(" [^\\x%02x]", c);
2839 break;
2840
2841 case OP_NOTSTAR:
2842 case OP_NOTMINSTAR:
2843 case OP_NOTPLUS:
2844 case OP_NOTMINPLUS:
2845 case OP_NOTQUERY:
2846 case OP_NOTMINQUERY:
2847 if (isprint(c = code[1])) printf(" [^%c]", c);
2848 else printf(" [^\\x%02x]", c);
2849 printf("%s", OP_names[*code++]);
2850 break;
2851
2852 case OP_NOTEXACT:
2853 case OP_NOTUPTO:
2854 case OP_NOTMINUPTO:
2855 if (isprint(c = code[3])) printf(" [^%c]{", c);
2856 else printf(" [^\\x%02x]{", c);
2857 if (*code != OP_NOTEXACT) printf(",");
2858 printf("%d}", (code[1] << 8) + code[2]);
2859 if (*code == OP_NOTMINUPTO) printf("?");
2860 code += 3;
2861 break;
2862
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002863 case OP_REF:
2864 printf(" \\%d", *(++code));
Guido van Rossum557dea11997-12-22 22:46:52 +00002865 code ++;
2866 goto CLASS_REF_REPEAT;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002867
2868 case OP_CLASS:
Guido van Rossum042ff9e1998-04-03 21:13:31 +00002869 case OP_NEGCLASS:
Guido van Rossum50700601997-12-08 17:15:20 +00002870 case OP_CLASS_L:
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002871 {
2872 int i, min, max;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002873
Guido van Rossum50700601997-12-08 17:15:20 +00002874 if (*code==OP_CLASS_L)
2875 {
2876 code++;
2877 printf("Locflag = %i ", *code++);
Guido van Rossum042ff9e1998-04-03 21:13:31 +00002878 printf(" [");
Guido van Rossum50700601997-12-08 17:15:20 +00002879 }
2880 else
Guido van Rossum042ff9e1998-04-03 21:13:31 +00002881 {
2882 if (*code++ == OP_CLASS) printf(" [");
2883 else printf(" ^[");
2884 }
2885
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002886
Guido van Rossum50700601997-12-08 17:15:20 +00002887 for (i = 0; i < 256; i++)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002888 {
Guido van Rossum50700601997-12-08 17:15:20 +00002889 if ((code[i/8] & (1 << (i&7))) != 0)
2890 {
2891 int j;
2892 for (j = i+1; j < 256; j++)
2893 if ((code[j/8] & (1 << (j&7))) == 0) break;
2894 if (i == '-' || i == ']') printf("\\");
2895 if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
2896 if (--j > i)
2897 {
2898 printf("-");
2899 if (j == '-' || j == ']') printf("\\");
2900 if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
2901 }
2902 i = j;
2903 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002904 }
2905 printf("]");
Guido van Rossum50700601997-12-08 17:15:20 +00002906 code += 32;
2907 /* code ++;*/
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002908
Guido van Rossum557dea11997-12-22 22:46:52 +00002909 CLASS_REF_REPEAT:
2910
Guido van Rossum50700601997-12-08 17:15:20 +00002911 switch(*code)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002912 {
2913 case OP_CRSTAR:
2914 case OP_CRMINSTAR:
2915 case OP_CRPLUS:
2916 case OP_CRMINPLUS:
2917 case OP_CRQUERY:
2918 case OP_CRMINQUERY:
2919 printf("%s", OP_names[*code]);
2920 break;
2921
2922 case OP_CRRANGE:
2923 case OP_CRMINRANGE:
2924 min = (code[1] << 8) + code[2];
2925 max = (code[3] << 8) + code[4];
2926 if (max == 0) printf("{%d,}", min);
2927 else printf("{%d,%d}", min, max);
2928 if (*code == OP_CRMINRANGE) printf("?");
2929 code += 4;
2930 break;
2931
2932 default:
2933 code--;
2934 }
2935 }
2936 break;
2937
2938 /* Anything else is just a one-node item */
2939
2940 default:
2941 printf(" %s", OP_names[*code]);
2942 break;
2943 }
2944
2945 code++;
2946 printf("\n");
2947 }
2948printf("------------------------------------------------------------------\n");
2949
2950/* This check is done here in the debugging case so that the code that
2951was compiled can be seen. */
2952
2953if (code - re->code > length)
2954 {
Guido van Rossum50700601997-12-08 17:15:20 +00002955 printf("length=%i, code length=%i\n", length, code-re->code);
2956 *errorptr = ERR23;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002957 (pcre_free)(re);
2958 *erroroffset = ptr - (uschar *)pattern;
2959 return NULL;
2960 }
2961#endif
2962
2963return (pcre *)re;
2964}
2965
2966
2967
2968/*************************************************
2969* Match a character type *
2970*************************************************/
2971
2972/* Not used in all the places it might be as it's sometimes faster
2973to put the code inline.
2974
2975Arguments:
2976 type the character type
2977 c the character
Guido van Rossum50700601997-12-08 17:15:20 +00002978 dotall the dotall flag
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002979
2980Returns: TRUE if character is of the type
2981*/
2982
2983static BOOL
2984match_type(int type, int c, BOOL dotall)
2985{
2986
2987#ifdef DEBUG
2988if (isprint(c)) printf("matching subject %c against ", c);
2989 else printf("matching subject \\x%02x against ", c);
2990printf("%s\n", OP_names[type]);
2991#endif
2992
2993switch(type)
2994 {
2995 case OP_ANY: return dotall || c != '\n';
2996 case OP_NOT_DIGIT: return (pcre_ctypes[c] & ctype_digit) == 0;
2997 case OP_DIGIT: return (pcre_ctypes[c] & ctype_digit) != 0;
2998 case OP_NOT_WHITESPACE: return (pcre_ctypes[c] & ctype_space) == 0;
2999 case OP_WHITESPACE: return (pcre_ctypes[c] & ctype_space) != 0;
3000 case OP_NOT_WORDCHAR: return (pcre_ctypes[c] & ctype_word) == 0;
3001 case OP_WORDCHAR: return (pcre_ctypes[c] & ctype_word) != 0;
Guido van Rossum58132c61997-12-17 00:24:13 +00003002 case OP_NOT_WORDCHAR_L: return (c!='_' && !isalnum(c));
3003 case OP_WORDCHAR_L: return (c=='_' || isalnum(c));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003004 }
3005return FALSE;
3006}
3007
3008
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003009
3010/*************************************************
3011* Match a back-reference *
3012*************************************************/
3013
3014/* If a back reference hasn't been set, the match fails.
3015
3016Arguments:
3017 number reference number
3018 eptr points into the subject
3019 length length to be matched
3020 md points to match data block
3021
3022Returns: TRUE if matched
3023*/
3024
3025static BOOL
Guido van Rossum58132c61997-12-17 00:24:13 +00003026match_ref(int number, register const uschar *eptr, int length, match_data *md)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003027{
Guido van Rossum58132c61997-12-17 00:24:13 +00003028const uschar *p = md->start_subject + md->offset_vector[number];
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003029
3030#ifdef DEBUG
3031if (eptr >= md->end_subject)
3032 printf("matching subject <null>");
3033else
3034 {
3035 printf("matching subject ");
3036 pchars(eptr, length, TRUE, md);
3037 }
3038printf(" against backref ");
3039pchars(p, length, FALSE, md);
3040printf("\n");
3041#endif
3042
3043/* Always fail if not enough characters left */
3044
3045if (length > md->end_subject - p) return FALSE;
3046
Guido van Rossum58132c61997-12-17 00:24:13 +00003047/* Separate the caseless case for speed */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003048
3049if (md->caseless)
3050 { while (length-- > 0) if (pcre_lcc[*p++] != pcre_lcc[*eptr++]) return FALSE; }
3051else
3052 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
3053
3054return TRUE;
3055}
3056
3057static int free_stack(match_data *md)
3058{
3059/* Free any stack space that was allocated by the call to match(). */
Andrew M. Kuchlingfc9d2252000-02-18 19:16:45 +00003060if (md->off_num) PyMem_DEL(md->off_num);
3061if (md->offset_top) PyMem_DEL(md->offset_top);
3062if (md->r1) PyMem_DEL(md->r1);
3063if (md->r2) PyMem_DEL(md->r2);
3064if (md->eptr) PyMem_DEL((char *)md->eptr);
3065if (md->ecode) PyMem_DEL((char *)md->ecode);
Guido van Rossumc3861071997-10-08 02:07:40 +00003066return 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003067}
3068
3069static int grow_stack(match_data *md)
3070{
Guido van Rossum50700601997-12-08 17:15:20 +00003071 if (md->length != 0)
3072 {
3073 md->length = md->length + md->length/2;
3074 }
3075 else
3076 {
3077 int string_len = md->end_subject - md->start_subject + 1;
3078 if (string_len < 80) {md->length = string_len; }
3079 else {md->length = 80;}
3080 }
3081 PyMem_RESIZE(md->offset_top, int, md->length);
Guido van Rossum58132c61997-12-17 00:24:13 +00003082 PyMem_RESIZE(md->eptr, const uschar *, md->length);
3083 PyMem_RESIZE(md->ecode, const uschar *, md->length);
Guido van Rossum50700601997-12-08 17:15:20 +00003084 PyMem_RESIZE(md->off_num, int, md->length);
3085 PyMem_RESIZE(md->r1, int, md->length);
3086 PyMem_RESIZE(md->r2, int, md->length);
3087 if (md->offset_top == NULL || md->eptr == NULL || md->ecode == NULL ||
3088 md->off_num == NULL || md->r1 == NULL || md->r2 == NULL)
3089 {
Guido van Rossumdda66961998-05-07 15:32:44 +00003090 PyErr_NoMemory();
Guido van Rossum50700601997-12-08 17:15:20 +00003091 longjmp(md->error_env, 1);
3092 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003093 return 0;
3094}
3095
Guido van Rossum50700601997-12-08 17:15:20 +00003096
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003097/*************************************************
3098* Match from current position *
3099*************************************************/
3100
3101/* On entry ecode points to the first opcode, and eptr to the first character.
3102
3103Arguments:
3104 eptr pointer in subject
3105 ecode position in code
3106 offset_top current top pointer
3107 md pointer to "static" info for the match
3108
3109Returns: TRUE if matched
3110*/
3111
3112static BOOL
Guido van Rossum58132c61997-12-17 00:24:13 +00003113match(register const uschar *eptr, register const uschar *ecode, int offset_top,
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003114 match_data *md)
3115{
3116 int save_stack_position = md->point;
3117match_loop:
3118
3119#define SUCCEED goto succeed
3120#define FAIL goto fail
3121
3122for (;;)
3123 {
3124 int min, max, ctype;
3125 register int i;
3126 register int c;
Guido van Rossum58132c61997-12-17 00:24:13 +00003127 BOOL minimize = FALSE;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003128
3129 /* Opening bracket. Check the alternative branches in turn, failing if none
3130 match. We have to set the start offset if required and there is space
3131 in the offset vector so that it is available for subsequent back references
3132 if the bracket matches. However, if the bracket fails, we must put back the
3133 previous value of both offsets in case they were set by a previous copy of
3134 the same bracket. Don't worry about setting the flag for the error case here;
3135 that is handled in the code for KET. */
3136
3137 if ((int)*ecode >= OP_BRA)
3138 {
3139 int number = (*ecode - OP_BRA) << 1;
Guido van Rossum58132c61997-12-17 00:24:13 +00003140 int save_offset1 = 0, save_offset2 = 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003141
Guido van Rossum557dea11997-12-22 22:46:52 +00003142 DPRINTF(("start bracket %d\n", number/2));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003143
3144 if (number > 0 && number < md->offset_end)
3145 {
3146 save_offset1 = md->offset_vector[number];
3147 save_offset2 = md->offset_vector[number+1];
3148 md->offset_vector[number] = eptr - md->start_subject;
3149
Guido van Rossum557dea11997-12-22 22:46:52 +00003150 DPRINTF(("saving %d %d\n", save_offset1, save_offset2));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003151 }
3152
3153 /* Recurse for all the alternatives. */
3154
3155 do
3156 {
3157 if (match(eptr, ecode+3, offset_top, md)) SUCCEED;
3158 ecode += (ecode[1] << 8) + ecode[2];
3159 }
3160 while (*ecode == OP_ALT);
3161
Guido van Rossum557dea11997-12-22 22:46:52 +00003162 DPRINTF(("bracket %d failed\n", number/2));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003163
3164 if (number > 0 && number < md->offset_end)
3165 {
3166 md->offset_vector[number] = save_offset1;
3167 md->offset_vector[number+1] = save_offset2;
3168 }
3169
3170 FAIL;
3171 }
3172
3173 /* Other types of node can be handled by a switch */
3174
3175 switch(*ecode)
3176 {
3177 case OP_END:
3178 md->end_match_ptr = eptr; /* Record where we ended */
3179 md->end_offset_top = offset_top; /* and how many extracts were taken */
3180 SUCCEED;
3181
Guido van Rossum50700601997-12-08 17:15:20 +00003182 /* The equivalent of Prolog's "cut" - if the rest doesn't match, the
3183 whole thing doesn't match, so we have to get out via a longjmp(). */
3184
3185 case OP_CUT:
3186 if (match(eptr, ecode+1, offset_top, md)) SUCCEED;
3187 longjmp(md->fail_env, 1);
3188
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003189 /* Assertion brackets. Check the alternative branches in turn - the
3190 matching won't pass the KET for an assertion. If any one branch matches,
3191 the assertion is true. */
3192
3193 case OP_ASSERT:
3194 do
3195 {
3196 if (match(eptr, ecode+3, offset_top, md)) break;
3197 ecode += (ecode[1] << 8) + ecode[2];
3198 }
3199 while (*ecode == OP_ALT);
3200 if (*ecode == OP_KET) FAIL;
3201
3202 /* Continue from after the assertion, updating the offsets high water
3203 mark, since extracts may have been taken during the assertion. */
3204
3205 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3206 ecode += 3;
3207 offset_top = md->end_offset_top;
3208 continue;
3209
3210 /* Negative assertion: all branches must fail to match */
3211
3212 case OP_ASSERT_NOT:
3213 do
3214 {
3215 if (match(eptr, ecode+3, offset_top, md)) FAIL;
3216 ecode += (ecode[1] << 8) + ecode[2];
3217 }
3218 while (*ecode == OP_ALT);
3219 ecode += 3;
3220 continue;
3221
Guido van Rossum50700601997-12-08 17:15:20 +00003222 /* "Once" brackets are like assertion brackets except that after a match,
3223 the point in the subject string is not moved back. Thus there can never be
3224 a move back into the brackets. Check the alternative branches in turn - the
3225 matching won't pass the KET for this kind of subpattern. If any one branch
3226 matches, we carry on, leaving the subject pointer. */
3227
3228 case OP_ONCE:
3229 do
3230 {
3231 if (match(eptr, ecode+3, offset_top, md)) break;
3232 ecode += (ecode[1] << 8) + ecode[2];
3233 }
3234 while (*ecode == OP_ALT);
Guido van Rossum557dea11997-12-22 22:46:52 +00003235 if (*ecode == OP_KET) FAIL;
Guido van Rossum50700601997-12-08 17:15:20 +00003236
3237 /* Continue as from after the assertion, updating the offsets high water
3238 mark, since extracts may have been taken. */
3239
3240 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3241 ecode += 3;
3242 offset_top = md->end_offset_top;
3243 eptr = md->end_match_ptr;
3244 continue;
3245
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003246 /* An alternation is the end of a branch; scan along to find the end of the
3247 bracketed group and go to there. */
3248
3249 case OP_ALT:
3250 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3251 break;
3252
3253 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
3254 that it may occur zero times. It may repeat infinitely, or not at all -
3255 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
3256 repeat limits are compiled as a number of copies, with the optional ones
3257 preceded by BRAZERO or BRAMINZERO. */
3258
3259 case OP_BRAZERO:
3260 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003261 const uschar *next = ecode+1;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003262 if (match(eptr, next, offset_top, md)) SUCCEED;
3263 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3264 ecode = next + 3;
3265 }
3266 break;
3267
3268 case OP_BRAMINZERO:
3269 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003270 const uschar *next = ecode+1;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003271 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3272 if (match(eptr, next+3, offset_top, md)) SUCCEED;
3273 ecode++;
3274 }
3275 break;;
3276
3277 /* End of a group, repeated or non-repeating. If we are at the end of
3278 an assertion "group", stop matching and SUCCEED, but record the
3279 current high water mark for use by positive assertions. */
3280
3281 case OP_KET:
3282 case OP_KETRMIN:
3283 case OP_KETRMAX:
3284 {
Guido van Rossum50700601997-12-08 17:15:20 +00003285 int number;
Guido van Rossum58132c61997-12-17 00:24:13 +00003286 const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003287
Guido van Rossum50700601997-12-08 17:15:20 +00003288 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT || *prev == OP_ONCE)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003289 {
Guido van Rossum50700601997-12-08 17:15:20 +00003290 md->end_match_ptr = eptr; /* For ONCE */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003291 md->end_offset_top = offset_top;
3292 SUCCEED;
3293 }
3294
3295 /* In all other cases we have to check the group number back at the
3296 start and if necessary complete handling an extraction by setting the
3297 final offset and bumping the high water mark. */
3298
3299 number = (*prev - OP_BRA) << 1;
3300
Guido van Rossum557dea11997-12-22 22:46:52 +00003301 DPRINTF(("end bracket %d\n", number/2));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003302
3303 if (number > 0)
3304 {
3305 if (number >= md->offset_end) md->offset_overflow = TRUE; else
3306 {
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003307 md->offset_vector[number+1] = eptr - md->start_subject;
3308 if (offset_top <= number) offset_top = number + 2;
3309 }
3310 }
3311
3312 /* For a non-repeating ket, just advance to the next node and continue at
3313 this level. */
3314
3315 if (*ecode == OP_KET)
3316 {
3317 ecode += 3;
3318 break;
3319 }
3320
3321 /* The repeating kets try the rest of the pattern or restart from the
3322 preceding bracket, in the appropriate order. */
3323
3324 if (*ecode == OP_KETRMIN)
3325 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003326 const uschar *ptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003327 if (match(eptr, ecode+3, offset_top, md)) goto succeed;
3328 /* Handle alternation inside the BRA...KET; push the additional
Guido van Rossum58132c61997-12-17 00:24:13 +00003329 alternatives onto the stack */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003330 ptr=prev;
3331 do {
3332 ptr += (ptr[1]<<8)+ ptr[2];
3333 if (*ptr==OP_ALT)
3334 {
Guido van Rossum50700601997-12-08 17:15:20 +00003335 if (md->length == md->point)
3336 {
3337 grow_stack(md);
3338 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003339 md->offset_top[md->point] = offset_top;
3340 md->eptr[md->point] = eptr;
3341 md->ecode[md->point] = ptr+3;
3342 md->r1[md->point] = 0;
3343 md->r2[md->point] = 0;
3344 md->off_num[md->point] = 0;
3345 md->point++;
3346 }
3347 } while (*ptr==OP_ALT);
3348 ecode=prev+3; goto match_loop;
3349 }
3350 else /* OP_KETRMAX */
3351 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003352 const uschar *ptr;
3353 /*int points_pushed=0;*/
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003354
3355 /* Push one failure point, that will resume matching at the code after
3356 the KETRMAX opcode. */
Guido van Rossum50700601997-12-08 17:15:20 +00003357 if (md->length == md->point)
3358 {
3359 grow_stack(md);
3360 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003361 md->offset_top[md->point] = offset_top;
3362 md->eptr[md->point] = eptr;
3363 md->ecode[md->point] = ecode+3;
3364 md->r1[md->point] = md->offset_vector[number];
3365 md->r2[md->point] = md->offset_vector[number+1];
3366 md->off_num[md->point] = number;
3367 md->point++;
3368
3369 md->offset_vector[number] = eptr - md->start_subject;
3370 /* Handle alternation inside the BRA...KET; push each of the
Guido van Rossum58132c61997-12-17 00:24:13 +00003371 additional alternatives onto the stack */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003372 ptr=prev;
3373 do {
3374 ptr += (ptr[1]<<8)+ ptr[2];
3375 if (*ptr==OP_ALT)
3376 {
Guido van Rossum50700601997-12-08 17:15:20 +00003377 if (md->length == md->point)
3378 if (md->length == md->point)
3379 {
3380 grow_stack(md);
3381 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003382 md->offset_top[md->point] = offset_top;
3383 md->eptr[md->point] = eptr;
3384 md->ecode[md->point] = ptr+3;
3385 md->r1[md->point] = 0;
3386 md->r2[md->point] = 0;
3387 md->off_num[md->point] = 0;
3388 md->point++;
Guido van Rossum58132c61997-12-17 00:24:13 +00003389 /*points_pushed++;*/
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003390 }
3391 } while (*ptr==OP_ALT);
3392 /* Jump to the first (or only) alternative and resume trying to match */
3393 ecode=prev+3; goto match_loop;
3394 }
3395 }
Guido van Rossum58132c61997-12-17 00:24:13 +00003396
Guido van Rossum50700601997-12-08 17:15:20 +00003397 /* Start of subject unless notbol, or after internal newline if multiline */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003398
3399 case OP_CIRC:
Guido van Rossum50700601997-12-08 17:15:20 +00003400 if (md->notbol && eptr == md->start_subject) FAIL;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003401 if (md->multiline)
3402 {
3403 if (eptr != md->start_subject && eptr[-1] != '\n') FAIL;
3404 ecode++;
3405 break;
3406 }
3407 /* ... else fall through */
3408
3409 /* Start of subject assertion */
3410
3411 case OP_SOD:
3412 if (eptr != md->start_subject) FAIL;
3413 ecode++;
3414 break;
3415
Guido van Rossum50700601997-12-08 17:15:20 +00003416 /* Assert before internal newline if multiline, or before
3417 a terminating newline unless endonly is set, else end of subject unless
3418 noteol is set. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003419
3420 case OP_DOLL:
Guido van Rossum50700601997-12-08 17:15:20 +00003421 if (md->noteol && eptr >= md->end_subject) FAIL;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003422 if (md->multiline)
3423 {
3424 if (eptr < md->end_subject && *eptr != '\n') FAIL;
3425 ecode++;
3426 break;
3427 }
Guido van Rossum50700601997-12-08 17:15:20 +00003428 else if (!md->endonly)
3429 {
3430 if (eptr < md->end_subject - 1 ||
3431 (eptr == md->end_subject - 1 && *eptr != '\n')) FAIL;
3432 ecode++;
3433 break;
3434 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003435 /* ... else fall through */
3436
3437 /* End of subject assertion */
3438
3439 case OP_EOD:
3440 if (eptr < md->end_subject) FAIL;
3441 ecode++;
3442 break;
3443
3444 /* Word boundary assertions */
3445
3446 case OP_NOT_WORD_BOUNDARY:
3447 case OP_WORD_BOUNDARY:
3448 {
3449 BOOL prev_is_word = (eptr != md->start_subject) &&
3450 ((pcre_ctypes[eptr[-1]] & ctype_word) != 0);
3451 BOOL cur_is_word = (eptr < md->end_subject) &&
3452 ((pcre_ctypes[*eptr] & ctype_word) != 0);
3453 if ((*ecode++ == OP_WORD_BOUNDARY)?
3454 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3455 FAIL;
3456 }
3457 break;
3458
Guido van Rossum50700601997-12-08 17:15:20 +00003459 case OP_NOT_WORD_BOUNDARY_L:
3460 case OP_WORD_BOUNDARY_L:
3461 {
3462 BOOL prev_is_word = (eptr != md->start_subject) &&
Guido van Rossum58132c61997-12-17 00:24:13 +00003463 (isalnum(eptr[-1]) || eptr[-1]=='_');
Guido van Rossum50700601997-12-08 17:15:20 +00003464 BOOL cur_is_word = (eptr < md->end_subject) &&
Guido van Rossum58132c61997-12-17 00:24:13 +00003465 (isalnum(*eptr) || *eptr=='_');
Guido van Rossum50700601997-12-08 17:15:20 +00003466 if ((*ecode++ == OP_WORD_BOUNDARY_L)?
3467 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3468 FAIL;
3469 }
3470 break;
3471
3472
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003473 /* Match a single character type; inline for speed */
3474
3475 case OP_ANY:
3476 if (!md->dotall && eptr < md->end_subject && *eptr == '\n') FAIL;
3477 if (eptr++ >= md->end_subject) FAIL;
3478 ecode++;
3479 break;
3480
3481 case OP_NOT_DIGIT:
3482 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) != 0)
3483 FAIL;
3484 ecode++;
3485 break;
3486
3487 case OP_DIGIT:
3488 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) == 0)
3489 FAIL;
3490 ecode++;
3491 break;
3492
3493 case OP_NOT_WHITESPACE:
3494 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) != 0)
3495 FAIL;
3496 ecode++;
3497 break;
3498
3499 case OP_WHITESPACE:
3500 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) == 0)
3501 FAIL;
3502 ecode++;
3503 break;
3504
3505 case OP_NOT_WORDCHAR:
3506 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) != 0)
3507 FAIL;
3508 ecode++;
3509 break;
3510
3511 case OP_WORDCHAR:
3512 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) == 0)
3513 FAIL;
3514 ecode++;
3515 break;
3516
Guido van Rossum50700601997-12-08 17:15:20 +00003517 case OP_NOT_WORDCHAR_L:
Guido van Rossum58132c61997-12-17 00:24:13 +00003518 if (eptr >= md->end_subject || (*eptr=='_' || isalnum(*eptr) ))
Guido van Rossum557dea11997-12-22 22:46:52 +00003519 FAIL;
Guido van Rossum50700601997-12-08 17:15:20 +00003520 eptr++;
3521 ecode++;
3522 break;
3523
3524 case OP_WORDCHAR_L:
Guido van Rossum58132c61997-12-17 00:24:13 +00003525 if (eptr >= md->end_subject || (*eptr!='_' && !isalnum(*eptr) ))
Guido van Rossum557dea11997-12-22 22:46:52 +00003526 FAIL;
Guido van Rossum50700601997-12-08 17:15:20 +00003527 eptr++;
3528 ecode++;
3529 break;
3530
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003531 /* Match a back reference, possibly repeatedly. Look past the end of the
3532 item to see if there is repeat information following. The code is similar
3533 to that for character classes, but repeated for efficiency. Then obey
3534 similar code to character type repeats - written out again for speed.
3535 However, if the referenced string is the empty string, always treat
3536 it as matched, any number of times (otherwise there could be infinite
3537 loops). */
3538
3539 case OP_REF:
3540 {
3541 int length;
3542 int number = ecode[1] << 1; /* Doubled reference number */
3543 ecode += 2; /* Advance past the item */
3544
3545 if (number >= offset_top || md->offset_vector[number] < 0)
3546 {
3547 md->errorcode = PCRE_ERROR_BADREF;
3548 FAIL;
3549 }
3550
3551 length = md->offset_vector[number+1] - md->offset_vector[number];
3552
3553 switch (*ecode)
3554 {
3555 case OP_CRSTAR:
3556 case OP_CRMINSTAR:
3557 case OP_CRPLUS:
3558 case OP_CRMINPLUS:
3559 case OP_CRQUERY:
3560 case OP_CRMINQUERY:
3561 c = *ecode++ - OP_CRSTAR;
3562 minimize = (c & 1) != 0;
3563 min = rep_min[c]; /* Pick up values from tables; */
3564 max = rep_max[c]; /* zero for max => infinity */
3565 if (max == 0) max = INT_MAX;
3566 break;
3567
3568 case OP_CRRANGE:
3569 case OP_CRMINRANGE:
3570 minimize = (*ecode == OP_CRMINRANGE);
3571 min = (ecode[1] << 8) + ecode[2];
3572 max = (ecode[3] << 8) + ecode[4];
3573 if (max == 0) max = INT_MAX;
3574 ecode += 5;
3575 break;
3576
3577 default: /* No repeat follows */
3578 if (!match_ref(number, eptr, length, md)) FAIL;
3579 eptr += length;
3580 continue; /* With the main loop */
3581 }
3582
3583 /* If the length of the reference is zero, just continue with the
3584 main loop. */
3585
3586 if (length == 0) continue;
3587
3588 /* First, ensure the minimum number of matches are present. We get back
3589 the length of the reference string explicitly rather than passing the
3590 address of eptr, so that eptr can be a register variable. */
3591
3592 for (i = 1; i <= min; i++)
3593 {
3594 if (!match_ref(number, eptr, length, md)) FAIL;
3595 eptr += length;
3596 }
3597
3598 /* If min = max, continue at the same level without recursion.
3599 They are not both allowed to be zero. */
3600
3601 if (min == max) continue;
3602
3603 /* If minimizing, keep trying and advancing the pointer */
3604
3605 if (minimize)
3606 {
3607 for (i = min;; i++)
3608 {
3609 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3610 if (i >= max || !match_ref(number, eptr, length, md))
3611 FAIL;
3612 eptr += length;
3613 }
3614 /* Control never gets here */
3615 }
3616
3617 /* If maximizing, find the longest string and work backwards */
3618
3619 else
3620 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003621 const uschar *pp = eptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003622 for (i = min; i < max; i++)
3623 {
3624 if (!match_ref(number, eptr, length, md)) break;
3625 eptr += length;
3626 }
3627 while (eptr >= pp)
3628 {
3629 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3630 eptr -= length;
3631 }
3632 FAIL;
3633 }
3634 }
3635 /* Control never gets here */
3636
3637 /* Match a character class, possibly repeatedly. Look past the end of the
3638 item to see if there is repeat information following. Then obey similar
Guido van Rossum50700601997-12-08 17:15:20 +00003639 code to character type repeats - written out again for speed. If caseless
3640 matching was set at runtime but not at compile time, we have to check both
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003641 versions of a character, and we have to behave differently for positive and
3642 negative classes. This is the only time where OP_CLASS and OP_NEGCLASS are
3643 treated differently. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003644
3645 case OP_CLASS:
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003646 case OP_NEGCLASS:
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003647 {
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003648 BOOL nasty_case = *ecode == OP_NEGCLASS && md->runtime_caseless;
Guido van Rossum58132c61997-12-17 00:24:13 +00003649 const uschar *data = ecode + 1; /* Save for matching */
3650 ecode += 33; /* Advance past the item */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003651
3652 switch (*ecode)
3653 {
3654 case OP_CRSTAR:
3655 case OP_CRMINSTAR:
3656 case OP_CRPLUS:
3657 case OP_CRMINPLUS:
3658 case OP_CRQUERY:
3659 case OP_CRMINQUERY:
3660 c = *ecode++ - OP_CRSTAR;
3661 minimize = (c & 1) != 0;
3662 min = rep_min[c]; /* Pick up values from tables; */
3663 max = rep_max[c]; /* zero for max => infinity */
3664 if (max == 0) max = INT_MAX;
3665 break;
3666
3667 case OP_CRRANGE:
3668 case OP_CRMINRANGE:
3669 minimize = (*ecode == OP_CRMINRANGE);
3670 min = (ecode[1] << 8) + ecode[2];
3671 max = (ecode[3] << 8) + ecode[4];
3672 if (max == 0) max = INT_MAX;
3673 ecode += 5;
3674 break;
3675
3676 default: /* No repeat follows */
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003677 min = max = 1;
3678 break;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003679 }
3680
3681 /* First, ensure the minimum number of matches are present. */
3682
3683 for (i = 1; i <= min; i++)
Guido van Rossum50700601997-12-08 17:15:20 +00003684 {
3685 if (eptr >= md->end_subject) FAIL;
3686 c = *eptr++;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003687
3688 /* Either not runtime caseless, or it was a positive class. For
3689 runtime caseless, continue if either case is in the map. */
3690
3691 if (!nasty_case)
Guido van Rossum50700601997-12-08 17:15:20 +00003692 {
Guido van Rossum50700601997-12-08 17:15:20 +00003693 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003694 if (md->runtime_caseless)
3695 {
3696 c = pcre_fcc[c];
3697 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3698 }
Guido van Rossum50700601997-12-08 17:15:20 +00003699 }
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003700
3701 /* Runtime caseless and it was a negative class. Continue only if
3702 both cases are in the map. */
3703
3704 else
3705 {
3706 if ((data[c/8] & (1 << (c&7))) == 0) FAIL;
3707 c = pcre_fcc[c];
3708 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3709 }
3710
3711 FAIL;
Guido van Rossum50700601997-12-08 17:15:20 +00003712 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003713
3714 /* If max == min we can continue with the main loop without the
3715 need to recurse. */
3716
3717 if (min == max) continue;
3718
3719 /* If minimizing, keep testing the rest of the expression and advancing
3720 the pointer while it matches the class. */
3721
3722 if (minimize)
3723 {
3724 for (i = min;; i++)
3725 {
3726 if (match(eptr, ecode, offset_top, md)) SUCCEED;
Guido van Rossum50700601997-12-08 17:15:20 +00003727 if (i >= max || eptr >= md->end_subject) FAIL;
3728 c = *eptr++;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003729
3730 /* Either not runtime caseless, or it was a positive class. For
3731 runtime caseless, continue if either case is in the map. */
3732
3733 if (!nasty_case)
Guido van Rossum50700601997-12-08 17:15:20 +00003734 {
Guido van Rossum50700601997-12-08 17:15:20 +00003735 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003736 if (md->runtime_caseless)
3737 {
3738 c = pcre_fcc[c];
3739 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3740 }
Guido van Rossum50700601997-12-08 17:15:20 +00003741 }
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003742
3743 /* Runtime caseless and it was a negative class. Continue only if
3744 both cases are in the map. */
3745
3746 else
3747 {
3748 if ((data[c/8] & (1 << (c&7))) == 0) return FALSE;
3749 c = pcre_fcc[c];
3750 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3751 }
3752
Guido van Rossum50700601997-12-08 17:15:20 +00003753 FAIL;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003754 }
3755 /* Control never gets here */
3756 }
3757
3758 /* If maximizing, find the longest possible run, then work backwards. */
3759
3760 else
3761 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003762 const uschar *pp = eptr;
Guido van Rossum50700601997-12-08 17:15:20 +00003763 for (i = min; i < max; eptr++, i++)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003764 {
Guido van Rossum50700601997-12-08 17:15:20 +00003765 if (eptr >= md->end_subject) break;
3766 c = *eptr;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003767
3768 /* Either not runtime caseless, or it was a positive class. For
3769 runtime caseless, continue if either case is in the map. */
3770
3771 if (!nasty_case)
Guido van Rossum50700601997-12-08 17:15:20 +00003772 {
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003773 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3774 if (md->runtime_caseless)
3775 {
3776 c = pcre_fcc[c];
3777 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3778 }
3779 }
3780
3781 /* Runtime caseless and it was a negative class. Continue only if
3782 both cases are in the map. */
3783
3784 else
3785 {
3786 if ((data[c/8] & (1 << (c&7))) == 0) break;
Guido van Rossum50700601997-12-08 17:15:20 +00003787 c = pcre_fcc[c];
3788 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3789 }
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003790
Guido van Rossum50700601997-12-08 17:15:20 +00003791 break;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003792 }
Guido van Rossum50700601997-12-08 17:15:20 +00003793
3794 while (eptr >= pp)
3795 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
3796 FAIL;
3797 }
3798 }
3799 /* Control never gets here */
3800
3801 /* OP_CLASS_L opcode: handles localized character classes */
3802
3803 case OP_CLASS_L:
3804 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003805 const uschar *data = ecode + 1; /* Save for matching */
3806 const uschar locale_flag = *data;
Guido van Rossum50700601997-12-08 17:15:20 +00003807 ecode++; data++; /* The localization support adds an extra byte */
3808
3809 ecode += 33; /* Advance past the item */
3810
3811 switch (*ecode)
3812 {
3813 case OP_CRSTAR:
3814 case OP_CRMINSTAR:
3815 case OP_CRPLUS:
3816 case OP_CRMINPLUS:
3817 case OP_CRQUERY:
3818 case OP_CRMINQUERY:
3819 c = *ecode++ - OP_CRSTAR;
3820 minimize = (c & 1) != 0;
3821 min = rep_min[c]; /* Pick up values from tables; */
3822 max = rep_max[c]; /* zero for max => infinity */
3823 if (max == 0) max = INT_MAX;
3824 break;
3825
3826 case OP_CRRANGE:
3827 case OP_CRMINRANGE:
3828 minimize = (*ecode == OP_CRMINRANGE);
3829 min = (ecode[1] << 8) + ecode[2];
3830 max = (ecode[3] << 8) + ecode[4];
3831 if (max == 0) max = INT_MAX;
3832 ecode += 5;
3833 break;
3834
3835 default: /* No repeat follows */
3836 if (eptr >= md->end_subject) FAIL;
3837 c = *eptr++;
3838 if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */
Guido van Rossum58132c61997-12-17 00:24:13 +00003839 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3840 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003841#if 0
3842 if ( (locale_flag & 4) && isdigit(c) ) continue; /* Locale \d */
3843 if ( (locale_flag & 8) && !isdigit(c) ) continue; /* Locale \D */
3844 if ( (locale_flag & 16) && isspace(c) ) continue; /* Locale \s */
3845 if ( (locale_flag & 32) && !isspace(c) ) continue; /* Locale \S */
3846#endif
3847
3848 if (md->runtime_caseless)
3849 {
3850 c = pcre_fcc[c];
3851 if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */
3852
Guido van Rossum58132c61997-12-17 00:24:13 +00003853 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3854 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003855 }
3856 FAIL;
3857 }
3858
3859 /* First, ensure the minimum number of matches are present. */
3860
3861 for (i = 1; i <= min; i++)
3862 {
3863 if (eptr >= md->end_subject) FAIL;
3864 c = *eptr++;
3865 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum58132c61997-12-17 00:24:13 +00003866 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3867 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003868
3869 if (md->runtime_caseless)
3870 {
3871 c = pcre_fcc[c];
3872 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum58132c61997-12-17 00:24:13 +00003873 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3874 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003875 }
3876 FAIL;
3877 }
3878
3879 /* If max == min we can continue with the main loop without the
3880 need to recurse. */
3881
3882 if (min == max) continue;
3883
3884 /* If minimizing, keep testing the rest of the expression and advancing
3885 the pointer while it matches the class. */
3886
3887 if (minimize)
3888 {
3889 for (i = min;; i++)
3890 {
3891 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3892 if (i >= max || eptr >= md->end_subject) FAIL;
3893 c = *eptr++;
3894 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum58132c61997-12-17 00:24:13 +00003895 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3896 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003897
3898 if (md->runtime_caseless)
3899 {
3900 c = pcre_fcc[c];
3901 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum58132c61997-12-17 00:24:13 +00003902 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3903 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003904 }
3905 FAIL;
3906 }
3907 /* Control never gets here */
3908 }
3909
3910 /* If maximizing, find the longest possible run, then work backwards. */
3911
3912 else
3913 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003914 const uschar *pp = eptr;
Guido van Rossum50700601997-12-08 17:15:20 +00003915 for (i = min; i < max; eptr++, i++)
3916 {
3917 if (eptr >= md->end_subject) break;
3918 c = *eptr;
3919 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum58132c61997-12-17 00:24:13 +00003920 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3921 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003922 if (md->runtime_caseless)
3923 {
3924 c = pcre_fcc[c];
3925 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum58132c61997-12-17 00:24:13 +00003926 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3927 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003928 }
3929 break;
3930 }
3931
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003932 while (eptr >= pp)
3933 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
3934 FAIL;
3935 }
3936 }
3937 /* Control never gets here */
3938
3939 /* Match a run of characters */
3940
3941 case OP_CHARS:
3942 {
3943 register int length = ecode[1];
3944 ecode += 2;
3945
Guido van Rossum557dea11997-12-22 22:46:52 +00003946#ifdef DEBUG /* Sigh. Some compilers never learn. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003947 if (eptr >= md->end_subject)
3948 printf("matching subject <null> against pattern ");
3949 else
3950 {
3951 printf("matching subject ");
3952 pchars(eptr, length, TRUE, md);
3953 printf(" against pattern ");
3954 }
3955 pchars(ecode, length, FALSE, md);
3956 printf("\n");
Guido van Rossum57ba4f31997-12-02 20:40:28 +00003957#endif
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003958
3959 if (length > md->end_subject - eptr) FAIL;
3960 if (md->caseless)
3961 {
3962 while (length-- > 0) if (pcre_lcc[*ecode++] != pcre_lcc[*eptr++]) FAIL;
3963 }
3964 else
3965 {
3966 while (length-- > 0) if (*ecode++ != *eptr++) FAIL;
3967 }
3968 }
3969 break;
3970
3971 /* Match a single character repeatedly; different opcodes share code. */
3972
3973 case OP_EXACT:
3974 min = max = (ecode[1] << 8) + ecode[2];
3975 ecode += 3;
3976 goto REPEATCHAR;
3977
3978 case OP_UPTO:
3979 case OP_MINUPTO:
3980 min = 0;
3981 max = (ecode[1] << 8) + ecode[2];
3982 minimize = *ecode == OP_MINUPTO;
3983 ecode += 3;
3984 goto REPEATCHAR;
3985
3986 case OP_STAR:
3987 case OP_MINSTAR:
3988 case OP_PLUS:
3989 case OP_MINPLUS:
3990 case OP_QUERY:
3991 case OP_MINQUERY:
3992 c = *ecode++ - OP_STAR;
3993 minimize = (c & 1) != 0;
3994 min = rep_min[c]; /* Pick up values from tables; */
3995 max = rep_max[c]; /* zero for max => infinity */
3996 if (max == 0) max = INT_MAX;
3997
3998 /* Common code for all repeated single-character matches. We can give
3999 up quickly if there are fewer than the minimum number of characters left in
4000 the subject. */
4001
4002 REPEATCHAR:
4003 if (min > md->end_subject - eptr) FAIL;
4004 c = *ecode++;
4005
4006 /* The code is duplicated for the caseless and caseful cases, for speed,
4007 since matching characters is likely to be quite common. First, ensure the
4008 minimum number of matches are present. If min = max, continue at the same
4009 level without recursing. Otherwise, if minimizing, keep trying the rest of
4010 the expression and advancing one matching character if failing, up to the
4011 maximum. Alternatively, if maximizing, find the maximum number of
4012 characters and work backwards. */
4013
Guido van Rossum557dea11997-12-22 22:46:52 +00004014 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
4015 max, eptr));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004016
4017 if (md->caseless)
4018 {
4019 c = pcre_lcc[c];
4020 for (i = 1; i <= min; i++) if (c != pcre_lcc[*eptr++]) FAIL;
4021 if (min == max) continue;
4022 if (minimize)
4023 {
4024 for (i = min;; i++)
4025 {
4026 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4027 if (i >= max || eptr >= md->end_subject || c != pcre_lcc[*eptr++])
4028 FAIL;
4029 }
4030 /* Control never gets here */
4031 }
4032 else
4033 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004034 const uschar *pp = eptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004035 for (i = min; i < max; i++)
4036 {
4037 if (eptr >= md->end_subject || c != pcre_lcc[*eptr]) break;
4038 eptr++;
4039 }
4040 while (eptr >= pp)
4041 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4042 FAIL;
4043 }
Guido van Rossum50700601997-12-08 17:15:20 +00004044 /* Control never gets here */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004045 }
4046
4047 /* Caseful comparisons */
4048
4049 else
4050 {
4051 for (i = 1; i <= min; i++) if (c != *eptr++) FAIL;
4052 if (min == max) continue;
4053 if (minimize)
4054 {
4055 for (i = min;; i++)
4056 {
4057 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4058 if (i >= max || eptr >= md->end_subject || c != *eptr++) FAIL;
4059 }
4060 /* Control never gets here */
4061 }
4062 else
4063 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004064 const uschar *pp = eptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004065 for (i = min; i < max; i++)
4066 {
4067 if (eptr >= md->end_subject || c != *eptr) break;
4068 eptr++;
4069 }
4070 while (eptr >= pp)
4071 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4072 FAIL;
4073 }
4074 }
4075 /* Control never gets here */
4076
Guido van Rossum50700601997-12-08 17:15:20 +00004077 /* Match a negated single character */
4078
4079 case OP_NOT:
Guido van Rossum557dea11997-12-22 22:46:52 +00004080 if (eptr >= md->end_subject) FAIL;
Guido van Rossum50700601997-12-08 17:15:20 +00004081 ecode++;
4082 if (md->caseless)
4083 {
4084 if (pcre_lcc[*ecode++] == pcre_lcc[*eptr++]) FAIL;
4085 }
4086 else
4087 {
4088 if (*ecode++ == *eptr++) FAIL;
4089 }
4090 break;
4091
4092 /* Match a negated single character repeatedly. This is almost a repeat of
4093 the code for a repeated single character, but I haven't found a nice way of
4094 commoning these up that doesn't require a test of the positive/negative
4095 option for each character match. Maybe that wouldn't add very much to the
4096 time taken, but character matching *is* what this is all about... */
4097
4098 case OP_NOTEXACT:
4099 min = max = (ecode[1] << 8) + ecode[2];
4100 ecode += 3;
4101 goto REPEATNOTCHAR;
4102
4103 case OP_NOTUPTO:
4104 case OP_NOTMINUPTO:
4105 min = 0;
4106 max = (ecode[1] << 8) + ecode[2];
4107 minimize = *ecode == OP_NOTMINUPTO;
4108 ecode += 3;
4109 goto REPEATNOTCHAR;
4110
4111 case OP_NOTSTAR:
4112 case OP_NOTMINSTAR:
4113 case OP_NOTPLUS:
4114 case OP_NOTMINPLUS:
4115 case OP_NOTQUERY:
4116 case OP_NOTMINQUERY:
4117 c = *ecode++ - OP_NOTSTAR;
4118 minimize = (c & 1) != 0;
4119 min = rep_min[c]; /* Pick up values from tables; */
4120 max = rep_max[c]; /* zero for max => infinity */
4121 if (max == 0) max = INT_MAX;
4122
4123 /* Common code for all repeated single-character matches. We can give
4124 up quickly if there are fewer than the minimum number of characters left in
4125 the subject. */
4126
4127 REPEATNOTCHAR:
4128 if (min > md->end_subject - eptr) FAIL;
4129 c = *ecode++;
4130
4131 /* The code is duplicated for the caseless and caseful cases, for speed,
4132 since matching characters is likely to be quite common. First, ensure the
4133 minimum number of matches are present. If min = max, continue at the same
4134 level without recursing. Otherwise, if minimizing, keep trying the rest of
4135 the expression and advancing one matching character if failing, up to the
4136 maximum. Alternatively, if maximizing, find the maximum number of
4137 characters and work backwards. */
4138
Guido van Rossum557dea11997-12-22 22:46:52 +00004139 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
4140 max, eptr));
Guido van Rossum50700601997-12-08 17:15:20 +00004141
4142 if (md->caseless)
4143 {
4144 c = pcre_lcc[c];
4145 for (i = 1; i <= min; i++) if (c == pcre_lcc[*eptr++]) FAIL;
4146 if (min == max) continue;
4147 if (minimize)
4148 {
4149 for (i = min;; i++)
4150 {
4151 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4152 if (i >= max || eptr >= md->end_subject || c == pcre_lcc[*eptr++])
4153 FAIL;
4154 }
4155 /* Control never gets here */
4156 }
4157 else
4158 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004159 const uschar *pp = eptr;
Guido van Rossum50700601997-12-08 17:15:20 +00004160 for (i = min; i < max; i++)
4161 {
4162 if (eptr >= md->end_subject || c == pcre_lcc[*eptr]) break;
4163 eptr++;
4164 }
4165 while (eptr >= pp)
4166 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4167 FAIL;
4168 }
4169 /* Control never gets here */
4170 }
4171
4172 /* Caseful comparisons */
4173
4174 else
4175 {
4176 for (i = 1; i <= min; i++) if (c == *eptr++) FAIL;
4177 if (min == max) continue;
4178 if (minimize)
4179 {
4180 for (i = min;; i++)
4181 {
4182 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4183 if (i >= max || eptr >= md->end_subject || c == *eptr++) FAIL;
4184 }
4185 /* Control never gets here */
4186 }
4187 else
4188 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004189 const uschar *pp = eptr;
Guido van Rossum50700601997-12-08 17:15:20 +00004190 for (i = min; i < max; i++)
4191 {
4192 if (eptr >= md->end_subject || c == *eptr) break;
4193 eptr++;
4194 }
4195 while (eptr >= pp)
4196 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4197 FAIL;
4198 }
4199 }
4200 /* Control never gets here */
4201
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004202 /* Match a single character type repeatedly; several different opcodes
4203 share code. This is very similar to the code for single characters, but we
4204 repeat it in the interests of efficiency. */
4205
4206 case OP_TYPEEXACT:
4207 min = max = (ecode[1] << 8) + ecode[2];
4208 minimize = TRUE;
4209 ecode += 3;
4210 goto REPEATTYPE;
4211
4212 case OP_TYPEUPTO:
4213 case OP_TYPEMINUPTO:
4214 min = 0;
4215 max = (ecode[1] << 8) + ecode[2];
4216 minimize = *ecode == OP_TYPEMINUPTO;
4217 ecode += 3;
4218 goto REPEATTYPE;
4219
4220 case OP_TYPESTAR:
4221 case OP_TYPEMINSTAR:
4222 case OP_TYPEPLUS:
4223 case OP_TYPEMINPLUS:
4224 case OP_TYPEQUERY:
4225 case OP_TYPEMINQUERY:
4226 c = *ecode++ - OP_TYPESTAR;
4227 minimize = (c & 1) != 0;
4228 min = rep_min[c]; /* Pick up values from tables; */
4229 max = rep_max[c]; /* zero for max => infinity */
4230 if (max == 0) max = INT_MAX;
4231
4232 /* Common code for all repeated single character type matches */
4233
4234 REPEATTYPE:
4235 ctype = *ecode++; /* Code for the character type */
4236
4237 /* First, ensure the minimum number of matches are present. Use inline
4238 code for maximizing the speed, and do the type test once at the start
4239 (i.e. keep it out of the loop). Also test that there are at least the
4240 minimum number of characters before we start. */
4241
4242 if (min > md->end_subject - eptr) FAIL;
4243 if (min > 0) switch(ctype)
4244 {
4245 case OP_ANY:
4246 if (!md->dotall)
4247 { for (i = 1; i <= min; i++) if (*eptr++ == '\n') FAIL; }
4248 else eptr += min;
4249 break;
4250
4251 case OP_NOT_DIGIT:
4252 for (i = 1; i <= min; i++)
4253 if ((pcre_ctypes[*eptr++] & ctype_digit) != 0) FAIL;
4254 break;
4255
4256 case OP_DIGIT:
4257 for (i = 1; i <= min; i++)
4258 if ((pcre_ctypes[*eptr++] & ctype_digit) == 0) FAIL;
4259 break;
4260
4261 case OP_NOT_WHITESPACE:
4262 for (i = 1; i <= min; i++)
4263 if ((pcre_ctypes[*eptr++] & ctype_space) != 0) FAIL;
4264 break;
4265
4266 case OP_WHITESPACE:
4267 for (i = 1; i <= min; i++)
4268 if ((pcre_ctypes[*eptr++] & ctype_space) == 0) FAIL;
4269 break;
4270
4271 case OP_NOT_WORDCHAR:
4272 for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) != 0)
4273 FAIL;
4274 break;
4275
4276 case OP_WORDCHAR:
4277 for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) == 0)
4278 FAIL;
4279 break;
Guido van Rossum50700601997-12-08 17:15:20 +00004280
4281 case OP_NOT_WORDCHAR_L:
Guido van Rossum58132c61997-12-17 00:24:13 +00004282 for (i = 1; i <= min; i++, eptr++) if (*eptr=='_' || isalnum(*eptr))
Guido van Rossum557dea11997-12-22 22:46:52 +00004283 FAIL;
Guido van Rossum50700601997-12-08 17:15:20 +00004284 break;
4285
4286 case OP_WORDCHAR_L:
Guido van Rossum58132c61997-12-17 00:24:13 +00004287 for (i = 1; i <= min; i++, eptr++) if (*eptr!='_' && !isalnum(*eptr))
Guido van Rossum557dea11997-12-22 22:46:52 +00004288 FAIL;
Guido van Rossum50700601997-12-08 17:15:20 +00004289 break;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004290 }
4291
4292 /* If min = max, continue at the same level without recursing */
4293
4294 if (min == max) continue;
4295
4296 /* If minimizing, we have to test the rest of the pattern before each
4297 subsequent match, so inlining isn't much help; just use the function. */
4298
4299 if (minimize)
4300 {
4301 for (i = min;; i++)
4302 {
4303 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4304 if (i >= max || eptr >= md->end_subject ||
4305 !match_type(ctype, *eptr++, md->dotall))
4306 FAIL;
4307 }
4308 /* Control never gets here */
4309 }
4310
4311 /* If maximizing it is worth using inline code for speed, doing the type
4312 test once at the start (i.e. keep it out of the loop). */
4313
4314 else
4315 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004316 const uschar *pp = eptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004317 switch(ctype)
4318 {
4319 case OP_ANY:
4320 if (!md->dotall)
4321 {
4322 for (i = min; i < max; i++)
4323 {
4324 if (eptr >= md->end_subject || *eptr == '\n') break;
4325 eptr++;
4326 }
4327 }
4328 else
4329 {
4330 c = max - min;
4331 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
4332 eptr += c;
4333 }
4334 break;
4335
4336 case OP_NOT_DIGIT:
4337 for (i = min; i < max; i++)
4338 {
4339 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) != 0)
4340 break;
4341 eptr++;
4342 }
4343 break;
4344
4345 case OP_DIGIT:
4346 for (i = min; i < max; i++)
4347 {
4348 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) == 0)
4349 break;
4350 eptr++;
4351 }
4352 break;
4353
4354 case OP_NOT_WHITESPACE:
4355 for (i = min; i < max; i++)
4356 {
4357 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) != 0)
4358 break;
4359 eptr++;
4360 }
4361 break;
4362
4363 case OP_WHITESPACE:
4364 for (i = min; i < max; i++)
4365 {
4366 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) == 0)
4367 break;
4368 eptr++;
4369 }
4370 break;
4371
4372 case OP_NOT_WORDCHAR:
4373 for (i = min; i < max; i++)
4374 {
4375 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) != 0)
4376 break;
4377 eptr++;
4378 }
4379 break;
4380
4381 case OP_WORDCHAR:
4382 for (i = min; i < max; i++)
4383 {
Guido van Rossum50700601997-12-08 17:15:20 +00004384 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) == 0)
4385 break;
4386 eptr++;
4387 }
4388 break;
4389 case OP_NOT_WORDCHAR_L:
4390 for (i = min; i < max; i++)
4391 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004392 if (eptr >= md->end_subject || (*eptr=='_' || isalnum(*eptr) ) )
Guido van Rossum50700601997-12-08 17:15:20 +00004393 break;
4394 eptr++;
4395 }
4396 break;
4397
4398 case OP_WORDCHAR_L:
4399 for (i = min; i < max; i++)
4400 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004401 if (eptr >= md->end_subject || (*eptr!='_' && !isalnum(*eptr) ) )
Guido van Rossum50700601997-12-08 17:15:20 +00004402 break;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004403 eptr++;
4404 }
4405 break;
4406 }
4407
4408 while (eptr >= pp)
4409 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4410 FAIL;
4411 }
4412 /* Control never gets here */
4413
4414 /* There's been some horrible disaster. */
4415
4416 default:
Guido van Rossum557dea11997-12-22 22:46:52 +00004417 DPRINTF(("Unknown opcode %d\n", *ecode));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004418 md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
4419 FAIL;
4420 }
4421
4422 /* Do not stick any code in here without much thought; it is assumed
4423 that "continue" in the code above comes out to here to repeat the main
4424 loop. */
4425
4426 } /* End of main loop */
4427/* Control never reaches here */
4428
4429fail:
4430 if (md->point > save_stack_position)
4431 {
4432 /* If there are still points remaining on the stack, pop the next one off */
Guido van Rossumc3861071997-10-08 02:07:40 +00004433 int off_num;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004434
4435 md->point--;
4436 offset_top = md->offset_top[md->point];
4437 eptr = md->eptr[md->point];
4438 ecode = md->ecode[md->point];
4439 off_num = md->off_num[md->point];
4440 md->offset_vector[off_num] = md->r1[md->point];
4441 md->offset_vector[off_num+1] = md->r2[md->point];
4442 goto match_loop;
4443 }
4444 /* Failure, and nothing left on the stack, so end this function call */
4445
4446 /* Restore the top of the stack to where it was before this function
4447 call. This lets us use one stack for everything; recursive calls
4448 can push and pop information, and may increase the stack. When
4449 the call returns, the parent function can resume pushing and
4450 popping wherever it was. */
4451
4452 md->point = save_stack_position;
4453 return FALSE;
4454
4455succeed:
4456 return TRUE;
4457}
4458
4459
Guido van Rossum50700601997-12-08 17:15:20 +00004460
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004461/*************************************************
Guido van Rossum557dea11997-12-22 22:46:52 +00004462* Segregate setjmp() *
4463*************************************************/
4464
4465/* The -Wall option of gcc gives warnings for all local variables when setjmp()
4466is used, even if the coding conforms to the rules of ANSI C. To avoid this, we
4467hide it in a separate function. This is called only when PCRE_EXTRA is set,
4468since it's needed only for the extension \X option, and with any luck, a good
4469compiler will spot the tail recursion and compile it efficiently.
4470
4471Arguments:
4472 eptr pointer in subject
4473 ecode position in code
4474 offset_top current top pointer
4475 md pointer to "static" info for the match
4476
4477Returns: TRUE if matched
4478*/
4479
4480static BOOL
4481match_with_setjmp(const uschar *eptr, const uschar *ecode, int offset_top,
4482 match_data *match_block)
4483{
4484return setjmp(match_block->fail_env) == 0 &&
4485 match(eptr, ecode, offset_top, match_block);
4486}
4487
4488
4489
4490/*************************************************
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004491* Execute a Regular Expression *
4492*************************************************/
4493
4494/* This function applies a compiled re to a subject string and picks out
4495portions of the string if it matches. Two elements in the vector are set for
4496each substring: the offsets to the start and end of the substring.
4497
4498Arguments:
Guido van Rossum50700601997-12-08 17:15:20 +00004499 external_re points to the compiled expression
4500 external_extra points to "hints" from pcre_study() or is NULL
4501 subject points to the subject string
4502 length length of subject string (may contain binary zeros)
4503 options option bits
4504 offsets points to a vector of ints to be filled in with offsets
4505 offsetcount the number of elements in the vector
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004506
Guido van Rossum50700601997-12-08 17:15:20 +00004507Returns: > 0 => success; value is the number of elements filled in
4508 = 0 => success, but offsets is not big enough
4509 -1 => failed to match
4510 < -1 => some kind of unexpected problem
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004511*/
4512
4513int
Guido van Rossum50700601997-12-08 17:15:20 +00004514pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
Guido van Rossum816671c1998-03-10 04:55:29 +00004515 const char *subject, int length, int start_pos, int options,
4516 int *offsets, int offsetcount)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004517{
Guido van Rossum58132c61997-12-17 00:24:13 +00004518 /* The "volatile" directives are to make gcc -Wall stop complaining
4519 that these variables can be clobbered by the longjmp. Hopefully
4520 they won't cost too much performance. */
Guido van Rossum042ff9e1998-04-03 21:13:31 +00004521volatile int resetcount, ocount;
4522volatile int first_char = -1;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004523match_data match_block;
Guido van Rossum557dea11997-12-22 22:46:52 +00004524const uschar *start_bits = NULL;
Guido van Rossum816671c1998-03-10 04:55:29 +00004525const uschar *start_match = (const uschar *)subject + start_pos;
Guido van Rossum58132c61997-12-17 00:24:13 +00004526const uschar *end_subject;
4527const real_pcre *re = (const real_pcre *)external_re;
4528const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00004529volatile BOOL using_temporary_offsets = FALSE;
4530volatile BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4531volatile BOOL startline = (re->options & PCRE_STARTLINE) != 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004532
4533if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4534
4535if (re == NULL || subject == NULL ||
4536 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4537if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
4538
Guido van Rossum58132c61997-12-17 00:24:13 +00004539match_block.start_subject = (const uschar *)subject;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004540match_block.end_subject = match_block.start_subject + length;
4541end_subject = match_block.end_subject;
4542
Guido van Rossum50700601997-12-08 17:15:20 +00004543match_block.caseless = ((re->options | options) & PCRE_CASELESS) != 0;
4544match_block.runtime_caseless = match_block.caseless &&
4545 (re->options & PCRE_CASELESS) == 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004546
Guido van Rossum50700601997-12-08 17:15:20 +00004547match_block.multiline = ((re->options | options) & PCRE_MULTILINE) != 0;
4548match_block.dotall = ((re->options | options) & PCRE_DOTALL) != 0;
4549match_block.endonly = ((re->options | options) & PCRE_DOLLAR_ENDONLY) != 0;
4550
4551match_block.notbol = (options & PCRE_NOTBOL) != 0;
4552match_block.noteol = (options & PCRE_NOTEOL) != 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004553
4554match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
4555
4556/* Set the stack state to empty */
4557 match_block.off_num = match_block.offset_top = NULL;
4558 match_block.r1 = match_block.r2 = NULL;
4559 match_block.eptr = match_block.ecode = NULL;
4560 match_block.point = match_block.length = 0;
4561
Guido van Rossum50700601997-12-08 17:15:20 +00004562/* If the expression has got more back references than the offsets supplied can
4563hold, we get a temporary bit of working store to use during the matching.
Guido van Rossum557dea11997-12-22 22:46:52 +00004564Otherwise, we can use the vector supplied, rounding down its size to a multiple
4565of 2. */
Guido van Rossum50700601997-12-08 17:15:20 +00004566
Guido van Rossum557dea11997-12-22 22:46:52 +00004567ocount = offsetcount & (-2);
4568if (re->top_backref > 0 && re->top_backref >= ocount/2)
Guido van Rossum50700601997-12-08 17:15:20 +00004569 {
4570 ocount = re->top_backref * 2 + 2;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00004571 match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
Guido van Rossum50700601997-12-08 17:15:20 +00004572 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
Guido van Rossum557dea11997-12-22 22:46:52 +00004573 using_temporary_offsets = TRUE;
4574 DPRINTF(("Got memory to hold back references\n"));
Guido van Rossum50700601997-12-08 17:15:20 +00004575 }
4576else match_block.offset_vector = offsets;
4577
4578match_block.offset_end = ocount;
4579match_block.offset_overflow = FALSE;
4580
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004581/* Compute the minimum number of offsets that we need to reset each time. Doing
4582this makes a huge difference to execution time when there aren't many brackets
4583in the pattern. */
4584
4585resetcount = 2 + re->top_bracket * 2;
Guido van Rossum50700601997-12-08 17:15:20 +00004586if (resetcount > offsetcount) resetcount = ocount;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004587
4588/* If MULTILINE is set at exec time but was not set at compile time, and the
4589anchored flag is set, we must re-check because a setting provoked by ^ in the
4590pattern is not right in multi-line mode. Calling is_anchored() again here does
4591the right check, because multiline is now set. If it now yields FALSE, the
4592expression must have had ^ starting some of its branches. Check to see if
4593that is true for *all* branches, and if so, set the startline flag. */
4594
Guido van Rossum557dea11997-12-22 22:46:52 +00004595if (match_block.multiline && anchored && (re->options & PCRE_MULTILINE) == 0 &&
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004596 !is_anchored(re->code, match_block.multiline))
4597 {
4598 anchored = FALSE;
4599 if (is_startline(re->code)) startline = TRUE;
4600 }
4601
4602/* Set up the first character to match, if available. The first_char value is
4603never set for an anchored regular expression, but the anchoring may be forced
4604at run time, so we have to test for anchoring. The first char may be unset for
4605an unanchored pattern, of course. If there's no first char and the pattern was
4606studied, the may be a bitmap of possible first characters. However, we can
4607use this only if the caseless state of the studying was correct. */
4608
4609if (!anchored)
4610 {
4611 if ((re->options & PCRE_FIRSTSET) != 0)
4612 {
4613 first_char = re->first_char;
4614 if (match_block.caseless) first_char = pcre_lcc[first_char];
4615 }
4616 else
4617 if (!startline && extra != NULL &&
4618 (extra->options & PCRE_STUDY_MAPPED) != 0 &&
4619 ((extra->options & PCRE_STUDY_CASELESS) != 0) == match_block.caseless)
4620 start_bits = extra->start_bits;
4621 }
4622
4623/* Loop for unanchored matches; for anchored regexps the loop runs just once. */
4624
4625do
4626 {
Guido van Rossum557dea11997-12-22 22:46:52 +00004627 int rc;
Guido van Rossum50700601997-12-08 17:15:20 +00004628 register int *iptr = match_block.offset_vector;
4629 register int *iend = iptr + resetcount;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004630
4631 /* Reset the maximum number of extractions we might see. */
4632
4633 while (iptr < iend) *iptr++ = -1;
4634
4635 /* Advance to a unique first char if possible */
4636
4637 if (first_char >= 0)
4638 {
4639 if (match_block.caseless)
4640 while (start_match < end_subject && pcre_lcc[*start_match] != first_char)
4641 start_match++;
4642 else
4643 while (start_match < end_subject && *start_match != first_char)
4644 start_match++;
4645 }
4646
4647 /* Or to just after \n for a multiline match if possible */
4648
4649 else if (startline)
4650 {
4651 if (start_match > match_block.start_subject)
4652 {
4653 while (start_match < end_subject && start_match[-1] != '\n')
4654 start_match++;
4655 }
4656 }
4657
4658 /* Or to a non-unique first char */
4659
4660 else if (start_bits != NULL)
4661 {
4662 while (start_match < end_subject)
4663 {
4664 register int c = *start_match;
Guido van Rossum50700601997-12-08 17:15:20 +00004665 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004666 }
4667 }
4668
Guido van Rossum557dea11997-12-22 22:46:52 +00004669#ifdef DEBUG /* Sigh. Some compilers never learn. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004670 printf(">>>> Match against: ");
4671 pchars(start_match, end_subject - start_match, TRUE, &match_block);
4672 printf("\n");
Guido van Rossum57ba4f31997-12-02 20:40:28 +00004673#endif
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004674
4675 /* When a match occurs, substrings will be set for all internal extractions;
4676 we just need to set up the whole thing as substring 0 before returning. If
Guido van Rossum50700601997-12-08 17:15:20 +00004677 there were too many extractions, set the return code to zero. In the case
4678 where we had to get some local store to hold offsets for backreferences, copy
4679 those back references that we can. In this case there need not be overflow
4680 if certain parts of the pattern were not used.
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004681
Guido van Rossum50700601997-12-08 17:15:20 +00004682 Before starting the match, we have to set up a longjmp() target to enable
Guido van Rossum557dea11997-12-22 22:46:52 +00004683 the "cut" operation to fail a match completely without backtracking. This
4684 is done in a separate function to avoid compiler warnings. We need not do
4685 it unless PCRE_EXTRA is set, since only in that case is the "cut" operation
4686 enabled. */
Guido van Rossum50700601997-12-08 17:15:20 +00004687
4688 /* To handle errors such as running out of memory for the failure
4689 stack, we need to save this location via setjmp(), so
4690 error-handling code can call longjmp() to jump out of deeply-nested code. */
4691 if (setjmp(match_block.error_env)==0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004692 {
Guido van Rossum50700601997-12-08 17:15:20 +00004693
Guido van Rossum557dea11997-12-22 22:46:52 +00004694 if ((re->options & PCRE_EXTRA) != 0)
Guido van Rossum50700601997-12-08 17:15:20 +00004695 {
Guido van Rossum557dea11997-12-22 22:46:52 +00004696 if (!match_with_setjmp(start_match, re->code, 2, &match_block))
4697 continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004698 }
Guido van Rossum557dea11997-12-22 22:46:52 +00004699 else if (!match(start_match, re->code, 2, &match_block)) continue;
4700
4701 /* Copy the offset information from temporary store if necessary */
4702
4703 if (using_temporary_offsets)
4704 {
4705 if (offsetcount >= 4)
4706 {
4707 memcpy(offsets + 2, match_block.offset_vector + 2,
4708 (offsetcount - 2) * sizeof(int));
4709 DPRINTF(("Copied offsets from temporary memory\n"));
4710 }
4711 if (match_block.end_offset_top > offsetcount)
4712 match_block.offset_overflow = TRUE;
4713
4714 DPRINTF(("Freeing temporary memory\n"));
4715 (pcre_free)(match_block.offset_vector);
4716 }
4717
4718 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
4719
4720 if (match_block.offset_end < 2) rc = 0; else
4721 {
4722 offsets[0] = start_match - match_block.start_subject;
4723 offsets[1] = match_block.end_match_ptr - match_block.start_subject;
4724 }
4725
4726 DPRINTF((">>>> returning %d\n", rc));
4727 free_stack(&match_block);
4728 return rc;
Guido van Rossum50700601997-12-08 17:15:20 +00004729 } /* End of (if setjmp(match_block.error_env)...) */
Guido van Rossum042ff9e1998-04-03 21:13:31 +00004730 free_stack(&match_block);
4731
Guido van Rossum50700601997-12-08 17:15:20 +00004732 /* Return an error code; pcremodule.c will preserve the exception */
4733 if (PyErr_Occurred()) return PCRE_ERROR_NOMEMORY;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004734 }
4735while (!anchored &&
4736 match_block.errorcode == PCRE_ERROR_NOMATCH &&
4737 start_match++ < end_subject);
4738
Guido van Rossum557dea11997-12-22 22:46:52 +00004739if (using_temporary_offsets)
4740 {
4741 DPRINTF(("Freeing temporary memory\n"));
4742 (pcre_free)(match_block.offset_vector);
4743 }
4744
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004745#ifdef DEBUG
4746printf(">>>> returning %d\n", match_block.errorcode);
4747#endif
Guido van Rossum50700601997-12-08 17:15:20 +00004748
Barry Warsawb80667d1999-01-27 21:41:08 +00004749 free_stack(&match_block);
Guido van Rossum557dea11997-12-22 22:46:52 +00004750 return match_block.errorcode;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004751}
4752
4753/* End of pcre.c */