blob: 699932fc5e4c369bc68ead7fe3c5729883c0cff2 [file] [log] [blame]
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001
2/*************************************************
3* Perl-Compatible Regular Expressions *
4*************************************************/
5
6/* DO NOT EDIT THIS FILE! */
7
8/* This file is automatically written by the merge-files.py script
9included with the PCRE distribution for Python; it's produced from
10several C files, and code is removed in the process. If you want to
11modify the code or track down bugs, it will be much easier to work
12with the code in its original, multiple-file form. Don't edit this
13file by hand, or submit patches to it.
14
15The Python-specific PCRE distribution can be retrieved from
16 http://starship.skyport.net/crew/amk/regex/
17
18The unmodified original PCRE distribution doesn't have a fixed URL
19yet; write Philip Hazel <ph10@cam.ac.uk> for the latest version.
20
21Written by: Philip Hazel <ph10@cam.ac.uk>
22
23Extensively modified by the Python String-SIG: <string-sig@python.org>
24Send bug reports to: <string-sig@python.org>
25(They'll figure out if it's a bug in PCRE or in the Python-specific
26changes.)
27
28 Copyright (c) 1997 University of Cambridge
29
30-----------------------------------------------------------------------------
31Permission is granted to anyone to use this software for any purpose on any
32computer system, and to redistribute it freely, subject to the following
33restrictions:
34
351. This software is distributed in the hope that it will be useful,
36 but WITHOUT ANY WARRANTY; without even the implied warranty of
37 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
38
392. The origin of this software must not be misrepresented, either by
40 explicit claim or by omission.
41
423. Altered versions must be plainly marked as such, and must not be
43 misrepresented as being the original software.
44-----------------------------------------------------------------------------
45*/
46
47
48#define FOR_PYTHON
49#include "pcre-internal.h"
50#include "Python.h"
Guido van Rossum50700601997-12-08 17:15:20 +000051#include "mymalloc.h"
52#include <ctype.h>
Guido van Rossum51b3aa31997-10-06 14:43:11 +000053#include "graminit.h"
54
55/*************************************************
56* Perl-Compatible Regular Expressions *
57*************************************************/
58
59/* This file is automatically written by the makechartables auxiliary
60program. If you edit it by hand, you might like to edit the Makefile to
61prevent its ever being regenerated. */
62
63/* This table is a lower casing table. */
64
65unsigned char pcre_lcc[] = {
66 0, 1, 2, 3, 4, 5, 6, 7,
67 8, 9, 10, 11, 12, 13, 14, 15,
68 16, 17, 18, 19, 20, 21, 22, 23,
69 24, 25, 26, 27, 28, 29, 30, 31,
70 32, 33, 34, 35, 36, 37, 38, 39,
71 40, 41, 42, 43, 44, 45, 46, 47,
72 48, 49, 50, 51, 52, 53, 54, 55,
73 56, 57, 58, 59, 60, 61, 62, 63,
74 64, 97, 98, 99,100,101,102,103,
75 104,105,106,107,108,109,110,111,
76 112,113,114,115,116,117,118,119,
77 120,121,122, 91, 92, 93, 94, 95,
78 96, 97, 98, 99,100,101,102,103,
79 104,105,106,107,108,109,110,111,
80 112,113,114,115,116,117,118,119,
81 120,121,122,123,124,125,126,127,
82 128,129,130,131,132,133,134,135,
83 136,137,138,139,140,141,142,143,
84 144,145,146,147,148,149,150,151,
85 152,153,154,155,156,157,158,159,
86 160,161,162,163,164,165,166,167,
87 168,169,170,171,172,173,174,175,
88 176,177,178,179,180,181,182,183,
89 184,185,186,187,188,189,190,191,
90 192,193,194,195,196,197,198,199,
91 200,201,202,203,204,205,206,207,
92 208,209,210,211,212,213,214,215,
93 216,217,218,219,220,221,222,223,
94 224,225,226,227,228,229,230,231,
95 232,233,234,235,236,237,238,239,
96 240,241,242,243,244,245,246,247,
97 248,249,250,251,252,253,254,255 };
98
Guido van Rossum50700601997-12-08 17:15:20 +000099/* This table is a case flipping table. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000100
Guido van Rossum50700601997-12-08 17:15:20 +0000101unsigned char pcre_fcc[] = {
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000102 0, 1, 2, 3, 4, 5, 6, 7,
103 8, 9, 10, 11, 12, 13, 14, 15,
104 16, 17, 18, 19, 20, 21, 22, 23,
105 24, 25, 26, 27, 28, 29, 30, 31,
106 32, 33, 34, 35, 36, 37, 38, 39,
107 40, 41, 42, 43, 44, 45, 46, 47,
108 48, 49, 50, 51, 52, 53, 54, 55,
109 56, 57, 58, 59, 60, 61, 62, 63,
Guido van Rossum50700601997-12-08 17:15:20 +0000110 64, 97, 98, 99,100,101,102,103,
111 104,105,106,107,108,109,110,111,
112 112,113,114,115,116,117,118,119,
113 120,121,122, 91, 92, 93, 94, 95,
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000114 96, 65, 66, 67, 68, 69, 70, 71,
115 72, 73, 74, 75, 76, 77, 78, 79,
116 80, 81, 82, 83, 84, 85, 86, 87,
117 88, 89, 90,123,124,125,126,127,
118 128,129,130,131,132,133,134,135,
119 136,137,138,139,140,141,142,143,
120 144,145,146,147,148,149,150,151,
121 152,153,154,155,156,157,158,159,
122 160,161,162,163,164,165,166,167,
123 168,169,170,171,172,173,174,175,
124 176,177,178,179,180,181,182,183,
125 184,185,186,187,188,189,190,191,
126 192,193,194,195,196,197,198,199,
127 200,201,202,203,204,205,206,207,
128 208,209,210,211,212,213,214,215,
129 216,217,218,219,220,221,222,223,
130 224,225,226,227,228,229,230,231,
131 232,233,234,235,236,237,238,239,
132 240,241,242,243,244,245,246,247,
133 248,249,250,251,252,253,254,255 };
134
Guido van Rossum50700601997-12-08 17:15:20 +0000135/* This table contains bit maps for digits, letters, 'word' chars, and
136white space. Each map is 32 bytes long and the bits run from the least
137significant end of each byte. */
138
139unsigned char pcre_cbits[] = {
140 0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
141 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
142 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
143 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
144
145 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
146 0xfe,0xff,0xff,0x07,0xfe,0xff,0xff,0x07,
147 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
148 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
149
150 0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
151 0xfe,0xff,0xff,0x87,0xfe,0xff,0xff,0x07,
152 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
153 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
154
155 0x00,0x3e,0x00,0x00,0x01,0x00,0x00,0x00,
156 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
157 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
158 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 };
159
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000160/* This table identifies various classes of character by individual bits:
Guido van Rossum50700601997-12-08 17:15:20 +0000161 0x01 white space character
162 0x02 letter
163 0x04 decimal digit
164 0x08 hexadecimal digit
165 0x10 alphanumeric or '_'
166 0x80 regular expression metacharacter or binary zero
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000167*/
168
169unsigned char pcre_ctypes[] = {
170 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
171 0x00,0x01,0x01,0x01,0x01,0x01,0x00,0x00, /* 8- 15 */
172 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
173 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
174 0x01,0x00,0x00,0x00,0x80,0x00,0x00,0x00, /* - ' */
175 0x80,0x80,0x80,0x80,0x00,0x00,0x80,0x00, /* ( - / */
Guido van Rossum50700601997-12-08 17:15:20 +0000176 0x3c,0x3c,0x3c,0x3c,0x3c,0x3c,0x3c,0x3c, /* 0 - 7 */
177 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x80, /* 8 - ? */
178 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* @ - G */
179 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* H - O */
180 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* P - W */
181 0x12,0x12,0x12,0x80,0x00,0x00,0x80,0x10, /* X - _ */
182 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* ` - g */
183 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* h - o */
184 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* p - w */
185 0x12,0x12,0x12,0x80,0x80,0x00,0x00,0x00, /* x -127 */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000186 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
187 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
188 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
189 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
190 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
191 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
192 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
193 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
194 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
195 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
196 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
197 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
198 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
199 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
200 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
201 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
202
Guido van Rossum50700601997-12-08 17:15:20 +0000203/* End of chartables.c */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000204/*************************************************
205* Perl-Compatible Regular Expressions *
206*************************************************/
207
208/*
209This is a library of functions to support regular expressions whose syntax
210and semantics are as close as possible to those of the Perl 5 language. See
211the file Tech.Notes for some information on the internals.
212
213Written by: Philip Hazel <ph10@cam.ac.uk>
214
215 Copyright (c) 1997 University of Cambridge
216
217-----------------------------------------------------------------------------
218Permission is granted to anyone to use this software for any purpose on any
219computer system, and to redistribute it freely, subject to the following
220restrictions:
221
2221. This software is distributed in the hope that it will be useful,
223 but WITHOUT ANY WARRANTY; without even the implied warranty of
224 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
225
2262. The origin of this software must not be misrepresented, either by
227 explicit claim or by omission.
228
2293. Altered versions must be plainly marked as such, and must not be
230 misrepresented as being the original software.
231-----------------------------------------------------------------------------
232*/
233
234
235/* Include the internals header, which itself includes Standard C headers plus
236the external pcre header. */
237
238
239
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000240
241/*************************************************
242* Create bitmap of starting chars *
243*************************************************/
244
245/* This function scans a compiled unanchored expression and attempts to build a
246bitmap of the set of initial characters. If it can't, it returns FALSE. As time
247goes by, we may be able to get more clever at doing this.
248
249Arguments:
250 code points to an expression
251 start_bits points to a 32-byte table, initialized to 0
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000252
253Returns: TRUE if table built, FALSE otherwise
254*/
255
256static BOOL
Guido van Rossum50700601997-12-08 17:15:20 +0000257set_start_bits(uschar *code, uschar *start_bits)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000258{
Guido van Rossum50700601997-12-08 17:15:20 +0000259register int c;
260
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000261do
262 {
263 uschar *tcode = code + 3;
264 BOOL try_next = TRUE;
265
266 while (try_next)
267 {
268 try_next = FALSE;
269
270 if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
271 {
Guido van Rossum50700601997-12-08 17:15:20 +0000272 if (!set_start_bits(tcode, start_bits)) return FALSE;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000273 }
274
275 else switch(*tcode)
276 {
277 default:
278 return FALSE;
279
280 /* BRAZERO does the bracket, but carries on. */
281
282 case OP_BRAZERO:
283 case OP_BRAMINZERO:
Guido van Rossum50700601997-12-08 17:15:20 +0000284 if (!set_start_bits(++tcode, start_bits)) return FALSE;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000285 do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
286 tcode += 3;
287 try_next = TRUE;
288 break;
289
290 /* Single-char * or ? sets the bit and tries the next item */
291
292 case OP_STAR:
293 case OP_MINSTAR:
294 case OP_QUERY:
295 case OP_MINQUERY:
Guido van Rossum50700601997-12-08 17:15:20 +0000296 start_bits[tcode[1]/8] |= (1 << (tcode[1]&7));
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000297 tcode += 2;
298 try_next = TRUE;
299 break;
300
301 /* Single-char upto sets the bit and tries the next */
302
303 case OP_UPTO:
304 case OP_MINUPTO:
Guido van Rossum50700601997-12-08 17:15:20 +0000305 start_bits[tcode[3]/8] |= (1 << (tcode[3]&7));
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000306 tcode += 4;
307 try_next = TRUE;
308 break;
309
310 /* At least one single char sets the bit and stops */
311
312 case OP_EXACT: /* Fall through */
313 tcode++;
314
315 case OP_CHARS: /* Fall through */
316 tcode++;
317
318 case OP_PLUS:
319 case OP_MINPLUS:
Guido van Rossum50700601997-12-08 17:15:20 +0000320 start_bits[tcode[1]/8] |= (1 << (tcode[1]&7));
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000321 break;
322
323 /* Single character type sets the bits and stops */
324
325 case OP_NOT_DIGIT:
Guido van Rossum50700601997-12-08 17:15:20 +0000326 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_digit];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000327 break;
328
329 case OP_DIGIT:
Guido van Rossum50700601997-12-08 17:15:20 +0000330 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_digit];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000331 break;
332
333 case OP_NOT_WHITESPACE:
Guido van Rossum50700601997-12-08 17:15:20 +0000334 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_space];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000335 break;
336
337 case OP_WHITESPACE:
Guido van Rossum50700601997-12-08 17:15:20 +0000338 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_space];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000339 break;
340
341 case OP_NOT_WORDCHAR:
Guido van Rossum50700601997-12-08 17:15:20 +0000342 for (c = 0; c < 32; c++)
343 start_bits[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000344 break;
345
346 case OP_WORDCHAR:
Guido van Rossum50700601997-12-08 17:15:20 +0000347 for (c = 0; c < 32; c++)
348 start_bits[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000349 break;
350
351 /* One or more character type fudges the pointer and restarts, knowing
352 it will hit a single character type and stop there. */
353
354 case OP_TYPEPLUS:
355 case OP_TYPEMINPLUS:
356 tcode++;
357 try_next = TRUE;
358 break;
359
360 case OP_TYPEEXACT:
361 tcode += 3;
362 try_next = TRUE;
363 break;
364
365 /* Zero or more repeats of character types set the bits and then
366 try again. */
367
368 case OP_TYPEUPTO:
369 case OP_TYPEMINUPTO:
370 tcode += 2; /* Fall through */
371
372 case OP_TYPESTAR:
373 case OP_TYPEMINSTAR:
374 case OP_TYPEQUERY:
375 case OP_TYPEMINQUERY:
376 switch(tcode[1])
377 {
378 case OP_NOT_DIGIT:
Guido van Rossum50700601997-12-08 17:15:20 +0000379 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_digit];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000380 break;
381
382 case OP_DIGIT:
Guido van Rossum50700601997-12-08 17:15:20 +0000383 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_digit];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000384 break;
385
386 case OP_NOT_WHITESPACE:
Guido van Rossum50700601997-12-08 17:15:20 +0000387 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_space];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000388 break;
389
390 case OP_WHITESPACE:
Guido van Rossum50700601997-12-08 17:15:20 +0000391 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_space];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000392 break;
393
394 case OP_NOT_WORDCHAR:
Guido van Rossum50700601997-12-08 17:15:20 +0000395 for (c = 0; c < 32; c++)
396 start_bits[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000397 break;
398
399 case OP_WORDCHAR:
Guido van Rossum50700601997-12-08 17:15:20 +0000400 for (c = 0; c < 32; c++)
401 start_bits[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000402 break;
403 }
404
405 tcode += 2;
406 try_next = TRUE;
407 break;
408
409 /* Character class: set the bits and either carry on or not,
410 according to the repeat count. */
411
412 case OP_CLASS:
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000413 {
Guido van Rossum50700601997-12-08 17:15:20 +0000414 tcode++;
415 for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
416 tcode += 32;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000417 switch (*tcode)
418 {
419 case OP_CRSTAR:
420 case OP_CRMINSTAR:
421 case OP_CRQUERY:
422 case OP_CRMINQUERY:
423 tcode++;
424 try_next = TRUE;
425 break;
426
427 case OP_CRRANGE:
428 case OP_CRMINRANGE:
429 if (((tcode[1] << 8) + tcode[2]) == 0)
430 {
431 tcode += 5;
432 try_next = TRUE;
433 }
434 break;
435 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000436 }
437 break; /* End of class handling */
438
439 } /* End of switch */
440 } /* End of try_next loop */
441
442 code += (code[1] << 8) + code[2]; /* Advance to next branch */
443 }
444while (*code == OP_ALT);
445return TRUE;
446}
447
448
449
450/*************************************************
451* Study a compiled expression *
452*************************************************/
453
454/* This function is handed a compiled expression that it must study to produce
455information that will speed up the matching. It returns a pcre_extra block
456which then gets handed back to pcre_exec().
457
458Arguments:
459 re points to the compiled expression
460 options contains option bits
461 errorptr points to where to place error messages;
462 set NULL unless error
463
464Returns: pointer to a pcre_extra block,
465 NULL on error or if no optimization possible
466*/
467
468pcre_extra *
Guido van Rossum50700601997-12-08 17:15:20 +0000469pcre_study(const pcre *external_re, int options, char **errorptr)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000470{
471BOOL caseless;
472uschar start_bits[32];
473real_pcre_extra *extra;
474real_pcre *re = (real_pcre *)external_re;
475
476*errorptr = NULL;
477
478if (re == NULL || re->magic_number != MAGIC_NUMBER)
479 {
480 *errorptr = "argument is not a compiled regular expression";
481 return NULL;
482 }
483
484if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
485 {
486 *errorptr = "unknown or incorrect option bit(s) set";
487 return NULL;
488 }
489
Guido van Rossum50700601997-12-08 17:15:20 +0000490/* Caseless can either be from the compiled regex or from options. */
491
492caseless = ((re->options | options) & PCRE_CASELESS) != 0;
493
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000494/* For an anchored pattern, or an unchored pattern that has a first char, or a
495multiline pattern that matches only at "line starts", no further processing at
496present. */
497
498if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
499 return NULL;
500
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000501/* See if we can find a fixed set of initial characters for the pattern. */
502
Guido van Rossum50700601997-12-08 17:15:20 +0000503memset(start_bits, 0, 32 * sizeof(uschar));
504if (!set_start_bits(re->code, start_bits)) return NULL;
505
506/* If this studying is caseless, scan the created bit map and duplicate the
507bits for any letters. */
508
509if (caseless)
510 {
511 register int c;
512 for (c = 0; c < 256; c++)
513 {
514 if ((start_bits[c/8] & (1 << (c&7))) != 0 &&
515 (pcre_ctypes[c] & ctype_letter) != 0)
516 {
517 int d = pcre_fcc[c];
518 start_bits[d/8] |= (1 << (d&7));
519 }
520 }
521 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000522
523/* Get an "extra" block and put the information therein. */
524
525extra = (real_pcre_extra *)(pcre_malloc)(sizeof(real_pcre_extra));
526
527if (extra == NULL)
528 {
529 *errorptr = "failed to get memory";
530 return NULL;
531 }
Guido van Rossum50700601997-12-08 17:15:20 +0000532
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000533extra->options = PCRE_STUDY_MAPPED | (caseless? PCRE_STUDY_CASELESS : 0);
Guido van Rossum50700601997-12-08 17:15:20 +0000534memcpy(extra->start_bits, start_bits, sizeof(start_bits));
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000535
536return (pcre_extra *)extra;
537}
538
Guido van Rossum50700601997-12-08 17:15:20 +0000539/* End of study.c */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000540/*************************************************
541* Perl-Compatible Regular Expressions *
542*************************************************/
543
544/*
545This is a library of functions to support regular expressions whose syntax
546and semantics are as close as possible to those of the Perl 5 language. See
547the file Tech.Notes for some information on the internals.
548
549Written by: Philip Hazel <ph10@cam.ac.uk>
550
551 Copyright (c) 1997 University of Cambridge
552
553-----------------------------------------------------------------------------
554Permission is granted to anyone to use this software for any purpose on any
555computer system, and to redistribute it freely, subject to the following
556restrictions:
557
5581. This software is distributed in the hope that it will be useful,
559 but WITHOUT ANY WARRANTY; without even the implied warranty of
560 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
561
5622. The origin of this software must not be misrepresented, either by
563 explicit claim or by omission.
564
5653. Altered versions must be plainly marked as such, and must not be
566 misrepresented as being the original software.
567-----------------------------------------------------------------------------
568*/
569
570
571/* Define DEBUG to get debugging output on stdout. */
572
573/* #define DEBUG */
574
575
576/* Include the internals header, which itself includes Standard C headers plus
577the external pcre header. */
578
579
Guido van Rossum50700601997-12-08 17:15:20 +0000580
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000581#ifndef Py_eval_input
582/* For Python 1.4, graminit.h has to be explicitly included */
583#define Py_eval_input eval_input
Guido van Rossum50700601997-12-08 17:15:20 +0000584
585#endif /* FOR_PYTHON */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000586
587/* Min and max values for the common repeats; for the maxima, 0 => infinity */
588
589static char rep_min[] = { 0, 0, 1, 1, 0, 0 };
590static char rep_max[] = { 0, 0, 0, 0, 1, 1 };
591
592/* Text forms of OP_ values and things, for debugging */
593
594#ifdef DEBUG
595static char *OP_names[] = { "End", "\\A", "\\B", "\\b", "\\D", "\\d",
Guido van Rossum50700601997-12-08 17:15:20 +0000596 "\\S", "\\s", "\\W", "\\w", "Cut", "\\Z",
597 "localized \\B", "localized \\b", "localized \\W", "localized \\w",
598 "^", "$", "Any", "chars",
599 "not",
600 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000601 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
602 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
603 "*", "*?", "+", "+?", "?", "??", "{", "{",
Guido van Rossum50700601997-12-08 17:15:20 +0000604 "class", "classL", "Ref",
605 "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", "Once",
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000606 "Brazero", "Braminzero", "Bra"
607};
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000608#endif
609
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000610/* Table for handling escaped characters in the range '0'-'z'. Positive returns
611are simple data values; negative values are for special things like \d and so
612on. Zero means further processing is needed (for things like \x), or the escape
613is invalid. */
614
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000615static short int escapes[] = {
616 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
617 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
618 '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
619 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
620 0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
621 0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
622 '`', 7, -ESC_b, 0, -ESC_d, 0, '\f', 0, /* ` - g */
623 0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
624 0, 0, '\r', -ESC_s, '\t', 0, '\v', -ESC_w, /* p - w */
625 0, 0, 0 /* x - z */
626};
627
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000628/* Definition to allow mutual recursion */
629
Guido van Rossum50700601997-12-08 17:15:20 +0000630static BOOL compile_regex(int, int *, uschar **, uschar **,
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000631 char **, PyObject *);
632
633/* Structure for passing "static" information around between the functions
634doing the matching, so that they are thread-safe. */
635
636typedef struct match_data {
637 int errorcode; /* As it says */
638 int *offset_vector; /* Offset vector */
639 int offset_end; /* One past the end */
640 BOOL offset_overflow; /* Set if too many extractions */
641 BOOL caseless; /* Case-independent flag */
Guido van Rossum50700601997-12-08 17:15:20 +0000642 BOOL runtime_caseless; /* Caseless forced at run time */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000643 BOOL multiline; /* Multiline flag */
Guido van Rossum50700601997-12-08 17:15:20 +0000644 BOOL notbol; /* NOTBOL flag */
645 BOOL noteol; /* NOTEOL flag */
646 BOOL dotall; /* Dot matches any char */
647 BOOL endonly; /* Dollar not before final \n */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000648 uschar *start_subject; /* Start of the subject string */
649 uschar *end_subject; /* End of the subject string */
Guido van Rossum50700601997-12-08 17:15:20 +0000650 jmp_buf fail_env; /* Environment for longjump() break out */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000651 uschar *end_match_ptr; /* Subject position at end match */
652 int end_offset_top; /* Highwater mark at end of match */
Guido van Rossum50700601997-12-08 17:15:20 +0000653 jmp_buf error_env; /* For longjmp() if an error occurs deep inside a
654 matching operation */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000655 int length; /* Length of the allocated stacks */
656 int point; /* Point to add next item pushed onto stacks */
657 /* Pointers to the 6 stacks */
658 int *off_num, *offset_top, *r1, *r2;
659 uschar **eptr, **ecode;
660} match_data;
661
662
663
664/*************************************************
Guido van Rossum50700601997-12-08 17:15:20 +0000665* Global variables *
666*************************************************/
667
668/* PCRE is thread-clean and doesn't use any global variables in the normal
669sense. However, it calls memory allocation and free functions via the two
670indirections below, which are can be changed by the caller, but are shared
671between all threads. */
672
673void *(*pcre_malloc)(size_t) = malloc;
674void (*pcre_free)(void *) = free;
675
676
677
678
679/*************************************************
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000680* Return version string *
681*************************************************/
682
683char *
684pcre_version(void)
685{
686return PCRE_VERSION;
687}
688
689
690
691
692/*************************************************
693* Return info about a compiled pattern *
694*************************************************/
695
696/* This function picks potentially useful data out of the private
697structure.
698
699Arguments:
700 external_re points to compiled code
701 optptr where to pass back the options
702 first_char where to pass back the first character,
703 or -1 if multiline and all branches start ^,
704 or -2 otherwise
705
706Returns: number of identifying extraction brackets
707 or negative values on error
708*/
709
710int
Guido van Rossum50700601997-12-08 17:15:20 +0000711pcre_info(const pcre *external_re, int *optptr, int *first_char)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000712{
713real_pcre *re = (real_pcre *)external_re;
714if (re == NULL) return PCRE_ERROR_NULL;
715if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
716if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);
717if (first_char != NULL)
718 *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
719 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
720return re->top_bracket;
721}
722
723
724
725
726#ifdef DEBUG
727/*************************************************
728* Debugging function to print chars *
729*************************************************/
730
731/* Print a sequence of chars in printable format, stopping at the end of the
732subject if the requested.
733
734Arguments:
735 p points to characters
736 length number to print
737 is_subject TRUE if printing from within md->start_subject
738 md pointer to matching data block, if is_subject is TRUE
739
740Returns: nothing
741*/
742
743static pchars(uschar *p, int length, BOOL is_subject, match_data *md)
744{
745int c;
746if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
747while (length-- > 0)
748 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
749}
750#endif
751
752
753
754
755/*************************************************
756* Check subpattern for empty operand *
757*************************************************/
758
759/* This function checks a bracketed subpattern to see if any of the paths
760through it could match an empty string. This is used to diagnose an error if
761such a subpattern is followed by a quantifier with an unlimited upper bound.
762
763Argument:
764 code points to the opening bracket
765
766Returns: TRUE or FALSE
767*/
768
769static BOOL
770could_be_empty(uschar *code)
771{
772do {
773 uschar *cc = code + 3;
774
775 /* Scan along the opcodes for this branch; as soon as we find something
776 that matches a non-empty string, break out and advance to test the next
777 branch. If we get to the end of the branch, return TRUE for the whole
778 sub-expression. */
779
780 for (;;)
781 {
782 /* Test an embedded subpattern; if it could not be empty, break the
783 loop. Otherwise carry on in the branch. */
784
Guido van Rossum50700601997-12-08 17:15:20 +0000785 if ((int)(*cc) >= OP_BRA || (int)(*cc) == OP_ONCE)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000786 {
787 if (!could_be_empty(cc)) break;
788 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
789 cc += 3;
790 }
791
792 else switch (*cc)
793 {
794 /* Reached end of a branch: the subpattern may match the empty string */
795
796 case OP_ALT:
797 case OP_KET:
798 case OP_KETRMAX:
799 case OP_KETRMIN:
800 return TRUE;
801
802 /* Skip over assertive subpatterns */
803
804 case OP_ASSERT:
805 case OP_ASSERT_NOT:
806 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
807 cc += 3;
808 break;
809
810 /* Skip over things that don't match chars */
811
812 case OP_SOD:
813 case OP_EOD:
814 case OP_CIRC:
815 case OP_DOLL:
816 case OP_BRAZERO:
817 case OP_BRAMINZERO:
818 case OP_NOT_WORD_BOUNDARY:
819 case OP_WORD_BOUNDARY:
Guido van Rossum50700601997-12-08 17:15:20 +0000820 case OP_NOT_WORD_BOUNDARY_L:
821 case OP_WORD_BOUNDARY_L:
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000822 cc++;
823 break;
824
825 /* Skip over simple repeats with zero lower bound */
826
827 case OP_STAR:
828 case OP_MINSTAR:
829 case OP_QUERY:
830 case OP_MINQUERY:
Guido van Rossum50700601997-12-08 17:15:20 +0000831 case OP_NOTSTAR:
832 case OP_NOTMINSTAR:
833 case OP_NOTQUERY:
834 case OP_NOTMINQUERY:
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000835 case OP_TYPESTAR:
836 case OP_TYPEMINSTAR:
837 case OP_TYPEQUERY:
838 case OP_TYPEMINQUERY:
839 cc += 2;
840 break;
841
842 /* Skip over UPTOs (lower bound is zero) */
843
844 case OP_UPTO:
845 case OP_MINUPTO:
846 case OP_TYPEUPTO:
847 case OP_TYPEMINUPTO:
848 cc += 4;
849 break;
850
851 /* Check a class or a back reference for a zero minimum */
852
853 case OP_CLASS:
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000854 case OP_REF:
Guido van Rossum50700601997-12-08 17:15:20 +0000855 case OP_CLASS_L:
856 switch(*cc)
857 {
858 case (OP_REF): cc += 2; break;
859 case (OP_CLASS): cc += 1+32; break;
860 case (OP_CLASS_L): cc += 1+1+32; break;
861 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000862
863 switch (*cc)
864 {
865 case OP_CRSTAR:
866 case OP_CRMINSTAR:
867 case OP_CRQUERY:
868 case OP_CRMINQUERY:
869 cc++;
870 break;
871
872 case OP_CRRANGE:
873 case OP_CRMINRANGE:
874 if ((cc[1] << 8) + cc[2] != 0) goto NEXT_BRANCH;
875 cc += 3;
876 break;
877
878 default:
879 goto NEXT_BRANCH;
880 }
881 break;
882
883 /* Anything else matches at least one character */
884
885 default:
886 goto NEXT_BRANCH;
887 }
888 }
889
890 NEXT_BRANCH:
891 code += (code[1] << 8) + code[2];
892 }
893while (*code == OP_ALT);
894
895/* No branches match the empty string */
896
897return FALSE;
898}
899
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000900/* Determine the length of a group ID in an expression like
901 (?P<foo_123>...)
902Arguments:
903 ptr pattern position pointer (say that 3 times fast)
904 finalchar the character that will mark the end of the ID
905 errorptr points to the pointer to the error message
906*/
907
908static int
909get_group_id(uschar *ptr, char finalchar, char **errorptr)
910{
911 uschar *start = ptr;
912
913 /* If the first character is not in \w, or is in \w but is a digit,
914 report an error */
915 if (!(pcre_ctypes[*ptr] & ctype_word) ||
916 (pcre_ctypes[*ptr++] & ctype_digit))
917 {
918 *errorptr = "(?P identifier must start with a letter or underscore";
919 return 0;
920 }
921
922 /* Increment ptr until we either hit a null byte, the desired
923 final character, or a non-word character */
924 for(; (*ptr != 0) && (*ptr != finalchar) &&
925 (pcre_ctypes[*ptr] & ctype_word); ptr++)
926 {
Guido van Rossumc3861071997-10-08 02:07:40 +0000927 /* Empty loop body */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000928 }
929 if (*ptr==finalchar)
930 return ptr-start;
931 if (*ptr==0)
932 {
933 *errorptr = "unterminated (?P identifier";
934 return 0;
935 }
936 *errorptr = "illegal character in (?P identifier";
937 return 0;
938}
939
940/*************************************************
941* Handle escapes *
942*************************************************/
943
944/* This function is called when a \ has been encountered. It either returns a
945positive value for a simple escape such as \n, or a negative value which
946encodes one of the more complicated things such as \d. On entry, ptr is
947pointing at the \. On exit, it is on the final character of the escape
948sequence.
949
950Arguments:
951 ptrptr points to the pattern position pointer
952 errorptr points to the pointer to the error message
Guido van Rossum50700601997-12-08 17:15:20 +0000953 bracount number of previous extracting brackets
954 options the options bits
955 isclass TRUE if inside a character class
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000956
957Returns: zero or positive => a data character
958 negative => a special escape sequence
959 on error, errorptr is set
960*/
961
962static int
Guido van Rossum50700601997-12-08 17:15:20 +0000963check_escape(uschar **ptrptr, char **errorptr, int bracount, int options,
964 BOOL isclass)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000965{
966uschar *ptr = *ptrptr;
967int c = *(++ptr) & 255; /* Ensure > 0 on signed-char systems */
968int i;
969
Guido van Rossum50700601997-12-08 17:15:20 +0000970if (c == 0) *errorptr = ERR1;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000971
972/* Digits or letters may have special meaning; all others are literals. */
973
974else if (c < '0' || c > 'z') {}
975
976/* Do an initial lookup in a table. A non-zero result is something that can be
977returned immediately. Otherwise further processing may be required. */
978
979else if ((i = escapes[c - '0']) != 0) c = i;
980
981/* Escapes that need further processing, or are illegal. */
982
Guido van Rossum50700601997-12-08 17:15:20 +0000983else
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000984 {
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000985
Guido van Rossum50700601997-12-08 17:15:20 +0000986 switch (c)
987 {
988 /* The handling of escape sequences consisting of a string of digits
989 starting with one that is not zero is not straightforward. By experiment,
990 the way Perl works seems to be as follows:
991
992 Outside a character class, the digits are read as a decimal number. If the
993 number is less than 10, or if there are that many previous extracting
994 left brackets, then it is a back reference. Otherwise, up to three octal
995 digits are read to form an escaped byte. Thus \123 is likely to be octal
996 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
997 value is greater than 377, the least significant 8 bits are taken. Inside a
998 character class, \ followed by a digit is always an octal number. */
999
1000 case '1': case '2': case '3': case '4': case '5':
1001 case '6': case '7': case '8': case '9':
1002
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001003 {
1004 /* PYTHON: Try to compute an octal value for a character */
1005 for(c=0, i=0; c!=-1 && ptr[i]!=0 && i<3; i++)
1006 {
1007 if (( pcre_ctypes[ ptr[i] ] & ctype_odigit) != 0)
1008 c = c * 8 + ptr[i]-'0';
1009 else
1010 c = -1; /* Non-octal character */
1011 }
1012 /* Aha! There were 3 octal digits, so it must be a character */
1013 if (c != -1 && i == 3)
1014 {
1015 ptr += i-1;
1016 break;
1017 }
1018 c = ptr[0]; /* Restore the first character after the \ */
1019 c -= '0'; i = 1;
1020 while (i<2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0)
1021 {
1022 c = c * 10 + ptr[1] - '0';
1023 ptr++; i++;
1024 }
1025 if (c > 255 - ESC_REF) *errorptr = "back reference too big";
1026 c = -(ESC_REF + c);
1027 }
1028 break;
1029
Guido van Rossum50700601997-12-08 17:15:20 +00001030 /* \0 always starts an octal number, but we may drop through to here with a
1031 larger first octal digit */
1032
1033 case '0':
1034 c -= '0';
1035 while(i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0 &&
1036 ptr[1] != '8' && ptr[1] != '9')
1037 c = c * 8 + *(++ptr) - '0';
1038 break;
1039
1040 /* Special escapes not starting with a digit are straightforward */
1041
1042 case 'x':
1043 c = 0;
1044 while ( (pcre_ctypes[ptr[1]] & ctype_xdigit) != 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001045 {
Guido van Rossum50700601997-12-08 17:15:20 +00001046 ptr++;
1047 c = c * 16 + pcre_lcc[*ptr] -
1048 (((pcre_ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
1049 c &= 255;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001050 }
1051 break;
1052
1053
Guido van Rossum50700601997-12-08 17:15:20 +00001054 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1055 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
1056 for Perl compatibility, it is a literal. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001057
Guido van Rossum50700601997-12-08 17:15:20 +00001058 default:
1059 if ((options & PCRE_EXTRA) != 0) switch(c)
1060 {
1061 case 'X':
1062 c = -ESC_X; /* This could be a lookup if it ever got into Perl */
1063 break;
1064
1065 default:
1066 *errorptr = ERR3;
1067 break;
1068 }
1069 break;
1070 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001071 }
1072
1073*ptrptr = ptr;
1074return c;
1075}
1076
1077
1078
1079/*************************************************
Guido van Rossum50700601997-12-08 17:15:20 +00001080* Check for counted repeat *
1081*************************************************/
1082
1083/* This function is called when a '{' is encountered in a place where it might
1084start a quantifier. It looks ahead to see if it really is a quantifier or not.
1085It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
1086where the ddds are digits.
1087
1088Arguments:
1089 p pointer to the first char after '{'
1090
1091Returns: TRUE or FALSE
1092*/
1093
1094static BOOL
1095is_counted_repeat(uschar *p)
1096{
1097if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
1098while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
1099if (*p == '}') return TRUE;
1100
1101if (*p++ != ',') return FALSE;
1102if (*p == '}') return TRUE;
1103
1104if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
1105while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
1106return (*p == '}');
1107}
1108
1109
1110
1111/*************************************************
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001112* Read repeat counts *
1113*************************************************/
1114
Guido van Rossum50700601997-12-08 17:15:20 +00001115/* Read an item of the form {n,m} and return the values. This is called only
1116after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1117so the syntax is guaranteed to be correct, but we need to check the values.
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001118
1119Arguments:
1120 p pointer to first char after '{'
1121 minp pointer to int for min
1122 maxp pointer to int for max
1123 returned as -1 if no max
1124 errorptr points to pointer to error message
1125
1126Returns: pointer to '}' on success;
1127 current ptr on error, with errorptr set
1128*/
1129
1130static uschar *
1131read_repeat_counts(uschar *p, int *minp, int *maxp, char **errorptr)
1132{
1133int min = 0;
1134int max = -1;
1135
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001136while ((pcre_ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
1137
1138if (*p == '}') max = min; else
1139 {
Guido van Rossum50700601997-12-08 17:15:20 +00001140 if (*(++p) != '}')
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001141 {
1142 max = 0;
1143 while((pcre_ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001144 if (max < min)
1145 {
Guido van Rossum50700601997-12-08 17:15:20 +00001146 *errorptr = ERR4;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001147 return p;
1148 }
1149 }
1150 }
1151
1152/* Do paranoid checks, then fill in the required variables, and pass back the
1153pointer to the terminating '}'. */
1154
Guido van Rossum50700601997-12-08 17:15:20 +00001155if (min > 65535 || max > 65535)
1156 *errorptr = ERR5;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001157else
1158 {
1159 *minp = min;
1160 *maxp = max;
1161 }
1162return p;
1163}
1164
1165
1166
1167/*************************************************
1168* Compile one branch *
1169*************************************************/
1170
1171/* Scan the pattern, compiling it into the code vector.
1172
1173Arguments:
Guido van Rossum50700601997-12-08 17:15:20 +00001174 options the option bits
1175 bracket points to number of brackets used
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001176 code points to the pointer to the current code point
1177 ptrptr points to the current pattern pointer
1178 errorptr points to pointer to error message
1179
1180Returns: TRUE on success
1181 FALSE, with *errorptr set on error
1182*/
1183
1184static BOOL
Guido van Rossum50700601997-12-08 17:15:20 +00001185compile_branch(int options, int *brackets, uschar **codeptr,
1186 uschar **ptrptr, char **errorptr, PyObject *dictionary)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001187{
1188int repeat_type, op_type;
1189int repeat_min, repeat_max;
1190int bravalue, length;
1191register int c;
1192register uschar *code = *codeptr;
1193uschar *ptr = *ptrptr;
1194uschar *previous = NULL;
1195uschar *oldptr;
Guido van Rossum50700601997-12-08 17:15:20 +00001196uschar class[32];
1197uschar *class_flag; /* Pointer to the single-byte flag for OP_CLASS_L */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001198
1199/* Switch on next character until the end of the branch */
1200
1201for (;; ptr++)
1202 {
Guido van Rossum50700601997-12-08 17:15:20 +00001203 BOOL negate_class;
1204 int class_charcount;
1205 int class_lastchar;
1206
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001207 c = *ptr;
Guido van Rossum50700601997-12-08 17:15:20 +00001208 if ((options & PCRE_EXTENDED) != 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001209 {
1210 if ((pcre_ctypes[c] & ctype_space) != 0) continue;
1211 if (c == '#')
1212 {
1213 while ((c = *(++ptr)) != 0 && c != '\n');
1214 continue;
1215 }
1216 }
1217
1218 switch(c)
1219 {
1220 /* The branch terminates at end of string, |, or ). */
1221
1222 case 0:
1223 case '|':
1224 case ')':
1225 *codeptr = code;
1226 *ptrptr = ptr;
1227 return TRUE;
1228
1229 /* Handle single-character metacharacters */
1230
1231 case '^':
1232 previous = NULL;
1233 *code++ = OP_CIRC;
1234 break;
1235
1236 case '$':
1237 previous = NULL;
1238 *code++ = OP_DOLL;
1239 break;
1240
1241 case '.':
1242 previous = code;
1243 *code++ = OP_ANY;
1244 break;
1245
Guido van Rossum50700601997-12-08 17:15:20 +00001246 /* Character classes. These always build a 32-byte bitmap of the permitted
1247 characters, except in the special case where there is only one character.
1248 For negated classes, we build the map as usual, then invert it at the end.
1249 */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001250
1251 case '[':
Guido van Rossum50700601997-12-08 17:15:20 +00001252 previous = code;
1253 if (options & PCRE_LOCALE)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001254 {
Guido van Rossum50700601997-12-08 17:15:20 +00001255 *code++ = OP_CLASS_L;
1256 /* Set the flag for localized classes (like \w) to 0 */
1257 class_flag = code;
1258 *class_flag = 0;
1259 }
1260 else
1261 {
1262 *code++ = OP_CLASS;
1263 class_flag = NULL;
1264 }
1265
1266 /* If the first character is '^', set the negation flag */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001267
Guido van Rossum50700601997-12-08 17:15:20 +00001268 if ((c = *(++ptr)) == '^')
1269 {
1270 negate_class = TRUE;
1271 c = *(++ptr);
1272 }
1273 else negate_class = FALSE;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001274
Guido van Rossum50700601997-12-08 17:15:20 +00001275 /* Keep a count of chars so that we can optimize the case of just a single
1276 character. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001277
Guido van Rossum50700601997-12-08 17:15:20 +00001278 class_charcount = 0;
1279 class_lastchar = -1;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001280
Guido van Rossum50700601997-12-08 17:15:20 +00001281 /* Initialize the 32-char bit map to all zeros. We have to build the
1282 map in a temporary bit of store, in case the class contains only 1
1283 character, because in that case the compiled code doesn't use the
1284 bit map. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001285
Guido van Rossum50700601997-12-08 17:15:20 +00001286 memset(class, 0, 32 * sizeof(uschar));
1287
1288 /* Process characters until ] is reached. By writing this as a "do" it
1289 means that an initial ] is taken as a data character. */
1290
1291 do
1292 {
1293 if (c == 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001294 {
Guido van Rossum50700601997-12-08 17:15:20 +00001295 *errorptr = ERR6;
1296 goto FAILED;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001297 }
1298
Guido van Rossum50700601997-12-08 17:15:20 +00001299 /* Backslash may introduce a single character, or it may introduce one
1300 of the specials, which just set a flag. Escaped items are checked for
1301 validity in the pre-compiling pass. The sequence \b is a special case.
1302 Inside a class (and only there) it is treated as backslash. Elsewhere
1303 it marks a word boundary. Other escapes have preset maps ready to
1304 or into the one we are building. We assume they have more than one
1305 character in them, so set class_count bigger than one. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001306
Guido van Rossum50700601997-12-08 17:15:20 +00001307 if (c == '\\')
1308 {
1309 c = check_escape(&ptr, errorptr, *brackets, options, TRUE);
1310 if (-c == ESC_b) c = '\b';
1311 else if (c < 0)
1312 {
1313 class_charcount = 10;
1314 switch (-c)
1315 {
1316 case ESC_d:
1317 if (options & PCRE_LOCALE)
1318 {
1319 *class_flag |= 4;
1320 }
1321 else
1322 {
1323 for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_digit];
1324 }
1325 continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001326
Guido van Rossum50700601997-12-08 17:15:20 +00001327 case ESC_D:
1328 if (options & PCRE_LOCALE)
1329 {
1330 *class_flag |= 8;
1331 }
1332 else
1333 {
1334 for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_digit];
1335 }
1336 continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001337
Guido van Rossum50700601997-12-08 17:15:20 +00001338 case ESC_w:
1339 if (options & PCRE_LOCALE)
1340 {
1341 *class_flag |= 1;
1342 }
1343 else
1344 {
1345 for (c = 0; c < 32; c++)
1346 class[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
1347 }
1348 continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001349
Guido van Rossum50700601997-12-08 17:15:20 +00001350 case ESC_W:
1351 if (options & PCRE_LOCALE)
1352 {
1353 *class_flag |= 2;
1354 }
1355 else
1356 {
1357 for (c = 0; c < 32; c++)
1358 class[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
1359 }
1360 continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001361
Guido van Rossum50700601997-12-08 17:15:20 +00001362 case ESC_s:
1363 if (options & PCRE_LOCALE)
1364 {
1365 *class_flag |= 32;
1366 }
1367 else
1368 {
1369 for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_space];
1370 }
1371 continue;
1372
1373 case ESC_S:
1374 if (options & PCRE_LOCALE)
1375 {
1376 *class_flag |= 32;
1377 }
1378 else
1379 {
1380 for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_space];
1381 }
1382 continue;
1383
1384 default:
1385 *errorptr = ERR7;
1386 goto FAILED;
1387 }
1388 }
1389 /* Fall through if single character */
1390 }
1391
1392 /* A single character may be followed by '-' to form a range. However,
1393 Perl does not permit ']' to be the end of the range. A '-' character
1394 here is treated as a literal. */
1395
1396 if (ptr[1] == '-' && ptr[2] != ']')
1397 {
1398 int d;
1399 ptr += 2;
1400 d = *ptr;
1401
1402 if (d == 0)
1403 {
1404 *errorptr = ERR6;
1405 goto FAILED;
1406 }
1407
1408 /* The second part of a range can be a single-character escape, but
1409 not any of the other escapes. */
1410
1411 if (d == '\\')
1412 {
1413 d = check_escape(&ptr, errorptr, *brackets, options, TRUE);
1414 if (d < 0)
1415 {
1416 if (d == -ESC_b) d = '\b'; else
1417 {
1418 *errorptr = ERR7;
1419 goto FAILED;
1420 }
1421 }
1422 }
1423
1424 if (d < c)
1425 {
1426 *errorptr = ERR8;
1427 goto FAILED;
1428 }
1429
1430 for (; c <= d; c++)
1431 {
1432 class[c/8] |= (1 << (c&7));
1433 if ((options & PCRE_CASELESS) != 0)
1434 {
1435 int uc = pcre_fcc[c]; /* flip case */
1436 class[uc/8] |= (1 << (uc&7));
1437 }
1438 class_charcount++; /* in case a one-char range */
1439 class_lastchar = c;
1440 }
1441 continue; /* Go get the next char in the class */
1442 }
1443
1444 /* Handle a lone single character - we can get here for a normal
1445 non-escape char, or after \ that introduces a single character. */
1446
1447 class [c/8] |= (1 << (c&7));
1448 if ((options & PCRE_CASELESS) != 0)
1449 {
1450 c = pcre_fcc[c]; /* flip case */
1451 class[c/8] |= (1 << (c&7));
1452 }
1453 class_charcount++;
1454 class_lastchar = c;
1455 }
1456
1457 /* Loop until ']' reached; the check for end of string happens inside the
1458 loop. This "while" is the end of the "do" above. */
1459
1460 while ((c = *(++ptr)) != ']');
1461
1462 /* If class_charcount is 1 and class_lastchar is not negative, we saw
1463 precisely one character. This doesn't need the whole 32-byte bit map.
1464 We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
1465 it's negative. */
1466
1467 if (class_charcount == 1 && class_lastchar >= 0)
1468 {
1469 if (negate_class)
1470 {
1471 code[-1] = OP_NOT;
1472 }
1473 else
1474 {
1475 code[-1] = OP_CHARS;
1476 *code++ = 1;
1477 }
1478 *code++ = class_lastchar;
1479 }
1480
1481 /* Otherwise, negate the 32-byte map if necessary, and copy it into
1482 the code vector. */
1483
1484 else
1485 {
1486 /* If this is a localized opcode, bump the code pointer up */
1487 if (class_flag) code++;
1488 if (negate_class)
1489 {
1490 if (class_flag) *class_flag = (*class_flag) ^ 63;
1491 for (c = 0; c < 32; c++) code[c] = ~class[c];
1492 }
1493 else
1494 memcpy(code, class, 32);
1495 code += 32;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001496 }
1497 break;
1498
1499 /* Various kinds of repeat */
1500
1501 case '{':
Guido van Rossum50700601997-12-08 17:15:20 +00001502 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001503 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
1504 if (*errorptr != NULL) goto FAILED;
1505 goto REPEAT;
1506
1507 case '*':
1508 repeat_min = 0;
1509 repeat_max = -1;
1510 goto REPEAT;
1511
1512 case '+':
1513 repeat_min = 1;
1514 repeat_max = -1;
1515 goto REPEAT;
1516
1517 case '?':
1518 repeat_min = 0;
1519 repeat_max = 1;
1520
1521 REPEAT:
1522 if (previous == NULL)
1523 {
Guido van Rossum50700601997-12-08 17:15:20 +00001524 *errorptr = ERR9;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001525 goto FAILED;
1526 }
1527
1528 /* If the next character is '?' this is a minimizing repeat. Advance to the
1529 next character. */
1530
1531 if (ptr[1] == '?') { repeat_type = 1; ptr++; } else repeat_type = 0;
1532
Guido van Rossum50700601997-12-08 17:15:20 +00001533 /* If the maximum is zero then the minimum must also be zero; Perl allows
1534 this case, so we do too - by simply omitting the item altogether. */
1535
1536 if (repeat_max == 0) code = previous;
1537
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001538 /* If previous was a string of characters, chop off the last one and use it
1539 as the subject of the repeat. If there was only one character, we can
1540 abolish the previous item altogether. */
1541
Guido van Rossum50700601997-12-08 17:15:20 +00001542 else if (*previous == OP_CHARS)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001543 {
1544 int len = previous[1];
1545 if (len == 1)
1546 {
1547 c = previous[2];
1548 code = previous;
1549 }
1550 else
1551 {
1552 c = previous[len+1];
1553 previous[1]--;
1554 code--;
1555 }
1556 op_type = 0; /* Use single-char op codes */
1557 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
1558 }
1559
Guido van Rossum50700601997-12-08 17:15:20 +00001560 /* If previous was a single negated character ([^a] or similar), we use
1561 one of the special opcodes, replacing it. The code is shared with single-
1562 character repeats by adding a suitable offset into repeat_type. */
1563
1564 else if ((int)*previous == OP_NOT)
1565 {
1566 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
1567 c = previous[1];
1568 code = previous;
1569 goto OUTPUT_SINGLE_REPEAT;
1570 }
1571
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001572 /* If previous was a character type match (\d or similar), abolish it and
1573 create a suitable repeat item. The code is shared with single-character
1574 repeats by adding a suitable offset into repeat_type. */
1575
Guido van Rossum50700601997-12-08 17:15:20 +00001576 else if ((int)*previous < OP_CIRC || *previous == OP_ANY)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001577 {
1578 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
1579 c = *previous;
1580 code = previous;
1581
1582 OUTPUT_SINGLE_REPEAT:
1583 repeat_type += op_type; /* Combine both values for many cases */
1584
1585 /* A minimum of zero is handled either as the special case * or ?, or as
1586 an UPTO, with the maximum given. */
1587
1588 if (repeat_min == 0)
1589 {
1590 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
1591 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
1592 else
1593 {
1594 *code++ = OP_UPTO + repeat_type;
1595 *code++ = repeat_max >> 8;
1596 *code++ = (repeat_max & 255);
1597 }
1598 }
1599
1600 /* The case {1,} is handled as the special case + */
1601
1602 else if (repeat_min == 1 && repeat_max == -1)
1603 *code++ = OP_PLUS + repeat_type;
1604
1605 /* The case {n,n} is just an EXACT, while the general case {n,m} is
1606 handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
1607
1608 else
1609 {
1610 if (repeat_min != 1)
1611 {
1612 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
1613 *code++ = repeat_min >> 8;
1614 *code++ = (repeat_min & 255);
1615 }
1616
1617 /* If the mininum is 1 and the previous item was a character string,
1618 we either have to put back the item that got cancelled if the string
1619 length was 1, or add the character back onto the end of a longer
1620 string. For a character type nothing need be done; it will just get put
1621 back naturally. */
1622
1623 else if (*previous == OP_CHARS)
1624 {
1625 if (code == previous) code += 2; else previous[1]++;
1626 }
1627
1628 /* Insert an UPTO if the max is greater than the min. */
1629
1630 if (repeat_max != repeat_min)
1631 {
1632 *code++ = c;
1633 repeat_max -= repeat_min;
1634 *code++ = OP_UPTO + repeat_type;
1635 *code++ = repeat_max >> 8;
1636 *code++ = (repeat_max & 255);
1637 }
1638 }
1639
1640 /* The character or character type itself comes last in all cases. */
1641
1642 *code++ = c;
1643 }
1644
1645 /* If previous was a character class or a back reference, we put the repeat
1646 stuff after it. */
1647
Guido van Rossum50700601997-12-08 17:15:20 +00001648 else if (*previous == OP_CLASS || *previous==OP_CLASS_L || *previous == OP_REF)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001649 {
1650 if (repeat_min == 0 && repeat_max == -1)
1651 *code++ = OP_CRSTAR + repeat_type;
1652 else if (repeat_min == 1 && repeat_max == -1)
1653 *code++ = OP_CRPLUS + repeat_type;
1654 else if (repeat_min == 0 && repeat_max == 1)
1655 *code++ = OP_CRQUERY + repeat_type;
1656 else
1657 {
1658 *code++ = OP_CRRANGE + repeat_type;
1659 *code++ = repeat_min >> 8;
1660 *code++ = repeat_min & 255;
1661 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
1662 *code++ = repeat_max >> 8;
1663 *code++ = repeat_max & 255;
1664 }
1665 }
1666
1667 /* If previous was a bracket group, we may have to replicate it in certain
1668 cases. If the maximum repeat count is unlimited, check that the bracket
1669 group cannot match the empty string, and diagnose an error if it can. */
1670
1671 else if ((int)*previous >= OP_BRA)
1672 {
1673 int i;
1674 int length = code - previous;
1675
1676 if (repeat_max == -1 && could_be_empty(previous))
1677 {
Guido van Rossum50700601997-12-08 17:15:20 +00001678 *errorptr = ERR10;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001679 goto FAILED;
1680 }
1681
1682 /* If the minimum is greater than zero, and the maximum is unlimited or
1683 equal to the minimum, the first copy remains where it is, and is
1684 replicated up to the minimum number of times. This case includes the +
1685 repeat, but of course no replication is needed in that case. */
1686
1687 if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))
1688 {
1689 for (i = 1; i < repeat_min; i++)
1690 {
1691 memcpy(code, previous, length);
1692 code += length;
1693 }
1694 }
1695
1696 /* If the minimum is zero, stick BRAZERO in front of the first copy.
1697 Then, if there is a fixed upper limit, replicated up to that many times,
1698 sticking BRAZERO in front of all the optional ones. */
1699
1700 else
1701 {
1702 if (repeat_min == 0)
1703 {
1704 memmove(previous+1, previous, length);
1705 code++;
1706 *previous++ = OP_BRAZERO + repeat_type;
1707 }
1708
1709 for (i = 1; i < repeat_min; i++)
1710 {
1711 memcpy(code, previous, length);
1712 code += length;
1713 }
1714
1715 for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)
1716 {
1717 *code++ = OP_BRAZERO + repeat_type;
1718 memcpy(code, previous, length);
1719 code += length;
1720 }
1721 }
1722
1723 /* If the maximum is unlimited, set a repeater in the final copy. */
1724
1725 if (repeat_max == -1) code[-3] = OP_KETRMAX + repeat_type;
1726 }
1727
1728 /* Else there's some kind of shambles */
1729
1730 else
1731 {
Guido van Rossum50700601997-12-08 17:15:20 +00001732 *errorptr = ERR11;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001733 goto FAILED;
1734 }
1735
1736 /* In all case we no longer have a previous item. */
1737
1738 previous = NULL;
1739 break;
1740
1741
1742 /* Start of nested bracket sub-expression, or comment or lookahead.
1743 First deal with special things that can come after a bracket; all are
1744 introduced by ?, and the appearance of any of them means that this is not a
1745 referencing group. They were checked for validity in the first pass over
1746 the string, so we don't have to check for syntax errors here. */
1747
1748 case '(':
1749 previous = code; /* Only real brackets can be repeated */
1750 if (*(++ptr) == '?')
1751 {
1752 bravalue = OP_BRA;
1753
1754 switch (*(++ptr))
1755 {
1756 case '#':
1757 case 'i':
Guido van Rossumbd49ac41997-12-10 23:05:53 +00001758 case 'L':
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001759 case 'm':
1760 case 's':
1761 case 'x':
1762 ptr++;
1763 while (*ptr != ')') ptr++;
1764 previous = NULL;
1765 continue;
1766
1767 case ':': /* Non-extracting bracket */
1768 ptr++;
1769 break;
1770
1771 case '=': /* Assertions can't be repeated */
1772 bravalue = OP_ASSERT;
1773 ptr++;
1774 previous = NULL;
1775 break;
1776
1777 case '!':
1778 bravalue = OP_ASSERT_NOT;
1779 ptr++;
1780 previous = NULL;
1781 break;
1782
1783 case ('P'):
1784 ptr++;
1785 if (*ptr=='<')
1786 {
1787 /* (?P<groupname>...) */
1788 int idlen;
1789 PyObject *string, *intobj;
1790
1791 ptr++;
1792 idlen = get_group_id(ptr, '>', errorptr);
1793 if (*errorptr) {
1794 goto FAILED;
1795 }
Guido van Rossum57ba4f31997-12-02 20:40:28 +00001796 string = PyString_FromStringAndSize((char*)ptr, idlen);
Guido van Rossum50700601997-12-08 17:15:20 +00001797 intobj = PyInt_FromLong( brackets[0] + 1 );
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001798 if (intobj == NULL || string==NULL)
1799 {
1800 Py_XDECREF(string);
1801 Py_XDECREF(intobj);
1802 *errorptr = "exception raised";
1803 goto FAILED;
1804 }
1805 PyDict_SetItem(dictionary, string, intobj);
1806 Py_DECREF(string); Py_DECREF(intobj);
1807 ptr += idlen+1; /* Point to rest of expression */
1808 goto do_grouping_bracket;
1809 }
1810 if (*ptr=='=')
1811 {
1812 /* (?P=groupname) */
1813 int idlen, refnum;
1814 PyObject *string, *intobj;
1815
1816 ptr++;
1817 idlen = get_group_id(ptr, ')', errorptr);
1818 if (*errorptr) {
1819 goto FAILED;
1820 }
Guido van Rossum50700601997-12-08 17:15:20 +00001821 string = PyString_FromStringAndSize((char *)ptr, idlen);
Guido van Rossumc3861071997-10-08 02:07:40 +00001822 if (string==NULL) {
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001823 Py_XDECREF(string);
1824 *errorptr = "exception raised";
1825 goto FAILED;
1826 }
1827 intobj = PyDict_GetItem(dictionary, string);
1828 if (intobj==NULL) {
Guido van Rossumc3861071997-10-08 02:07:40 +00001829 Py_DECREF(string);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001830 *errorptr = "?P= group identifier isn't defined";
1831 goto FAILED;
1832 }
1833
1834 refnum = PyInt_AsLong(intobj);
Guido van Rossum1eadb411997-12-15 17:33:24 +00001835 Py_DECREF(string);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001836 *code++ = OP_REF;
1837 *code++ = refnum;
1838 /* The continue will cause the top-level for() loop to
1839 be resumed, so ptr will be immediately incremented.
1840 Therefore, the following line adds just idlen, not
1841 idlen+1 */
1842 ptr += idlen;
1843 continue;
1844 }
1845 /* The character after ?P is neither < nor =, so
1846 report an error. Add more Python-extensions here. */
1847 *errorptr="unknown after (?P";
1848 goto FAILED;
1849 break;
Guido van Rossum50700601997-12-08 17:15:20 +00001850
1851 case '>': /* "Match once" brackets */
1852 if ((options & PCRE_EXTRA) != 0) /* Not yet standard */
1853 {
1854 bravalue = OP_ONCE;
1855 ptr++;
1856 previous = NULL;
1857 break;
1858 }
1859 /* Else fall through */
1860
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001861 default:
Guido van Rossum50700601997-12-08 17:15:20 +00001862 *errorptr = ERR12;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001863 goto FAILED;
1864 }
1865 }
1866
1867 /* Else we have a referencing group */
1868
1869 else
1870 {
1871 do_grouping_bracket:
Guido van Rossum50700601997-12-08 17:15:20 +00001872 if (++(*brackets) > EXTRACT_MAX)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001873 {
Guido van Rossum50700601997-12-08 17:15:20 +00001874 *errorptr = ERR13;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001875 goto FAILED;
1876 }
Guido van Rossum50700601997-12-08 17:15:20 +00001877 bravalue = OP_BRA + *brackets;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001878 }
1879
1880 /* Process nested bracketed re; at end pointer is on the bracket. We copy
1881 code into a non-register variable in order to be able to pass its address
1882 because some compilers complain otherwise. */
1883
1884 *code = bravalue;
1885 {
1886 uschar *mcode = code;
Guido van Rossum50700601997-12-08 17:15:20 +00001887 if (!compile_regex(options, brackets, &mcode, &ptr, errorptr, dictionary))
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001888 goto FAILED;
1889 code = mcode;
1890 }
1891
1892 if (*ptr != ')')
1893 {
Guido van Rossum50700601997-12-08 17:15:20 +00001894 *errorptr = ERR14;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001895 goto FAILED;
1896 }
1897 break;
1898
1899 /* Check \ for being a real metacharacter; if not, fall through and handle
1900 it as a data character at the start of a string. Escape items are checked
1901 for validity in the pre-compiling pass. */
1902
1903 case '\\':
1904 oldptr = ptr;
Guido van Rossum50700601997-12-08 17:15:20 +00001905 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001906
1907 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1908 are arranged to be the negation of the corresponding OP_values. For the
1909 back references, the values are ESC_REF plus the reference number. Only
1910 back references and those types that consume a character may be repeated.
1911 We can test for values between ESC_b and ESC_Z for the latter; this may
1912 have to change if any new ones are ever created. */
1913
1914 if (c < 0)
1915 {
1916 if (-c >= ESC_REF)
1917 {
Guido van Rossum50700601997-12-08 17:15:20 +00001918 int refnum = -c - ESC_REF;
1919 if (*brackets < refnum)
1920 {
1921 *errorptr = ERR15;
1922 goto FAILED;
1923 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001924 previous = code;
1925 *code++ = OP_REF;
1926 *code++ = refnum;
1927 }
1928 else
1929 {
Guido van Rossum50700601997-12-08 17:15:20 +00001930 previous = (-c > ESC_b && -c < ESC_X)? code : NULL;
1931 if ( (options & PCRE_LOCALE) != 0)
1932 {
1933 switch (c)
1934 {
1935 case (-ESC_b): c = -OP_WORD_BOUNDARY_L; break;
1936 case (-ESC_B): c = -OP_NOT_WORD_BOUNDARY_L; break;
1937 case (-ESC_w): c = -OP_WORDCHAR_L; break;
1938 case (-ESC_W): c = -OP_NOT_WORDCHAR_L; break;
1939 }
1940 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001941 *code++ = -c;
1942 }
1943 continue;
1944 }
1945
1946 /* Reset and fall through */
1947
1948 ptr = oldptr;
1949 c = '\\';
1950
1951 /* Handle a run of data characters until a metacharacter is encountered.
1952 The first character is guaranteed not to be whitespace or # when the
1953 extended flag is set. */
1954
Guido van Rossum50700601997-12-08 17:15:20 +00001955 NORMAL_CHAR:
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001956 default:
1957 previous = code;
1958 *code = OP_CHARS;
1959 code += 2;
1960 length = 0;
1961
1962 do
1963 {
Guido van Rossum50700601997-12-08 17:15:20 +00001964 if ((options & PCRE_EXTENDED) != 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001965 {
1966 if ((pcre_ctypes[c] & ctype_space) != 0) continue;
1967 if (c == '#')
1968 {
1969 while ((c = *(++ptr)) != 0 && c != '\n');
1970 if (c == 0) break;
1971 continue;
1972 }
1973 }
1974
1975 /* Backslash may introduce a data char or a metacharacter. Escaped items
1976 are checked for validity in the pre-compiling pass. Stop the string
1977 before a metaitem. */
1978
1979 if (c == '\\')
1980 {
1981 oldptr = ptr;
Guido van Rossum50700601997-12-08 17:15:20 +00001982 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001983 if (c < 0) { ptr = oldptr; break; }
1984 }
1985
1986 /* Ordinary character or single-char escape */
1987
1988 *code++ = c;
1989 length++;
1990 }
1991
1992 /* This "while" is the end of the "do" above. */
1993
1994 while (length < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
1995
1996 /* Compute the length and set it in the data vector, and advance to
1997 the next state. */
1998
1999 previous[1] = length;
2000 ptr--;
2001 break;
2002 }
2003 } /* end of big loop */
2004
2005/* Control never reaches here by falling through, only by a goto for all the
2006error states. Pass back the position in the pattern so that it can be displayed
2007to the user for diagnosing the error. */
2008
2009FAILED:
2010*ptrptr = ptr;
2011return FALSE;
2012}
2013
2014
2015
2016
2017/*************************************************
2018* Compile sequence of alternatives *
2019*************************************************/
2020
2021/* On entry, ptr is pointing past the bracket character, but on return
2022it points to the closing bracket, or vertical bar, or end of string.
2023The code variable is pointing at the byte into which the BRA operator has been
2024stored.
2025
2026Argument:
Guido van Rossum50700601997-12-08 17:15:20 +00002027 options the option bits
2028 brackets -> int containing the number of extracting brackets used
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002029 codeptr -> the address of the current code pointer
2030 ptrptr -> the address of the current pattern pointer
2031 errorptr -> pointer to error message
2032
2033Returns: TRUE on success
2034*/
2035
2036static BOOL
Guido van Rossum50700601997-12-08 17:15:20 +00002037compile_regex(int options, int *brackets, uschar **codeptr,
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002038 uschar **ptrptr, char **errorptr, PyObject *dictionary)
2039{
2040uschar *ptr = *ptrptr;
2041uschar *code = *codeptr;
2042uschar *start_bracket = code;
2043
2044for (;;)
2045 {
2046 int length;
2047 uschar *last_branch = code;
2048
2049 code += 3;
Guido van Rossum50700601997-12-08 17:15:20 +00002050 if (!compile_branch(options, brackets, &code, &ptr, errorptr, dictionary))
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002051 {
2052 *ptrptr = ptr;
2053 return FALSE;
2054 }
2055
2056 /* Fill in the length of the last branch */
2057
2058 length = code - last_branch;
2059 last_branch[1] = length >> 8;
2060 last_branch[2] = length & 255;
2061
2062 /* Reached end of expression, either ')' or end of pattern. Insert a
2063 terminating ket and the length of the whole bracketed item, and return,
2064 leaving the pointer at the terminating char. */
2065
2066 if (*ptr != '|')
2067 {
2068 length = code - start_bracket;
2069 *code++ = OP_KET;
2070 *code++ = length >> 8;
2071 *code++ = length & 255;
2072 *codeptr = code;
2073 *ptrptr = ptr;
2074 return TRUE;
2075 }
2076
2077 /* Another branch follows; insert an "or" node and advance the pointer. */
2078
2079 *code = OP_ALT;
2080 ptr++;
2081 }
2082/* Control never reaches here */
2083}
2084
2085
2086
2087/*************************************************
2088* Check for anchored expression *
2089*************************************************/
2090
2091/* Try to find out if this is an anchored regular expression. Consider each
2092alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
2093all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
2094it's anchored. However, if this is a multiline pattern, then only OP_SOD
2095counts, since OP_CIRC can match in the middle.
2096
2097A branch is also implicitly anchored if it starts with .* because that will try
2098the rest of the pattern at all possible matching points, so there is no point
2099trying them again.
2100
2101Argument: points to start of expression (the bracket)
2102Returns: TRUE or FALSE
2103*/
2104
2105static BOOL
2106is_anchored(register uschar *code, BOOL multiline)
2107{
2108do {
2109 int op = (int)code[3];
Guido van Rossum50700601997-12-08 17:15:20 +00002110 if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002111 { if (!is_anchored(code+3, multiline)) return FALSE; }
2112 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
2113 { if (code[4] != OP_ANY) return FALSE; }
2114 else if (op != OP_SOD && (multiline || op != OP_CIRC)) return FALSE;
2115 code += (code[1] << 8) + code[2];
2116 }
2117while (*code == OP_ALT);
2118return TRUE;
2119}
2120
2121
2122
2123/*************************************************
2124* Check for start with \n line expression *
2125*************************************************/
2126
2127/* This is called for multiline expressions to try to find out if every branch
2128starts with ^ so that "first char" processing can be done to speed things up.
2129
2130Argument: points to start of expression (the bracket)
2131Returns: TRUE or FALSE
2132*/
2133
2134static BOOL
2135is_startline(uschar *code)
2136{
2137do {
2138 if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
2139 { if (!is_startline(code+3)) return FALSE; }
2140 else if (code[3] != OP_CIRC) return FALSE;
2141 code += (code[1] << 8) + code[2];
2142 }
2143while (*code == OP_ALT);
2144return TRUE;
2145}
2146
2147
2148
2149/*************************************************
2150* Check for fixed first char *
2151*************************************************/
2152
2153/* Try to find out if there is a fixed first character. This is called for
2154unanchored expressions, as it speeds up their processing quite considerably.
2155Consider each alternative branch. If they all start with the same char, or with
2156a bracket all of whose alternatives start with the same char (recurse ad lib),
2157then we return that char, otherwise -1.
2158
2159Argument: points to start of expression (the bracket)
2160Returns: -1 or the fixed first char
2161*/
2162
2163static int
2164find_firstchar(uschar *code)
2165{
2166register int c = -1;
2167do
2168 {
2169 register int charoffset = 4;
2170
2171 if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
2172 {
2173 register int d;
2174 if ((d = find_firstchar(code+3)) < 0) return -1;
2175 if (c < 0) c = d; else if (c != d) return -1;
2176 }
2177
2178 else switch(code[3])
2179 {
2180 default:
2181 return -1;
2182
2183 case OP_EXACT: /* Fall through */
2184 charoffset++;
2185
2186 case OP_CHARS: /* Fall through */
2187 charoffset++;
2188
2189 case OP_PLUS:
2190 case OP_MINPLUS:
2191 if (c < 0) c = code[charoffset]; else if (c != code[charoffset]) return -1;
2192 break;
2193 }
2194 code += (code[1] << 8) + code[2];
2195 }
2196while (*code == OP_ALT);
2197return c;
2198}
2199
2200
2201
2202/*************************************************
2203* Compile a Regular Expression *
2204*************************************************/
2205
2206/* This function takes a string and returns a pointer to a block of store
2207holding a compiled version of the expression.
2208
2209Arguments:
2210 pattern the regular expression
2211 options various option bits
2212 errorptr pointer to pointer to error text
2213 erroroffset ptr offset in pattern where error was detected
2214
2215Returns: pointer to compiled data block, or NULL on error,
2216 with errorptr and erroroffset set
2217*/
2218
2219pcre *
Guido van Rossum50700601997-12-08 17:15:20 +00002220pcre_compile(const char *pattern, int options, char **errorptr,
2221 int *erroroffset, PyObject *dictionary)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002222{
2223real_pcre *re;
2224int spaces = 0;
2225int length = 3; /* For initial BRA plus length */
2226int runlength;
2227int c, size;
Guido van Rossum50700601997-12-08 17:15:20 +00002228int bracount = 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002229int brastack[200];
2230int brastackptr = 0;
Guido van Rossum50700601997-12-08 17:15:20 +00002231int top_backref = 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002232uschar *code, *ptr;
2233
2234#ifdef DEBUG
2235uschar *code_base, *code_end;
2236#endif
2237
Guido van Rossum50700601997-12-08 17:15:20 +00002238/* We can't pass back an error message if errorptr is NULL; I guess the best we
2239can do is just return NULL. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002240
Guido van Rossum50700601997-12-08 17:15:20 +00002241if (errorptr == NULL) return NULL;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002242*errorptr = NULL;
Guido van Rossum50700601997-12-08 17:15:20 +00002243
2244/* However, we can give a message for this error */
2245
2246if (erroroffset == NULL)
2247 {
2248 *errorptr = ERR16;
2249 return NULL;
2250 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002251*erroroffset = 0;
2252
2253if ((options & ~PUBLIC_OPTIONS) != 0)
2254 {
Guido van Rossum50700601997-12-08 17:15:20 +00002255 *errorptr = ERR17;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002256 return NULL;
2257 }
2258
2259#ifdef DEBUG
2260printf("------------------------------------------------------------------\n");
2261printf("%s\n", pattern);
2262#endif
2263
2264/* The first thing to do is to make a pass over the pattern to compute the
2265amount of store required to hold the compiled code. This does not have to be
2266perfect as long as errors are overestimates. At the same time we can detect any
2267internal flag settings. Make an attempt to correct for any counted white space
2268if an "extended" flag setting appears late in the pattern. We can't be so
2269clever for #-comments. */
2270
2271ptr = (uschar *)(pattern - 1);
2272while ((c = *(++ptr)) != 0)
2273 {
Guido van Rossum50700601997-12-08 17:15:20 +00002274 int min, max;
2275 int class_charcount;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002276
2277 if ((pcre_ctypes[c] & ctype_space) != 0)
2278 {
Guido van Rossum50700601997-12-08 17:15:20 +00002279 if ((options & PCRE_EXTENDED) != 0) continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002280 spaces++;
2281 }
2282
Guido van Rossum50700601997-12-08 17:15:20 +00002283 if (c == '#' && (options & PCRE_EXTENDED) != 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002284 {
2285 while ((c = *(++ptr)) != 0 && c != '\n');
2286 continue;
2287 }
2288
2289 switch(c)
2290 {
2291 /* A backslashed item may be an escaped "normal" character or a
2292 character type. For a "normal" character, put the pointers and
2293 character back so that tests for whitespace etc. in the input
2294 are done correctly. */
2295
2296 case '\\':
2297 {
2298 uschar *save_ptr = ptr;
Guido van Rossum50700601997-12-08 17:15:20 +00002299 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002300 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2301 if (c >= 0)
2302 {
2303 ptr = save_ptr;
2304 c = '\\';
2305 goto NORMAL_CHAR;
2306 }
2307 }
2308 length++;
2309
2310 /* A back reference needs an additional char, plus either one or 5
Guido van Rossum50700601997-12-08 17:15:20 +00002311 bytes for a repeat. We also need to keep the value of the highest
2312 back reference. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002313
2314 if (c <= -ESC_REF)
2315 {
Guido van Rossum50700601997-12-08 17:15:20 +00002316 int refnum = -c - ESC_REF;
2317 if (refnum > top_backref) top_backref = refnum;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002318 length++; /* For single back reference */
Guido van Rossum50700601997-12-08 17:15:20 +00002319 if (ptr[1] == '{' && is_counted_repeat(ptr+2))
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002320 {
2321 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
2322 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2323 if ((min == 0 && (max == 1 || max == -1)) ||
2324 (min == 1 && max == -1))
2325 length++;
2326 else length += 5;
2327 if (ptr[1] == '?') ptr++;
2328 }
2329 }
2330 continue;
2331
2332 case '^':
2333 case '.':
2334 case '$':
2335 case '*': /* These repeats won't be after brackets; */
2336 case '+': /* those are handled separately */
2337 case '?':
2338 length++;
2339 continue;
2340
2341 /* This covers the cases of repeats after a single char, metachar, class,
2342 or back reference. */
2343
2344 case '{':
Guido van Rossum50700601997-12-08 17:15:20 +00002345 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002346 ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);
2347 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2348 if ((min == 0 && (max == 1 || max == -1)) ||
2349 (min == 1 && max == -1))
2350 length++;
2351 else
2352 {
2353 length--; /* Uncount the original char or metachar */
2354 if (min == 1) length++; else if (min > 0) length += 4;
2355 if (max > 0) length += 4; else length += 2;
2356 }
2357 if (ptr[1] == '?') ptr++;
2358 continue;
2359
2360 /* An alternation contains an offset to the next branch or ket. */
2361 case '|':
2362 length += 3;
2363 continue;
2364
Guido van Rossum50700601997-12-08 17:15:20 +00002365 /* A character class uses 33 characters. Don't worry about character types
2366 that aren't allowed in classes - they'll get picked up during the compile.
2367 A character class that contains only one character uses 2 or 3 bytes,
2368 depending on whether it is negated or not. Notice this where we can. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002369
2370 case '[':
Guido van Rossum50700601997-12-08 17:15:20 +00002371 class_charcount = 0;
2372 if (*(++ptr) == '^') ptr++;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002373 do
2374 {
Guido van Rossum50700601997-12-08 17:15:20 +00002375 if (*ptr == '\\')
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002376 {
Guido van Rossum50700601997-12-08 17:15:20 +00002377 int c = check_escape(&ptr, errorptr, bracount, options, TRUE);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002378 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
Guido van Rossum50700601997-12-08 17:15:20 +00002379 if (-c == ESC_b) class_charcount++; else class_charcount = 10;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002380 }
Guido van Rossum50700601997-12-08 17:15:20 +00002381 else class_charcount++;
2382 ptr++;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002383 }
2384 while (*ptr != 0 && *ptr != ']');
2385
Guido van Rossum50700601997-12-08 17:15:20 +00002386 /* Repeats for negated single chars are handled by the general code */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002387
Guido van Rossum50700601997-12-08 17:15:20 +00002388 if (class_charcount == 1) length += 3; else
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002389 {
Guido van Rossum50700601997-12-08 17:15:20 +00002390 length += 33;
2391 if (options & PCRE_LOCALE) length++; /* Add a byte for the localization flag */
2392
2393 /* A repeat needs either 1 or 5 bytes. */
2394
2395 if (ptr[1] == '{' && is_counted_repeat(ptr+2))
2396 {
2397 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
2398 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2399 if ((min == 0 && (max == 1 || max == -1)) ||
2400 (min == 1 && max == -1))
2401 length++;
2402 else length += 5;
2403 if (ptr[1] == '?') ptr++;
2404 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002405 }
2406 continue;
2407
2408 /* Brackets may be genuine groups or special things */
2409
2410 case '(':
2411
2412 /* Handle special forms of bracket, which all start (? */
2413
2414 if (ptr[1] == '?') switch (c = ptr[2])
2415 {
2416 /* Skip over comments entirely */
2417 case '#':
2418 ptr += 3;
2419 while (*ptr != 0 && *ptr != ')') ptr++;
2420 if (*ptr == 0)
2421 {
Guido van Rossum50700601997-12-08 17:15:20 +00002422 *errorptr = ERR18;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002423 goto PCRE_ERROR_RETURN;
2424 }
2425 continue;
2426
2427 /* Non-referencing groups and lookaheads just move the pointer on, and
Guido van Rossum50700601997-12-08 17:15:20 +00002428 then behave like a non-special bracket, except that they don't increment
2429 the count of extracting brackets. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002430
2431 case ':':
2432 case '=':
2433 case '!':
2434 ptr += 2;
2435 break;
2436
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002437 case ('P'):
2438 {
2439 int idlen;
2440 switch (*ptr++) {
2441 case ('<'):
2442 idlen = get_group_id(ptr++, '>', errorptr);
2443 if (*errorptr) goto PCRE_ERROR_RETURN;
2444 ptr += idlen+1;
2445 break;
2446 case ('='):
2447 idlen = get_group_id(ptr++, ')', errorptr);
2448 if (*errorptr) goto PCRE_ERROR_RETURN;
2449 ptr += idlen+1;
2450 length++;
2451 break;
2452 }
2453 }
2454 break;
2455
Guido van Rossum50700601997-12-08 17:15:20 +00002456 /* Ditto for the "once only" bracket, allowed only if the extra bit
2457 is set. */
2458
2459 case '>':
2460 if ((options & PCRE_EXTRA) != 0)
2461 {
2462 ptr += 2;
2463 break;
2464 }
2465 /* Else fall thourh */
2466
2467 /* Else loop setting valid options until ) is met. Anything else is an
2468 error. */
2469
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002470 default:
2471 ptr += 2;
2472 for (;; ptr++)
2473 {
2474 if ((c = *ptr) == 'i')
2475 {
2476 options |= PCRE_CASELESS;
2477 continue;
2478 }
Guido van Rossumbd49ac41997-12-10 23:05:53 +00002479 else if ((c = *ptr) == 'L')
Guido van Rossum50700601997-12-08 17:15:20 +00002480 {
2481 options |= PCRE_LOCALE;
2482 continue;
2483 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002484 else if ((c = *ptr) == 'm')
2485 {
2486 options |= PCRE_MULTILINE;
2487 continue;
2488 }
Guido van Rossum50700601997-12-08 17:15:20 +00002489 else if (c == 's')
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002490 {
2491 options |= PCRE_DOTALL;
2492 continue;
2493 }
2494 else if (c == 'x')
2495 {
2496 options |= PCRE_EXTENDED;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002497 length -= spaces; /* Already counted spaces */
2498 continue;
2499 }
2500 else if (c == ')') break;
2501
Guido van Rossum50700601997-12-08 17:15:20 +00002502 *errorptr = ERR12;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002503 goto PCRE_ERROR_RETURN;
2504 }
2505 continue; /* End of this bracket handling */
2506 }
2507
Guido van Rossum50700601997-12-08 17:15:20 +00002508 /* Extracting brackets must be counted so we can process escapes in a
2509 Perlish way. */
2510
2511 else bracount++;
2512
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002513 /* Non-special forms of bracket. Save length for computing whole length
2514 at end if there's a repeat that requires duplication of the group. */
2515
2516 if (brastackptr >= sizeof(brastack)/sizeof(int))
2517 {
Guido van Rossum50700601997-12-08 17:15:20 +00002518 *errorptr = ERR19;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002519 goto PCRE_ERROR_RETURN;
2520 }
2521
2522 brastack[brastackptr++] = length;
2523 length += 3;
2524 continue;
2525
2526 /* Handle ket. Look for subsequent max/min; for certain sets of values we
2527 have to replicate this bracket up to that many times. */
2528
2529 case ')':
2530 length += 3;
2531 {
2532 int min = 1;
2533 int max = 1;
2534 int duplength = length - brastack[--brastackptr];
2535
2536 /* Leave ptr at the final char; for read_repeat_counts this happens
2537 automatically; for the others we need an increment. */
2538
Guido van Rossum50700601997-12-08 17:15:20 +00002539 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002540 {
2541 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
2542 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2543 }
2544 else if (c == '*') { min = 0; max = -1; ptr++; }
2545 else if (c == '+') { max = -1; ptr++; }
2546 else if (c == '?') { min = 0; ptr++; }
2547
2548 /* If there is a minimum > 1 we have to replicate up to min-1 times; if
2549 there is a limited maximum we have to replicate up to max-1 times and
2550 allow for a BRAZERO item before each optional copy, as we also have to
2551 do before the first copy if the minimum is zero. */
2552
2553 if (min == 0) length++;
2554 else if (min > 1) length += (min - 1) * duplength;
2555 if (max > min) length += (max - min) * (duplength + 1);
2556 }
2557
2558 continue;
2559
2560 /* Non-special character. For a run of such characters the length required
2561 is the number of characters + 2, except that the maximum run length is 255.
2562 We won't get a skipped space or a non-data escape or the start of a #
2563 comment as the first character, so the length can't be zero. */
2564
2565 NORMAL_CHAR:
2566 default:
2567 length += 2;
2568 runlength = 0;
2569 do
2570 {
2571 if ((pcre_ctypes[c] & ctype_space) != 0)
2572 {
Guido van Rossum50700601997-12-08 17:15:20 +00002573 if ((options & PCRE_EXTENDED) != 0) continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002574 spaces++;
2575 }
2576
Guido van Rossum50700601997-12-08 17:15:20 +00002577 if (c == '#' && (options & PCRE_EXTENDED) != 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002578 {
2579 while ((c = *(++ptr)) != 0 && c != '\n');
2580 continue;
2581 }
2582
2583 /* Backslash may introduce a data char or a metacharacter; stop the
2584 string before the latter. */
2585
2586 if (c == '\\')
2587 {
2588 uschar *saveptr = ptr;
Guido van Rossum50700601997-12-08 17:15:20 +00002589 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002590 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2591 if (c < 0) { ptr = saveptr; break; }
2592 }
2593
2594 /* Ordinary character or single-char escape */
2595
2596 runlength++;
2597 }
2598
2599 /* This "while" is the end of the "do" above. */
2600
2601 while (runlength < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
2602
2603 ptr--;
2604 length += runlength;
2605 continue;
2606 }
2607 }
2608
2609length += 4; /* For final KET and END */
2610
2611if (length > 65539)
2612 {
Guido van Rossum50700601997-12-08 17:15:20 +00002613 *errorptr = ERR20;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002614 return NULL;
2615 }
2616
2617/* Compute the size of data block needed and get it, either from malloc or
2618externally provided function. Put in the magic number and the options. */
2619
Guido van Rossum50700601997-12-08 17:15:20 +00002620size = length + offsetof(real_pcre, code);
2621re = (real_pcre *)(pcre_malloc)(size+50);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002622
2623if (re == NULL)
2624 {
Guido van Rossum50700601997-12-08 17:15:20 +00002625 *errorptr = ERR21;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002626 return NULL;
2627 }
2628
2629re->magic_number = MAGIC_NUMBER;
2630re->options = options;
2631
2632/* Set up a starting, non-extracting bracket, then compile the expression. On
2633error, *errorptr will be set non-NULL, so we don't need to look at the result
2634of the function here. */
2635
2636ptr = (uschar *)pattern;
2637code = re->code;
2638*code = OP_BRA;
Guido van Rossum50700601997-12-08 17:15:20 +00002639bracount = 0;
2640(void)compile_regex(options, &bracount, &code, &ptr, errorptr, dictionary);
2641re->top_bracket = bracount;
2642re->top_backref = top_backref;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002643
2644/* If not reached end of pattern on success, there's an excess bracket. */
2645
Guido van Rossum50700601997-12-08 17:15:20 +00002646if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
2647
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002648/* Fill in the terminating state and check for disastrous overflow, but
2649if debugging, leave the test till after things are printed out. */
2650
2651*code++ = OP_END;
2652
Guido van Rossum50700601997-12-08 17:15:20 +00002653
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002654#ifndef DEBUG
Guido van Rossum50700601997-12-08 17:15:20 +00002655if (code - re->code > length) *errorptr = ERR23;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002656#endif
2657
2658/* Failed to compile */
2659
2660if (*errorptr != NULL)
2661 {
2662 (pcre_free)(re);
2663 PCRE_ERROR_RETURN:
2664 *erroroffset = ptr - (uschar *)pattern;
2665 return NULL;
2666 }
2667
2668/* If the anchored option was not passed, set flag if we can determine that it
2669is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if
2670we can determine what the first character has to be, because that speeds up
2671unanchored matches no end. In the case of multiline matches, an alternative is
2672to set the PCRE_STARTLINE flag if all branches start with ^. */
2673
2674if ((options & PCRE_ANCHORED) == 0)
2675 {
2676 if (is_anchored(re->code, (options & PCRE_MULTILINE) != 0))
2677 re->options |= PCRE_ANCHORED;
2678 else
2679 {
2680 int c = find_firstchar(re->code);
2681 if (c >= 0)
2682 {
2683 re->first_char = c;
2684 re->options |= PCRE_FIRSTSET;
2685 }
2686 else if (is_startline(re->code))
2687 re->options |= PCRE_STARTLINE;
2688 }
2689 }
2690
2691/* Print out the compiled data for debugging */
2692
2693#ifdef DEBUG
2694
Guido van Rossum50700601997-12-08 17:15:20 +00002695printf("Length = %d top_bracket = %d top_backref=%d\n",
2696 length, re->top_bracket, re->top_backref);
2697
2698if (re->options != 0)
2699 {
2700 printf("%s%s%s%s%s%s%s\n",
2701 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
2702 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
2703 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
2704 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
2705 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
2706 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
2707 ((re->options & PCRE_EXTRA) != 0)? "extra " : "");
2708 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002709
2710if ((re->options & PCRE_FIRSTSET) != 0)
2711 {
2712 if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
2713 else printf("First char = \\x%02x\n", re->first_char);
2714 }
2715
2716code_end = code;
2717code_base = code = re->code;
2718
2719while (code < code_end)
2720 {
2721 int charlength;
2722
2723 printf("%3d ", code - code_base);
2724
2725 if (*code >= OP_BRA)
2726 {
2727 printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
2728 code += 2;
2729 }
2730
2731 else switch(*code)
2732 {
2733 case OP_CHARS:
2734 charlength = *(++code);
2735 printf("%3d ", charlength);
2736 while (charlength-- > 0)
2737 if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
2738 break;
2739
2740 case OP_KETRMAX:
2741 case OP_KETRMIN:
2742 case OP_ALT:
2743 case OP_KET:
2744 case OP_ASSERT:
2745 case OP_ASSERT_NOT:
Guido van Rossum50700601997-12-08 17:15:20 +00002746 case OP_ONCE:
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002747 printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2748 code += 2;
2749 break;
2750
2751 case OP_STAR:
2752 case OP_MINSTAR:
2753 case OP_PLUS:
2754 case OP_MINPLUS:
2755 case OP_QUERY:
2756 case OP_MINQUERY:
2757 case OP_TYPESTAR:
2758 case OP_TYPEMINSTAR:
2759 case OP_TYPEPLUS:
2760 case OP_TYPEMINPLUS:
2761 case OP_TYPEQUERY:
2762 case OP_TYPEMINQUERY:
2763 if (*code >= OP_TYPESTAR)
2764 printf(" %s", OP_names[code[1]]);
2765 else if (isprint(c = code[1])) printf(" %c", c);
2766 else printf(" \\x%02x", c);
2767 printf("%s", OP_names[*code++]);
2768 break;
2769
2770 case OP_EXACT:
2771 case OP_UPTO:
2772 case OP_MINUPTO:
2773 if (isprint(c = code[3])) printf(" %c{", c);
2774 else printf(" \\x%02x{", c);
2775 if (*code != OP_EXACT) printf(",");
2776 printf("%d}", (code[1] << 8) + code[2]);
2777 if (*code == OP_MINUPTO) printf("?");
2778 code += 3;
2779 break;
2780
2781 case OP_TYPEEXACT:
2782 case OP_TYPEUPTO:
2783 case OP_TYPEMINUPTO:
2784 printf(" %s{", OP_names[code[3]]);
2785 if (*code != OP_TYPEEXACT) printf(",");
2786 printf("%d}", (code[1] << 8) + code[2]);
2787 if (*code == OP_TYPEMINUPTO) printf("?");
2788 code += 3;
2789 break;
2790
Guido van Rossum50700601997-12-08 17:15:20 +00002791 case OP_NOT:
2792 if (isprint(c = *(++code))) printf(" [^%c]", c);
2793 else printf(" [^\\x%02x]", c);
2794 break;
2795
2796 case OP_NOTSTAR:
2797 case OP_NOTMINSTAR:
2798 case OP_NOTPLUS:
2799 case OP_NOTMINPLUS:
2800 case OP_NOTQUERY:
2801 case OP_NOTMINQUERY:
2802 if (isprint(c = code[1])) printf(" [^%c]", c);
2803 else printf(" [^\\x%02x]", c);
2804 printf("%s", OP_names[*code++]);
2805 break;
2806
2807 case OP_NOTEXACT:
2808 case OP_NOTUPTO:
2809 case OP_NOTMINUPTO:
2810 if (isprint(c = code[3])) printf(" [^%c]{", c);
2811 else printf(" [^\\x%02x]{", c);
2812 if (*code != OP_NOTEXACT) printf(",");
2813 printf("%d}", (code[1] << 8) + code[2]);
2814 if (*code == OP_NOTMINUPTO) printf("?");
2815 code += 3;
2816 break;
2817
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002818 case OP_REF:
2819 printf(" \\%d", *(++code));
2820 break;
2821
2822 case OP_CLASS:
Guido van Rossum50700601997-12-08 17:15:20 +00002823 case OP_CLASS_L:
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002824 {
2825 int i, min, max;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002826
Guido van Rossum50700601997-12-08 17:15:20 +00002827 if (*code==OP_CLASS_L)
2828 {
2829 code++;
2830 printf("Locflag = %i ", *code++);
2831 }
2832 else
2833 code++;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002834
Guido van Rossum50700601997-12-08 17:15:20 +00002835 printf(" [");
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002836
Guido van Rossum50700601997-12-08 17:15:20 +00002837 for (i = 0; i < 256; i++)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002838 {
Guido van Rossum50700601997-12-08 17:15:20 +00002839 if ((code[i/8] & (1 << (i&7))) != 0)
2840 {
2841 int j;
2842 for (j = i+1; j < 256; j++)
2843 if ((code[j/8] & (1 << (j&7))) == 0) break;
2844 if (i == '-' || i == ']') printf("\\");
2845 if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
2846 if (--j > i)
2847 {
2848 printf("-");
2849 if (j == '-' || j == ']') printf("\\");
2850 if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
2851 }
2852 i = j;
2853 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002854 }
2855 printf("]");
Guido van Rossum50700601997-12-08 17:15:20 +00002856 code += 32;
2857 /* code ++;*/
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002858
Guido van Rossum50700601997-12-08 17:15:20 +00002859 switch(*code)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002860 {
2861 case OP_CRSTAR:
2862 case OP_CRMINSTAR:
2863 case OP_CRPLUS:
2864 case OP_CRMINPLUS:
2865 case OP_CRQUERY:
2866 case OP_CRMINQUERY:
2867 printf("%s", OP_names[*code]);
2868 break;
2869
2870 case OP_CRRANGE:
2871 case OP_CRMINRANGE:
2872 min = (code[1] << 8) + code[2];
2873 max = (code[3] << 8) + code[4];
2874 if (max == 0) printf("{%d,}", min);
2875 else printf("{%d,%d}", min, max);
2876 if (*code == OP_CRMINRANGE) printf("?");
2877 code += 4;
2878 break;
2879
2880 default:
2881 code--;
2882 }
2883 }
2884 break;
2885
2886 /* Anything else is just a one-node item */
2887
2888 default:
2889 printf(" %s", OP_names[*code]);
2890 break;
2891 }
2892
2893 code++;
2894 printf("\n");
2895 }
2896printf("------------------------------------------------------------------\n");
2897
2898/* This check is done here in the debugging case so that the code that
2899was compiled can be seen. */
2900
2901if (code - re->code > length)
2902 {
Guido van Rossum50700601997-12-08 17:15:20 +00002903 printf("length=%i, code length=%i\n", length, code-re->code);
2904 *errorptr = ERR23;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002905 (pcre_free)(re);
2906 *erroroffset = ptr - (uschar *)pattern;
2907 return NULL;
2908 }
2909#endif
2910
2911return (pcre *)re;
2912}
2913
2914
2915
2916/*************************************************
2917* Match a character type *
2918*************************************************/
2919
2920/* Not used in all the places it might be as it's sometimes faster
2921to put the code inline.
2922
2923Arguments:
2924 type the character type
2925 c the character
Guido van Rossum50700601997-12-08 17:15:20 +00002926 dotall the dotall flag
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002927
2928Returns: TRUE if character is of the type
2929*/
2930
2931static BOOL
2932match_type(int type, int c, BOOL dotall)
2933{
2934
2935#ifdef DEBUG
2936if (isprint(c)) printf("matching subject %c against ", c);
2937 else printf("matching subject \\x%02x against ", c);
2938printf("%s\n", OP_names[type]);
2939#endif
2940
2941switch(type)
2942 {
2943 case OP_ANY: return dotall || c != '\n';
2944 case OP_NOT_DIGIT: return (pcre_ctypes[c] & ctype_digit) == 0;
2945 case OP_DIGIT: return (pcre_ctypes[c] & ctype_digit) != 0;
2946 case OP_NOT_WHITESPACE: return (pcre_ctypes[c] & ctype_space) == 0;
2947 case OP_WHITESPACE: return (pcre_ctypes[c] & ctype_space) != 0;
2948 case OP_NOT_WORDCHAR: return (pcre_ctypes[c] & ctype_word) == 0;
2949 case OP_WORDCHAR: return (pcre_ctypes[c] & ctype_word) != 0;
Guido van Rossum50700601997-12-08 17:15:20 +00002950 case OP_NOT_WORDCHAR_L: return (c!='_' && !isalpha(c));
2951 case OP_WORDCHAR_L: return (c=='_' || isalpha(c));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002952 }
2953return FALSE;
2954}
2955
2956
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002957
2958/*************************************************
2959* Match a back-reference *
2960*************************************************/
2961
2962/* If a back reference hasn't been set, the match fails.
2963
2964Arguments:
2965 number reference number
2966 eptr points into the subject
2967 length length to be matched
2968 md points to match data block
2969
2970Returns: TRUE if matched
2971*/
2972
2973static BOOL
2974match_ref(int number, register uschar *eptr, int length, match_data *md)
2975{
2976uschar *p = md->start_subject + md->offset_vector[number];
2977
2978#ifdef DEBUG
2979if (eptr >= md->end_subject)
2980 printf("matching subject <null>");
2981else
2982 {
2983 printf("matching subject ");
2984 pchars(eptr, length, TRUE, md);
2985 }
2986printf(" against backref ");
2987pchars(p, length, FALSE, md);
2988printf("\n");
2989#endif
2990
2991/* Always fail if not enough characters left */
2992
2993if (length > md->end_subject - p) return FALSE;
2994
2995/* Separate the caselesss case for speed */
2996
2997if (md->caseless)
2998 { while (length-- > 0) if (pcre_lcc[*p++] != pcre_lcc[*eptr++]) return FALSE; }
2999else
3000 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
3001
3002return TRUE;
3003}
3004
3005static int free_stack(match_data *md)
3006{
3007/* Free any stack space that was allocated by the call to match(). */
3008if (md->off_num) free(md->off_num);
3009if (md->offset_top) free(md->offset_top);
3010if (md->r1) free(md->r1);
3011if (md->r2) free(md->r2);
3012if (md->eptr) free(md->eptr);
Guido van Rossumc3861071997-10-08 02:07:40 +00003013if (md->ecode) free(md->ecode);
3014return 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003015}
3016
3017static int grow_stack(match_data *md)
3018{
Guido van Rossum50700601997-12-08 17:15:20 +00003019 if (md->length != 0)
3020 {
3021 md->length = md->length + md->length/2;
3022 }
3023 else
3024 {
3025 int string_len = md->end_subject - md->start_subject + 1;
3026 if (string_len < 80) {md->length = string_len; }
3027 else {md->length = 80;}
3028 }
3029 PyMem_RESIZE(md->offset_top, int, md->length);
3030 PyMem_RESIZE(md->eptr, uschar *, md->length);
3031 PyMem_RESIZE(md->ecode, uschar *, md->length);
3032 PyMem_RESIZE(md->off_num, int, md->length);
3033 PyMem_RESIZE(md->r1, int, md->length);
3034 PyMem_RESIZE(md->r2, int, md->length);
3035 if (md->offset_top == NULL || md->eptr == NULL || md->ecode == NULL ||
3036 md->off_num == NULL || md->r1 == NULL || md->r2 == NULL)
3037 {
3038 PyErr_SetString(PyExc_MemoryError, "Can't increase failure stack for re operation");
3039 longjmp(md->error_env, 1);
3040 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003041 return 0;
3042}
3043
Guido van Rossum50700601997-12-08 17:15:20 +00003044
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003045/*************************************************
3046* Match from current position *
3047*************************************************/
3048
3049/* On entry ecode points to the first opcode, and eptr to the first character.
3050
3051Arguments:
3052 eptr pointer in subject
3053 ecode position in code
3054 offset_top current top pointer
3055 md pointer to "static" info for the match
3056
3057Returns: TRUE if matched
3058*/
3059
3060static BOOL
3061match(register uschar *eptr, register uschar *ecode, int offset_top,
3062 match_data *md)
3063{
3064 int save_stack_position = md->point;
3065match_loop:
3066
3067#define SUCCEED goto succeed
3068#define FAIL goto fail
3069
3070for (;;)
3071 {
3072 int min, max, ctype;
3073 register int i;
3074 register int c;
Guido van Rossum50700601997-12-08 17:15:20 +00003075 BOOL minimize;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003076
3077 /* Opening bracket. Check the alternative branches in turn, failing if none
3078 match. We have to set the start offset if required and there is space
3079 in the offset vector so that it is available for subsequent back references
3080 if the bracket matches. However, if the bracket fails, we must put back the
3081 previous value of both offsets in case they were set by a previous copy of
3082 the same bracket. Don't worry about setting the flag for the error case here;
3083 that is handled in the code for KET. */
3084
3085 if ((int)*ecode >= OP_BRA)
3086 {
3087 int number = (*ecode - OP_BRA) << 1;
Guido van Rossum50700601997-12-08 17:15:20 +00003088 int save_offset1, save_offset2;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003089
Guido van Rossum57ba4f31997-12-02 20:40:28 +00003090#ifdef DEBUG
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003091 printf("start bracket %d\n", number/2);
Guido van Rossum57ba4f31997-12-02 20:40:28 +00003092#endif
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003093
3094 if (number > 0 && number < md->offset_end)
3095 {
3096 save_offset1 = md->offset_vector[number];
3097 save_offset2 = md->offset_vector[number+1];
3098 md->offset_vector[number] = eptr - md->start_subject;
3099
Guido van Rossum57ba4f31997-12-02 20:40:28 +00003100#ifdef DEBUG
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003101 printf("saving %d %d\n", save_offset1, save_offset2);
Guido van Rossum57ba4f31997-12-02 20:40:28 +00003102#endif
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003103 }
3104
3105 /* Recurse for all the alternatives. */
3106
3107 do
3108 {
3109 if (match(eptr, ecode+3, offset_top, md)) SUCCEED;
3110 ecode += (ecode[1] << 8) + ecode[2];
3111 }
3112 while (*ecode == OP_ALT);
3113
Guido van Rossum57ba4f31997-12-02 20:40:28 +00003114#ifdef DEBUG
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003115 printf("bracket %d failed\n", number/2);
Guido van Rossum57ba4f31997-12-02 20:40:28 +00003116#endif
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003117
3118 if (number > 0 && number < md->offset_end)
3119 {
3120 md->offset_vector[number] = save_offset1;
3121 md->offset_vector[number+1] = save_offset2;
3122 }
3123
3124 FAIL;
3125 }
3126
3127 /* Other types of node can be handled by a switch */
3128
3129 switch(*ecode)
3130 {
3131 case OP_END:
3132 md->end_match_ptr = eptr; /* Record where we ended */
3133 md->end_offset_top = offset_top; /* and how many extracts were taken */
3134 SUCCEED;
3135
Guido van Rossum50700601997-12-08 17:15:20 +00003136 /* The equivalent of Prolog's "cut" - if the rest doesn't match, the
3137 whole thing doesn't match, so we have to get out via a longjmp(). */
3138
3139 case OP_CUT:
3140 if (match(eptr, ecode+1, offset_top, md)) SUCCEED;
3141 longjmp(md->fail_env, 1);
3142
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003143 /* Assertion brackets. Check the alternative branches in turn - the
3144 matching won't pass the KET for an assertion. If any one branch matches,
3145 the assertion is true. */
3146
3147 case OP_ASSERT:
3148 do
3149 {
3150 if (match(eptr, ecode+3, offset_top, md)) break;
3151 ecode += (ecode[1] << 8) + ecode[2];
3152 }
3153 while (*ecode == OP_ALT);
3154 if (*ecode == OP_KET) FAIL;
3155
3156 /* Continue from after the assertion, updating the offsets high water
3157 mark, since extracts may have been taken during the assertion. */
3158
3159 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3160 ecode += 3;
3161 offset_top = md->end_offset_top;
3162 continue;
3163
3164 /* Negative assertion: all branches must fail to match */
3165
3166 case OP_ASSERT_NOT:
3167 do
3168 {
3169 if (match(eptr, ecode+3, offset_top, md)) FAIL;
3170 ecode += (ecode[1] << 8) + ecode[2];
3171 }
3172 while (*ecode == OP_ALT);
3173 ecode += 3;
3174 continue;
3175
Guido van Rossum50700601997-12-08 17:15:20 +00003176 /* "Once" brackets are like assertion brackets except that after a match,
3177 the point in the subject string is not moved back. Thus there can never be
3178 a move back into the brackets. Check the alternative branches in turn - the
3179 matching won't pass the KET for this kind of subpattern. If any one branch
3180 matches, we carry on, leaving the subject pointer. */
3181
3182 case OP_ONCE:
3183 do
3184 {
3185 if (match(eptr, ecode+3, offset_top, md)) break;
3186 ecode += (ecode[1] << 8) + ecode[2];
3187 }
3188 while (*ecode == OP_ALT);
3189 if (*ecode == OP_KET) return FALSE;
3190
3191 /* Continue as from after the assertion, updating the offsets high water
3192 mark, since extracts may have been taken. */
3193
3194 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3195 ecode += 3;
3196 offset_top = md->end_offset_top;
3197 eptr = md->end_match_ptr;
3198 continue;
3199
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003200 /* An alternation is the end of a branch; scan along to find the end of the
3201 bracketed group and go to there. */
3202
3203 case OP_ALT:
3204 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3205 break;
3206
3207 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
3208 that it may occur zero times. It may repeat infinitely, or not at all -
3209 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
3210 repeat limits are compiled as a number of copies, with the optional ones
3211 preceded by BRAZERO or BRAMINZERO. */
3212
3213 case OP_BRAZERO:
3214 {
3215 uschar *next = ecode+1;
3216 if (match(eptr, next, offset_top, md)) SUCCEED;
3217 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3218 ecode = next + 3;
3219 }
3220 break;
3221
3222 case OP_BRAMINZERO:
3223 {
3224 uschar *next = ecode+1;
3225 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3226 if (match(eptr, next+3, offset_top, md)) SUCCEED;
3227 ecode++;
3228 }
3229 break;;
3230
3231 /* End of a group, repeated or non-repeating. If we are at the end of
3232 an assertion "group", stop matching and SUCCEED, but record the
3233 current high water mark for use by positive assertions. */
3234
3235 case OP_KET:
3236 case OP_KETRMIN:
3237 case OP_KETRMAX:
3238 {
Guido van Rossum50700601997-12-08 17:15:20 +00003239 int number;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003240 uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
3241
Guido van Rossum50700601997-12-08 17:15:20 +00003242 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT || *prev == OP_ONCE)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003243 {
Guido van Rossum50700601997-12-08 17:15:20 +00003244 md->end_match_ptr = eptr; /* For ONCE */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003245 md->end_offset_top = offset_top;
3246 SUCCEED;
3247 }
3248
3249 /* In all other cases we have to check the group number back at the
3250 start and if necessary complete handling an extraction by setting the
3251 final offset and bumping the high water mark. */
3252
3253 number = (*prev - OP_BRA) << 1;
3254
Guido van Rossum57ba4f31997-12-02 20:40:28 +00003255#ifdef DEBUG
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003256 printf("end bracket %d\n", number/2);
Guido van Rossum57ba4f31997-12-02 20:40:28 +00003257#endif
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003258
3259 if (number > 0)
3260 {
3261 if (number >= md->offset_end) md->offset_overflow = TRUE; else
3262 {
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003263 md->offset_vector[number+1] = eptr - md->start_subject;
3264 if (offset_top <= number) offset_top = number + 2;
3265 }
3266 }
3267
3268 /* For a non-repeating ket, just advance to the next node and continue at
3269 this level. */
3270
3271 if (*ecode == OP_KET)
3272 {
3273 ecode += 3;
3274 break;
3275 }
3276
3277 /* The repeating kets try the rest of the pattern or restart from the
3278 preceding bracket, in the appropriate order. */
3279
3280 if (*ecode == OP_KETRMIN)
3281 {
3282 uschar *ptr;
3283 if (match(eptr, ecode+3, offset_top, md)) goto succeed;
3284 /* Handle alternation inside the BRA...KET; push the additional
3285 alternatives onto the stack
3286 XXX this tries the alternatives backwards! */
3287 ptr=prev;
3288 do {
3289 ptr += (ptr[1]<<8)+ ptr[2];
3290 if (*ptr==OP_ALT)
3291 {
Guido van Rossum50700601997-12-08 17:15:20 +00003292 if (md->length == md->point)
3293 {
3294 grow_stack(md);
3295 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003296 md->offset_top[md->point] = offset_top;
3297 md->eptr[md->point] = eptr;
3298 md->ecode[md->point] = ptr+3;
3299 md->r1[md->point] = 0;
3300 md->r2[md->point] = 0;
3301 md->off_num[md->point] = 0;
3302 md->point++;
3303 }
3304 } while (*ptr==OP_ALT);
3305 ecode=prev+3; goto match_loop;
3306 }
3307 else /* OP_KETRMAX */
3308 {
3309 uschar *ptr;
3310 int points_pushed=0;
3311
3312 /* Push one failure point, that will resume matching at the code after
3313 the KETRMAX opcode. */
Guido van Rossum50700601997-12-08 17:15:20 +00003314 if (md->length == md->point)
3315 {
3316 grow_stack(md);
3317 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003318 md->offset_top[md->point] = offset_top;
3319 md->eptr[md->point] = eptr;
3320 md->ecode[md->point] = ecode+3;
3321 md->r1[md->point] = md->offset_vector[number];
3322 md->r2[md->point] = md->offset_vector[number+1];
3323 md->off_num[md->point] = number;
3324 md->point++;
3325
3326 md->offset_vector[number] = eptr - md->start_subject;
3327 /* Handle alternation inside the BRA...KET; push each of the
3328 additional alternatives onto the stack
3329 XXX this tries the alternatives backwards! */
3330 ptr=prev;
3331 do {
3332 ptr += (ptr[1]<<8)+ ptr[2];
3333 if (*ptr==OP_ALT)
3334 {
Guido van Rossum50700601997-12-08 17:15:20 +00003335 if (md->length == md->point)
3336 if (md->length == md->point)
3337 {
3338 grow_stack(md);
3339 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003340 md->offset_top[md->point] = offset_top;
3341 md->eptr[md->point] = eptr;
3342 md->ecode[md->point] = ptr+3;
3343 md->r1[md->point] = 0;
3344 md->r2[md->point] = 0;
3345 md->off_num[md->point] = 0;
3346 md->point++;
3347 points_pushed++;
3348 }
3349 } while (*ptr==OP_ALT);
3350 /* Jump to the first (or only) alternative and resume trying to match */
3351 ecode=prev+3; goto match_loop;
3352 }
3353 }
3354 FAIL;
3355
Guido van Rossum50700601997-12-08 17:15:20 +00003356 /* Start of subject unless notbol, or after internal newline if multiline */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003357
3358 case OP_CIRC:
Guido van Rossum50700601997-12-08 17:15:20 +00003359 if (md->notbol && eptr == md->start_subject) FAIL;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003360 if (md->multiline)
3361 {
3362 if (eptr != md->start_subject && eptr[-1] != '\n') FAIL;
3363 ecode++;
3364 break;
3365 }
3366 /* ... else fall through */
3367
3368 /* Start of subject assertion */
3369
3370 case OP_SOD:
3371 if (eptr != md->start_subject) FAIL;
3372 ecode++;
3373 break;
3374
Guido van Rossum50700601997-12-08 17:15:20 +00003375 /* Assert before internal newline if multiline, or before
3376 a terminating newline unless endonly is set, else end of subject unless
3377 noteol is set. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003378
3379 case OP_DOLL:
Guido van Rossum50700601997-12-08 17:15:20 +00003380 if (md->noteol && eptr >= md->end_subject) FAIL;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003381 if (md->multiline)
3382 {
3383 if (eptr < md->end_subject && *eptr != '\n') FAIL;
3384 ecode++;
3385 break;
3386 }
Guido van Rossum50700601997-12-08 17:15:20 +00003387 else if (!md->endonly)
3388 {
3389 if (eptr < md->end_subject - 1 ||
3390 (eptr == md->end_subject - 1 && *eptr != '\n')) FAIL;
3391 ecode++;
3392 break;
3393 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003394 /* ... else fall through */
3395
3396 /* End of subject assertion */
3397
3398 case OP_EOD:
3399 if (eptr < md->end_subject) FAIL;
3400 ecode++;
3401 break;
3402
3403 /* Word boundary assertions */
3404
3405 case OP_NOT_WORD_BOUNDARY:
3406 case OP_WORD_BOUNDARY:
3407 {
3408 BOOL prev_is_word = (eptr != md->start_subject) &&
3409 ((pcre_ctypes[eptr[-1]] & ctype_word) != 0);
3410 BOOL cur_is_word = (eptr < md->end_subject) &&
3411 ((pcre_ctypes[*eptr] & ctype_word) != 0);
3412 if ((*ecode++ == OP_WORD_BOUNDARY)?
3413 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3414 FAIL;
3415 }
3416 break;
3417
Guido van Rossum50700601997-12-08 17:15:20 +00003418 case OP_NOT_WORD_BOUNDARY_L:
3419 case OP_WORD_BOUNDARY_L:
3420 {
3421 BOOL prev_is_word = (eptr != md->start_subject) &&
3422 (isalpha(eptr[-1]) || eptr[-1]=='_');
3423 BOOL cur_is_word = (eptr < md->end_subject) &&
3424 (isalpha(eptr[-1]) || eptr[-1]=='_');
3425 if ((*ecode++ == OP_WORD_BOUNDARY_L)?
3426 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3427 FAIL;
3428 }
3429 break;
3430
3431
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003432 /* Match a single character type; inline for speed */
3433
3434 case OP_ANY:
3435 if (!md->dotall && eptr < md->end_subject && *eptr == '\n') FAIL;
3436 if (eptr++ >= md->end_subject) FAIL;
3437 ecode++;
3438 break;
3439
3440 case OP_NOT_DIGIT:
3441 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) != 0)
3442 FAIL;
3443 ecode++;
3444 break;
3445
3446 case OP_DIGIT:
3447 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) == 0)
3448 FAIL;
3449 ecode++;
3450 break;
3451
3452 case OP_NOT_WHITESPACE:
3453 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) != 0)
3454 FAIL;
3455 ecode++;
3456 break;
3457
3458 case OP_WHITESPACE:
3459 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) == 0)
3460 FAIL;
3461 ecode++;
3462 break;
3463
3464 case OP_NOT_WORDCHAR:
3465 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) != 0)
3466 FAIL;
3467 ecode++;
3468 break;
3469
3470 case OP_WORDCHAR:
3471 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) == 0)
3472 FAIL;
3473 ecode++;
3474 break;
3475
Guido van Rossum50700601997-12-08 17:15:20 +00003476 case OP_NOT_WORDCHAR_L:
3477 if (eptr >= md->end_subject || (*eptr=='_' || isalpha(*eptr) ))
3478 return FALSE;
3479 eptr++;
3480 ecode++;
3481 break;
3482
3483 case OP_WORDCHAR_L:
3484 if (eptr >= md->end_subject || (*eptr!='_' && !isalpha(*eptr) ))
3485 return FALSE;
3486 eptr++;
3487 ecode++;
3488 break;
3489
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003490 /* Match a back reference, possibly repeatedly. Look past the end of the
3491 item to see if there is repeat information following. The code is similar
3492 to that for character classes, but repeated for efficiency. Then obey
3493 similar code to character type repeats - written out again for speed.
3494 However, if the referenced string is the empty string, always treat
3495 it as matched, any number of times (otherwise there could be infinite
3496 loops). */
3497
3498 case OP_REF:
3499 {
3500 int length;
3501 int number = ecode[1] << 1; /* Doubled reference number */
3502 ecode += 2; /* Advance past the item */
3503
3504 if (number >= offset_top || md->offset_vector[number] < 0)
3505 {
3506 md->errorcode = PCRE_ERROR_BADREF;
3507 FAIL;
3508 }
3509
3510 length = md->offset_vector[number+1] - md->offset_vector[number];
3511
3512 switch (*ecode)
3513 {
3514 case OP_CRSTAR:
3515 case OP_CRMINSTAR:
3516 case OP_CRPLUS:
3517 case OP_CRMINPLUS:
3518 case OP_CRQUERY:
3519 case OP_CRMINQUERY:
3520 c = *ecode++ - OP_CRSTAR;
3521 minimize = (c & 1) != 0;
3522 min = rep_min[c]; /* Pick up values from tables; */
3523 max = rep_max[c]; /* zero for max => infinity */
3524 if (max == 0) max = INT_MAX;
3525 break;
3526
3527 case OP_CRRANGE:
3528 case OP_CRMINRANGE:
3529 minimize = (*ecode == OP_CRMINRANGE);
3530 min = (ecode[1] << 8) + ecode[2];
3531 max = (ecode[3] << 8) + ecode[4];
3532 if (max == 0) max = INT_MAX;
3533 ecode += 5;
3534 break;
3535
3536 default: /* No repeat follows */
3537 if (!match_ref(number, eptr, length, md)) FAIL;
3538 eptr += length;
3539 continue; /* With the main loop */
3540 }
3541
3542 /* If the length of the reference is zero, just continue with the
3543 main loop. */
3544
3545 if (length == 0) continue;
3546
3547 /* First, ensure the minimum number of matches are present. We get back
3548 the length of the reference string explicitly rather than passing the
3549 address of eptr, so that eptr can be a register variable. */
3550
3551 for (i = 1; i <= min; i++)
3552 {
3553 if (!match_ref(number, eptr, length, md)) FAIL;
3554 eptr += length;
3555 }
3556
3557 /* If min = max, continue at the same level without recursion.
3558 They are not both allowed to be zero. */
3559
3560 if (min == max) continue;
3561
3562 /* If minimizing, keep trying and advancing the pointer */
3563
3564 if (minimize)
3565 {
3566 for (i = min;; i++)
3567 {
3568 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3569 if (i >= max || !match_ref(number, eptr, length, md))
3570 FAIL;
3571 eptr += length;
3572 }
3573 /* Control never gets here */
3574 }
3575
3576 /* If maximizing, find the longest string and work backwards */
3577
3578 else
3579 {
3580 uschar *pp = eptr;
3581 for (i = min; i < max; i++)
3582 {
3583 if (!match_ref(number, eptr, length, md)) break;
3584 eptr += length;
3585 }
3586 while (eptr >= pp)
3587 {
3588 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3589 eptr -= length;
3590 }
3591 FAIL;
3592 }
3593 }
3594 /* Control never gets here */
3595
3596 /* Match a character class, possibly repeatedly. Look past the end of the
3597 item to see if there is repeat information following. Then obey similar
Guido van Rossum50700601997-12-08 17:15:20 +00003598 code to character type repeats - written out again for speed. If caseless
3599 matching was set at runtime but not at compile time, we have to check both
3600 versions of a character. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003601
3602 case OP_CLASS:
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003603 {
Guido van Rossum50700601997-12-08 17:15:20 +00003604 uschar *data = ecode + 1; /* Save for matching */
3605 ecode += 33; /* Advance past the item */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003606
3607 switch (*ecode)
3608 {
3609 case OP_CRSTAR:
3610 case OP_CRMINSTAR:
3611 case OP_CRPLUS:
3612 case OP_CRMINPLUS:
3613 case OP_CRQUERY:
3614 case OP_CRMINQUERY:
3615 c = *ecode++ - OP_CRSTAR;
3616 minimize = (c & 1) != 0;
3617 min = rep_min[c]; /* Pick up values from tables; */
3618 max = rep_max[c]; /* zero for max => infinity */
3619 if (max == 0) max = INT_MAX;
3620 break;
3621
3622 case OP_CRRANGE:
3623 case OP_CRMINRANGE:
3624 minimize = (*ecode == OP_CRMINRANGE);
3625 min = (ecode[1] << 8) + ecode[2];
3626 max = (ecode[3] << 8) + ecode[4];
3627 if (max == 0) max = INT_MAX;
3628 ecode += 5;
3629 break;
3630
3631 default: /* No repeat follows */
Guido van Rossum50700601997-12-08 17:15:20 +00003632 if (eptr >= md->end_subject) FAIL;
3633 c = *eptr++;
3634 if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */
3635 if (md->runtime_caseless)
3636 {
3637 c = pcre_fcc[c];
3638 if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */
3639 }
3640 FAIL;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003641 }
3642
3643 /* First, ensure the minimum number of matches are present. */
3644
3645 for (i = 1; i <= min; i++)
Guido van Rossum50700601997-12-08 17:15:20 +00003646 {
3647 if (eptr >= md->end_subject) FAIL;
3648 c = *eptr++;
3649 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3650 if (md->runtime_caseless)
3651 {
3652 c = pcre_fcc[c];
3653 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3654 }
3655 FAIL;
3656 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003657
3658 /* If max == min we can continue with the main loop without the
3659 need to recurse. */
3660
3661 if (min == max) continue;
3662
3663 /* If minimizing, keep testing the rest of the expression and advancing
3664 the pointer while it matches the class. */
3665
3666 if (minimize)
3667 {
3668 for (i = min;; i++)
3669 {
3670 if (match(eptr, ecode, offset_top, md)) SUCCEED;
Guido van Rossum50700601997-12-08 17:15:20 +00003671 if (i >= max || eptr >= md->end_subject) FAIL;
3672 c = *eptr++;
3673 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3674 if (md->runtime_caseless)
3675 {
3676 c = pcre_fcc[c];
3677 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3678 }
3679 FAIL;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003680 }
3681 /* Control never gets here */
3682 }
3683
3684 /* If maximizing, find the longest possible run, then work backwards. */
3685
3686 else
3687 {
3688 uschar *pp = eptr;
Guido van Rossum50700601997-12-08 17:15:20 +00003689 for (i = min; i < max; eptr++, i++)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003690 {
Guido van Rossum50700601997-12-08 17:15:20 +00003691 if (eptr >= md->end_subject) break;
3692 c = *eptr;
3693 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3694 if (md->runtime_caseless)
3695 {
3696 c = pcre_fcc[c];
3697 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3698 }
3699 break;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003700 }
Guido van Rossum50700601997-12-08 17:15:20 +00003701
3702 while (eptr >= pp)
3703 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
3704 FAIL;
3705 }
3706 }
3707 /* Control never gets here */
3708
3709 /* OP_CLASS_L opcode: handles localized character classes */
3710
3711 case OP_CLASS_L:
3712 {
3713 uschar *data = ecode + 1; /* Save for matching */
3714 uschar locale_flag = *data;
3715 ecode++; data++; /* The localization support adds an extra byte */
3716
3717 ecode += 33; /* Advance past the item */
3718
3719 switch (*ecode)
3720 {
3721 case OP_CRSTAR:
3722 case OP_CRMINSTAR:
3723 case OP_CRPLUS:
3724 case OP_CRMINPLUS:
3725 case OP_CRQUERY:
3726 case OP_CRMINQUERY:
3727 c = *ecode++ - OP_CRSTAR;
3728 minimize = (c & 1) != 0;
3729 min = rep_min[c]; /* Pick up values from tables; */
3730 max = rep_max[c]; /* zero for max => infinity */
3731 if (max == 0) max = INT_MAX;
3732 break;
3733
3734 case OP_CRRANGE:
3735 case OP_CRMINRANGE:
3736 minimize = (*ecode == OP_CRMINRANGE);
3737 min = (ecode[1] << 8) + ecode[2];
3738 max = (ecode[3] << 8) + ecode[4];
3739 if (max == 0) max = INT_MAX;
3740 ecode += 5;
3741 break;
3742
3743 default: /* No repeat follows */
3744 if (eptr >= md->end_subject) FAIL;
3745 c = *eptr++;
3746 if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */
3747 if ( (locale_flag & 1) && (isalpha(c) || c=='_') ) continue; /* Locale \w */
3748 if ( (locale_flag & 2) && (!isalpha(c) && c!='_') ) continue; /* Locale \W */
3749#if 0
3750 if ( (locale_flag & 4) && isdigit(c) ) continue; /* Locale \d */
3751 if ( (locale_flag & 8) && !isdigit(c) ) continue; /* Locale \D */
3752 if ( (locale_flag & 16) && isspace(c) ) continue; /* Locale \s */
3753 if ( (locale_flag & 32) && !isspace(c) ) continue; /* Locale \S */
3754#endif
3755
3756 if (md->runtime_caseless)
3757 {
3758 c = pcre_fcc[c];
3759 if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */
3760
3761 if ( (locale_flag & 1) && (isalpha(c) || c=='_') ) continue; /* Locale \w */
3762 if ( (locale_flag & 2) && (!isalpha(c) && c!='_') ) continue; /* Locale \W */
3763 }
3764 FAIL;
3765 }
3766
3767 /* First, ensure the minimum number of matches are present. */
3768
3769 for (i = 1; i <= min; i++)
3770 {
3771 if (eptr >= md->end_subject) FAIL;
3772 c = *eptr++;
3773 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3774 if ( (locale_flag & 1) && (isalpha(c) || c=='_') ) continue; /* Locale \w */
3775 if ( (locale_flag & 2) && (!isalpha(c) && c!='_') ) continue; /* Locale \W */
3776
3777 if (md->runtime_caseless)
3778 {
3779 c = pcre_fcc[c];
3780 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3781 if ( (locale_flag & 1) && (isalpha(c) || c=='_') ) continue; /* Locale \w */
3782 if ( (locale_flag & 2) && (!isalpha(c) && c!='_') ) continue; /* Locale \W */
3783 }
3784 FAIL;
3785 }
3786
3787 /* If max == min we can continue with the main loop without the
3788 need to recurse. */
3789
3790 if (min == max) continue;
3791
3792 /* If minimizing, keep testing the rest of the expression and advancing
3793 the pointer while it matches the class. */
3794
3795 if (minimize)
3796 {
3797 for (i = min;; i++)
3798 {
3799 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3800 if (i >= max || eptr >= md->end_subject) FAIL;
3801 c = *eptr++;
3802 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3803 if ( (locale_flag & 1) && (isalpha(c) || c=='_') ) continue; /* Locale \w */
3804 if ( (locale_flag & 2) && (!isalpha(c) && c!='_') ) continue; /* Locale \W */
3805
3806 if (md->runtime_caseless)
3807 {
3808 c = pcre_fcc[c];
3809 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3810 if ( (locale_flag & 1) && (isalpha(c) || c=='_') ) continue; /* Locale \w */
3811 if ( (locale_flag & 2) && (!isalpha(c) && c!='_') ) continue; /* Locale \W */
3812 }
3813 FAIL;
3814 }
3815 /* Control never gets here */
3816 }
3817
3818 /* If maximizing, find the longest possible run, then work backwards. */
3819
3820 else
3821 {
3822 uschar *pp = eptr;
3823 for (i = min; i < max; eptr++, i++)
3824 {
3825 if (eptr >= md->end_subject) break;
3826 c = *eptr;
3827 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3828 if ( (locale_flag & 1) && (isalpha(c) || c=='_') ) continue; /* Locale \w */
3829 if ( (locale_flag & 2) && (!isalpha(c) && c!='_') ) continue; /* Locale \W */
3830 if (md->runtime_caseless)
3831 {
3832 c = pcre_fcc[c];
3833 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3834 if ( (locale_flag & 1) && (isalpha(c) || c=='_') ) continue; /* Locale \w */
3835 if ( (locale_flag & 2) && (!isalpha(c) && c!='_') ) continue; /* Locale \W */
3836 }
3837 break;
3838 }
3839
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003840 while (eptr >= pp)
3841 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
3842 FAIL;
3843 }
3844 }
3845 /* Control never gets here */
3846
3847 /* Match a run of characters */
3848
3849 case OP_CHARS:
3850 {
3851 register int length = ecode[1];
3852 ecode += 2;
3853
Guido van Rossum57ba4f31997-12-02 20:40:28 +00003854#ifdef DEBUG
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003855 if (eptr >= md->end_subject)
3856 printf("matching subject <null> against pattern ");
3857 else
3858 {
3859 printf("matching subject ");
3860 pchars(eptr, length, TRUE, md);
3861 printf(" against pattern ");
3862 }
3863 pchars(ecode, length, FALSE, md);
3864 printf("\n");
Guido van Rossum57ba4f31997-12-02 20:40:28 +00003865#endif
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003866
3867 if (length > md->end_subject - eptr) FAIL;
3868 if (md->caseless)
3869 {
3870 while (length-- > 0) if (pcre_lcc[*ecode++] != pcre_lcc[*eptr++]) FAIL;
3871 }
3872 else
3873 {
3874 while (length-- > 0) if (*ecode++ != *eptr++) FAIL;
3875 }
3876 }
3877 break;
3878
3879 /* Match a single character repeatedly; different opcodes share code. */
3880
3881 case OP_EXACT:
3882 min = max = (ecode[1] << 8) + ecode[2];
3883 ecode += 3;
3884 goto REPEATCHAR;
3885
3886 case OP_UPTO:
3887 case OP_MINUPTO:
3888 min = 0;
3889 max = (ecode[1] << 8) + ecode[2];
3890 minimize = *ecode == OP_MINUPTO;
3891 ecode += 3;
3892 goto REPEATCHAR;
3893
3894 case OP_STAR:
3895 case OP_MINSTAR:
3896 case OP_PLUS:
3897 case OP_MINPLUS:
3898 case OP_QUERY:
3899 case OP_MINQUERY:
3900 c = *ecode++ - OP_STAR;
3901 minimize = (c & 1) != 0;
3902 min = rep_min[c]; /* Pick up values from tables; */
3903 max = rep_max[c]; /* zero for max => infinity */
3904 if (max == 0) max = INT_MAX;
3905
3906 /* Common code for all repeated single-character matches. We can give
3907 up quickly if there are fewer than the minimum number of characters left in
3908 the subject. */
3909
3910 REPEATCHAR:
3911 if (min > md->end_subject - eptr) FAIL;
3912 c = *ecode++;
3913
3914 /* The code is duplicated for the caseless and caseful cases, for speed,
3915 since matching characters is likely to be quite common. First, ensure the
3916 minimum number of matches are present. If min = max, continue at the same
3917 level without recursing. Otherwise, if minimizing, keep trying the rest of
3918 the expression and advancing one matching character if failing, up to the
3919 maximum. Alternatively, if maximizing, find the maximum number of
3920 characters and work backwards. */
3921
Guido van Rossum57ba4f31997-12-02 20:40:28 +00003922#ifdef DEBUG
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003923 printf("matching %c{%d,%d} against subject %.*s\n", c, min, max,
3924 max, eptr);
Guido van Rossum57ba4f31997-12-02 20:40:28 +00003925#endif
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003926
3927 if (md->caseless)
3928 {
3929 c = pcre_lcc[c];
3930 for (i = 1; i <= min; i++) if (c != pcre_lcc[*eptr++]) FAIL;
3931 if (min == max) continue;
3932 if (minimize)
3933 {
3934 for (i = min;; i++)
3935 {
3936 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3937 if (i >= max || eptr >= md->end_subject || c != pcre_lcc[*eptr++])
3938 FAIL;
3939 }
3940 /* Control never gets here */
3941 }
3942 else
3943 {
3944 uschar *pp = eptr;
3945 for (i = min; i < max; i++)
3946 {
3947 if (eptr >= md->end_subject || c != pcre_lcc[*eptr]) break;
3948 eptr++;
3949 }
3950 while (eptr >= pp)
3951 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
3952 FAIL;
3953 }
Guido van Rossum50700601997-12-08 17:15:20 +00003954 /* Control never gets here */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003955 }
3956
3957 /* Caseful comparisons */
3958
3959 else
3960 {
3961 for (i = 1; i <= min; i++) if (c != *eptr++) FAIL;
3962 if (min == max) continue;
3963 if (minimize)
3964 {
3965 for (i = min;; i++)
3966 {
3967 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3968 if (i >= max || eptr >= md->end_subject || c != *eptr++) FAIL;
3969 }
3970 /* Control never gets here */
3971 }
3972 else
3973 {
3974 uschar *pp = eptr;
3975 for (i = min; i < max; i++)
3976 {
3977 if (eptr >= md->end_subject || c != *eptr) break;
3978 eptr++;
3979 }
3980 while (eptr >= pp)
3981 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
3982 FAIL;
3983 }
3984 }
3985 /* Control never gets here */
3986
Guido van Rossum50700601997-12-08 17:15:20 +00003987 /* Match a negated single character */
3988
3989 case OP_NOT:
3990 if (eptr > md->end_subject) FAIL;
3991 ecode++;
3992 if (md->caseless)
3993 {
3994 if (pcre_lcc[*ecode++] == pcre_lcc[*eptr++]) FAIL;
3995 }
3996 else
3997 {
3998 if (*ecode++ == *eptr++) FAIL;
3999 }
4000 break;
4001
4002 /* Match a negated single character repeatedly. This is almost a repeat of
4003 the code for a repeated single character, but I haven't found a nice way of
4004 commoning these up that doesn't require a test of the positive/negative
4005 option for each character match. Maybe that wouldn't add very much to the
4006 time taken, but character matching *is* what this is all about... */
4007
4008 case OP_NOTEXACT:
4009 min = max = (ecode[1] << 8) + ecode[2];
4010 ecode += 3;
4011 goto REPEATNOTCHAR;
4012
4013 case OP_NOTUPTO:
4014 case OP_NOTMINUPTO:
4015 min = 0;
4016 max = (ecode[1] << 8) + ecode[2];
4017 minimize = *ecode == OP_NOTMINUPTO;
4018 ecode += 3;
4019 goto REPEATNOTCHAR;
4020
4021 case OP_NOTSTAR:
4022 case OP_NOTMINSTAR:
4023 case OP_NOTPLUS:
4024 case OP_NOTMINPLUS:
4025 case OP_NOTQUERY:
4026 case OP_NOTMINQUERY:
4027 c = *ecode++ - OP_NOTSTAR;
4028 minimize = (c & 1) != 0;
4029 min = rep_min[c]; /* Pick up values from tables; */
4030 max = rep_max[c]; /* zero for max => infinity */
4031 if (max == 0) max = INT_MAX;
4032
4033 /* Common code for all repeated single-character matches. We can give
4034 up quickly if there are fewer than the minimum number of characters left in
4035 the subject. */
4036
4037 REPEATNOTCHAR:
4038 if (min > md->end_subject - eptr) FAIL;
4039 c = *ecode++;
4040
4041 /* The code is duplicated for the caseless and caseful cases, for speed,
4042 since matching characters is likely to be quite common. First, ensure the
4043 minimum number of matches are present. If min = max, continue at the same
4044 level without recursing. Otherwise, if minimizing, keep trying the rest of
4045 the expression and advancing one matching character if failing, up to the
4046 maximum. Alternatively, if maximizing, find the maximum number of
4047 characters and work backwards. */
4048
4049#ifdef DEBUG
4050 printf("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
4051 max, eptr);
4052#endif
4053
4054 if (md->caseless)
4055 {
4056 c = pcre_lcc[c];
4057 for (i = 1; i <= min; i++) if (c == pcre_lcc[*eptr++]) FAIL;
4058 if (min == max) continue;
4059 if (minimize)
4060 {
4061 for (i = min;; i++)
4062 {
4063 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4064 if (i >= max || eptr >= md->end_subject || c == pcre_lcc[*eptr++])
4065 FAIL;
4066 }
4067 /* Control never gets here */
4068 }
4069 else
4070 {
4071 uschar *pp = eptr;
4072 for (i = min; i < max; i++)
4073 {
4074 if (eptr >= md->end_subject || c == pcre_lcc[*eptr]) break;
4075 eptr++;
4076 }
4077 while (eptr >= pp)
4078 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4079 FAIL;
4080 }
4081 /* Control never gets here */
4082 }
4083
4084 /* Caseful comparisons */
4085
4086 else
4087 {
4088 for (i = 1; i <= min; i++) if (c == *eptr++) FAIL;
4089 if (min == max) continue;
4090 if (minimize)
4091 {
4092 for (i = min;; i++)
4093 {
4094 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4095 if (i >= max || eptr >= md->end_subject || c == *eptr++) FAIL;
4096 }
4097 /* Control never gets here */
4098 }
4099 else
4100 {
4101 uschar *pp = eptr;
4102 for (i = min; i < max; i++)
4103 {
4104 if (eptr >= md->end_subject || c == *eptr) break;
4105 eptr++;
4106 }
4107 while (eptr >= pp)
4108 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4109 FAIL;
4110 }
4111 }
4112 /* Control never gets here */
4113
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004114 /* Match a single character type repeatedly; several different opcodes
4115 share code. This is very similar to the code for single characters, but we
4116 repeat it in the interests of efficiency. */
4117
4118 case OP_TYPEEXACT:
4119 min = max = (ecode[1] << 8) + ecode[2];
4120 minimize = TRUE;
4121 ecode += 3;
4122 goto REPEATTYPE;
4123
4124 case OP_TYPEUPTO:
4125 case OP_TYPEMINUPTO:
4126 min = 0;
4127 max = (ecode[1] << 8) + ecode[2];
4128 minimize = *ecode == OP_TYPEMINUPTO;
4129 ecode += 3;
4130 goto REPEATTYPE;
4131
4132 case OP_TYPESTAR:
4133 case OP_TYPEMINSTAR:
4134 case OP_TYPEPLUS:
4135 case OP_TYPEMINPLUS:
4136 case OP_TYPEQUERY:
4137 case OP_TYPEMINQUERY:
4138 c = *ecode++ - OP_TYPESTAR;
4139 minimize = (c & 1) != 0;
4140 min = rep_min[c]; /* Pick up values from tables; */
4141 max = rep_max[c]; /* zero for max => infinity */
4142 if (max == 0) max = INT_MAX;
4143
4144 /* Common code for all repeated single character type matches */
4145
4146 REPEATTYPE:
4147 ctype = *ecode++; /* Code for the character type */
4148
4149 /* First, ensure the minimum number of matches are present. Use inline
4150 code for maximizing the speed, and do the type test once at the start
4151 (i.e. keep it out of the loop). Also test that there are at least the
4152 minimum number of characters before we start. */
4153
4154 if (min > md->end_subject - eptr) FAIL;
4155 if (min > 0) switch(ctype)
4156 {
4157 case OP_ANY:
4158 if (!md->dotall)
4159 { for (i = 1; i <= min; i++) if (*eptr++ == '\n') FAIL; }
4160 else eptr += min;
4161 break;
4162
4163 case OP_NOT_DIGIT:
4164 for (i = 1; i <= min; i++)
4165 if ((pcre_ctypes[*eptr++] & ctype_digit) != 0) FAIL;
4166 break;
4167
4168 case OP_DIGIT:
4169 for (i = 1; i <= min; i++)
4170 if ((pcre_ctypes[*eptr++] & ctype_digit) == 0) FAIL;
4171 break;
4172
4173 case OP_NOT_WHITESPACE:
4174 for (i = 1; i <= min; i++)
4175 if ((pcre_ctypes[*eptr++] & ctype_space) != 0) FAIL;
4176 break;
4177
4178 case OP_WHITESPACE:
4179 for (i = 1; i <= min; i++)
4180 if ((pcre_ctypes[*eptr++] & ctype_space) == 0) FAIL;
4181 break;
4182
4183 case OP_NOT_WORDCHAR:
4184 for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) != 0)
4185 FAIL;
4186 break;
4187
4188 case OP_WORDCHAR:
4189 for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) == 0)
4190 FAIL;
4191 break;
Guido van Rossum50700601997-12-08 17:15:20 +00004192
4193 case OP_NOT_WORDCHAR_L:
4194 for (i = 1; i <= min; i++, eptr++) if (*eptr=='_' || isalpha(*eptr))
4195 return FALSE;
4196 break;
4197
4198 case OP_WORDCHAR_L:
4199 for (i = 1; i <= min; i++, eptr++) if (*eptr!='_' && !isalpha(*eptr))
4200 return FALSE;
4201 break;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004202 }
4203
4204 /* If min = max, continue at the same level without recursing */
4205
4206 if (min == max) continue;
4207
4208 /* If minimizing, we have to test the rest of the pattern before each
4209 subsequent match, so inlining isn't much help; just use the function. */
4210
4211 if (minimize)
4212 {
4213 for (i = min;; i++)
4214 {
4215 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4216 if (i >= max || eptr >= md->end_subject ||
4217 !match_type(ctype, *eptr++, md->dotall))
4218 FAIL;
4219 }
4220 /* Control never gets here */
4221 }
4222
4223 /* If maximizing it is worth using inline code for speed, doing the type
4224 test once at the start (i.e. keep it out of the loop). */
4225
4226 else
4227 {
4228 uschar *pp = eptr;
4229 switch(ctype)
4230 {
4231 case OP_ANY:
4232 if (!md->dotall)
4233 {
4234 for (i = min; i < max; i++)
4235 {
4236 if (eptr >= md->end_subject || *eptr == '\n') break;
4237 eptr++;
4238 }
4239 }
4240 else
4241 {
4242 c = max - min;
4243 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
4244 eptr += c;
4245 }
4246 break;
4247
4248 case OP_NOT_DIGIT:
4249 for (i = min; i < max; i++)
4250 {
4251 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) != 0)
4252 break;
4253 eptr++;
4254 }
4255 break;
4256
4257 case OP_DIGIT:
4258 for (i = min; i < max; i++)
4259 {
4260 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) == 0)
4261 break;
4262 eptr++;
4263 }
4264 break;
4265
4266 case OP_NOT_WHITESPACE:
4267 for (i = min; i < max; i++)
4268 {
4269 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) != 0)
4270 break;
4271 eptr++;
4272 }
4273 break;
4274
4275 case OP_WHITESPACE:
4276 for (i = min; i < max; i++)
4277 {
4278 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) == 0)
4279 break;
4280 eptr++;
4281 }
4282 break;
4283
4284 case OP_NOT_WORDCHAR:
4285 for (i = min; i < max; i++)
4286 {
4287 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) != 0)
4288 break;
4289 eptr++;
4290 }
4291 break;
4292
4293 case OP_WORDCHAR:
4294 for (i = min; i < max; i++)
4295 {
Guido van Rossum50700601997-12-08 17:15:20 +00004296 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) == 0)
4297 break;
4298 eptr++;
4299 }
4300 break;
4301 case OP_NOT_WORDCHAR_L:
4302 for (i = min; i < max; i++)
4303 {
4304 if (eptr >= md->end_subject || (*eptr=='_' || isalpha(*eptr) ) )
4305 break;
4306 eptr++;
4307 }
4308 break;
4309
4310 case OP_WORDCHAR_L:
4311 for (i = min; i < max; i++)
4312 {
4313 if (eptr >= md->end_subject || (*eptr!='_' && !isalpha(*eptr) ) )
4314 break;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004315 eptr++;
4316 }
4317 break;
4318 }
4319
4320 while (eptr >= pp)
4321 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4322 FAIL;
4323 }
4324 /* Control never gets here */
4325
4326 /* There's been some horrible disaster. */
4327
4328 default:
Guido van Rossum57ba4f31997-12-02 20:40:28 +00004329#ifdef DEBUG
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004330 printf("Unknown opcode %d\n", *ecode);
Guido van Rossum57ba4f31997-12-02 20:40:28 +00004331#endif
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004332 md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
4333 FAIL;
4334 }
4335
4336 /* Do not stick any code in here without much thought; it is assumed
4337 that "continue" in the code above comes out to here to repeat the main
4338 loop. */
4339
4340 } /* End of main loop */
4341/* Control never reaches here */
4342
4343fail:
4344 if (md->point > save_stack_position)
4345 {
4346 /* If there are still points remaining on the stack, pop the next one off */
Guido van Rossumc3861071997-10-08 02:07:40 +00004347 int off_num;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004348
4349 md->point--;
4350 offset_top = md->offset_top[md->point];
4351 eptr = md->eptr[md->point];
4352 ecode = md->ecode[md->point];
4353 off_num = md->off_num[md->point];
4354 md->offset_vector[off_num] = md->r1[md->point];
4355 md->offset_vector[off_num+1] = md->r2[md->point];
4356 goto match_loop;
4357 }
4358 /* Failure, and nothing left on the stack, so end this function call */
4359
4360 /* Restore the top of the stack to where it was before this function
4361 call. This lets us use one stack for everything; recursive calls
4362 can push and pop information, and may increase the stack. When
4363 the call returns, the parent function can resume pushing and
4364 popping wherever it was. */
4365
4366 md->point = save_stack_position;
4367 return FALSE;
4368
4369succeed:
4370 return TRUE;
4371}
4372
4373
Guido van Rossum50700601997-12-08 17:15:20 +00004374
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004375/*************************************************
4376* Execute a Regular Expression *
4377*************************************************/
4378
4379/* This function applies a compiled re to a subject string and picks out
4380portions of the string if it matches. Two elements in the vector are set for
4381each substring: the offsets to the start and end of the substring.
4382
4383Arguments:
Guido van Rossum50700601997-12-08 17:15:20 +00004384 external_re points to the compiled expression
4385 external_extra points to "hints" from pcre_study() or is NULL
4386 subject points to the subject string
4387 length length of subject string (may contain binary zeros)
4388 options option bits
4389 offsets points to a vector of ints to be filled in with offsets
4390 offsetcount the number of elements in the vector
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004391
Guido van Rossum50700601997-12-08 17:15:20 +00004392Returns: > 0 => success; value is the number of elements filled in
4393 = 0 => success, but offsets is not big enough
4394 -1 => failed to match
4395 < -1 => some kind of unexpected problem
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004396*/
4397
4398int
Guido van Rossum50700601997-12-08 17:15:20 +00004399pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
4400 const char *subject, int length, int options, int *offsets, int offsetcount)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004401{
4402int resetcount;
Guido van Rossum50700601997-12-08 17:15:20 +00004403int ocount = offsetcount;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004404int first_char = -1;
4405match_data match_block;
4406uschar *start_bits = NULL;
4407uschar *start_match = (uschar *)subject;
4408uschar *end_subject;
4409real_pcre *re = (real_pcre *)external_re;
4410real_pcre_extra *extra = (real_pcre_extra *)external_extra;
4411BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4412BOOL startline = (re->options & PCRE_STARTLINE) != 0;
4413
4414if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4415
4416if (re == NULL || subject == NULL ||
4417 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4418if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
4419
4420match_block.start_subject = (uschar *)subject;
4421match_block.end_subject = match_block.start_subject + length;
4422end_subject = match_block.end_subject;
4423
Guido van Rossum50700601997-12-08 17:15:20 +00004424match_block.caseless = ((re->options | options) & PCRE_CASELESS) != 0;
4425match_block.runtime_caseless = match_block.caseless &&
4426 (re->options & PCRE_CASELESS) == 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004427
Guido van Rossum50700601997-12-08 17:15:20 +00004428match_block.multiline = ((re->options | options) & PCRE_MULTILINE) != 0;
4429match_block.dotall = ((re->options | options) & PCRE_DOTALL) != 0;
4430match_block.endonly = ((re->options | options) & PCRE_DOLLAR_ENDONLY) != 0;
4431
4432match_block.notbol = (options & PCRE_NOTBOL) != 0;
4433match_block.noteol = (options & PCRE_NOTEOL) != 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004434
4435match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
4436
4437/* Set the stack state to empty */
4438 match_block.off_num = match_block.offset_top = NULL;
4439 match_block.r1 = match_block.r2 = NULL;
4440 match_block.eptr = match_block.ecode = NULL;
4441 match_block.point = match_block.length = 0;
4442
Guido van Rossum50700601997-12-08 17:15:20 +00004443/* If the expression has got more back references than the offsets supplied can
4444hold, we get a temporary bit of working store to use during the matching.
4445Otherwise, we can use the vector supplied, rounding down the size of it to a
4446multiple of 2. */
4447
4448ocount &= (-2);
4449if (re->top_backref > 0 && re->top_backref + 1 >= ocount/2)
4450 {
4451 ocount = re->top_backref * 2 + 2;
4452 match_block.offset_vector = (pcre_malloc)(ocount * sizeof(int));
4453 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
4454#ifdef DEBUG
4455 printf("Got memory to hold back references\n");
4456#endif
4457 }
4458else match_block.offset_vector = offsets;
4459
4460match_block.offset_end = ocount;
4461match_block.offset_overflow = FALSE;
4462
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004463/* Compute the minimum number of offsets that we need to reset each time. Doing
4464this makes a huge difference to execution time when there aren't many brackets
4465in the pattern. */
4466
4467resetcount = 2 + re->top_bracket * 2;
Guido van Rossum50700601997-12-08 17:15:20 +00004468if (resetcount > offsetcount) resetcount = ocount;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004469
4470/* If MULTILINE is set at exec time but was not set at compile time, and the
4471anchored flag is set, we must re-check because a setting provoked by ^ in the
4472pattern is not right in multi-line mode. Calling is_anchored() again here does
4473the right check, because multiline is now set. If it now yields FALSE, the
4474expression must have had ^ starting some of its branches. Check to see if
4475that is true for *all* branches, and if so, set the startline flag. */
4476
4477if (match_block. multiline && anchored && (re->options & PCRE_MULTILINE) == 0 &&
4478 !is_anchored(re->code, match_block.multiline))
4479 {
4480 anchored = FALSE;
4481 if (is_startline(re->code)) startline = TRUE;
4482 }
4483
4484/* Set up the first character to match, if available. The first_char value is
4485never set for an anchored regular expression, but the anchoring may be forced
4486at run time, so we have to test for anchoring. The first char may be unset for
4487an unanchored pattern, of course. If there's no first char and the pattern was
4488studied, the may be a bitmap of possible first characters. However, we can
4489use this only if the caseless state of the studying was correct. */
4490
4491if (!anchored)
4492 {
4493 if ((re->options & PCRE_FIRSTSET) != 0)
4494 {
4495 first_char = re->first_char;
4496 if (match_block.caseless) first_char = pcre_lcc[first_char];
4497 }
4498 else
4499 if (!startline && extra != NULL &&
4500 (extra->options & PCRE_STUDY_MAPPED) != 0 &&
4501 ((extra->options & PCRE_STUDY_CASELESS) != 0) == match_block.caseless)
4502 start_bits = extra->start_bits;
4503 }
4504
4505/* Loop for unanchored matches; for anchored regexps the loop runs just once. */
4506
4507do
4508 {
Guido van Rossum50700601997-12-08 17:15:20 +00004509 register int *iptr = match_block.offset_vector;
4510 register int *iend = iptr + resetcount;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004511
4512 /* Reset the maximum number of extractions we might see. */
4513
4514 while (iptr < iend) *iptr++ = -1;
4515
4516 /* Advance to a unique first char if possible */
4517
4518 if (first_char >= 0)
4519 {
4520 if (match_block.caseless)
4521 while (start_match < end_subject && pcre_lcc[*start_match] != first_char)
4522 start_match++;
4523 else
4524 while (start_match < end_subject && *start_match != first_char)
4525 start_match++;
4526 }
4527
4528 /* Or to just after \n for a multiline match if possible */
4529
4530 else if (startline)
4531 {
4532 if (start_match > match_block.start_subject)
4533 {
4534 while (start_match < end_subject && start_match[-1] != '\n')
4535 start_match++;
4536 }
4537 }
4538
4539 /* Or to a non-unique first char */
4540
4541 else if (start_bits != NULL)
4542 {
4543 while (start_match < end_subject)
4544 {
4545 register int c = *start_match;
Guido van Rossum50700601997-12-08 17:15:20 +00004546 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004547 }
4548 }
4549
Guido van Rossum57ba4f31997-12-02 20:40:28 +00004550#ifdef DEBUG
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004551 printf(">>>> Match against: ");
4552 pchars(start_match, end_subject - start_match, TRUE, &match_block);
4553 printf("\n");
Guido van Rossum57ba4f31997-12-02 20:40:28 +00004554#endif
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004555
4556 /* When a match occurs, substrings will be set for all internal extractions;
4557 we just need to set up the whole thing as substring 0 before returning. If
Guido van Rossum50700601997-12-08 17:15:20 +00004558 there were too many extractions, set the return code to zero. In the case
4559 where we had to get some local store to hold offsets for backreferences, copy
4560 those back references that we can. In this case there need not be overflow
4561 if certain parts of the pattern were not used.
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004562
Guido van Rossum50700601997-12-08 17:15:20 +00004563 Before starting the match, we have to set up a longjmp() target to enable
4564 the "cut" operation to fail a match completely without backtracking. */
4565
4566 /* To handle errors such as running out of memory for the failure
4567 stack, we need to save this location via setjmp(), so
4568 error-handling code can call longjmp() to jump out of deeply-nested code. */
4569 if (setjmp(match_block.error_env)==0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004570 {
Guido van Rossum50700601997-12-08 17:15:20 +00004571
4572 if (setjmp(match_block.fail_env) == 0 &&
4573 match(start_match, re->code, 2, &match_block))
4574 {
4575 int rc;
4576
4577 if (ocount != offsetcount)
4578 {
4579 if (offsetcount >= 4)
4580 {
4581 memcpy(offsets + 2, match_block.offset_vector + 2,
4582 (offsetcount - 2) * sizeof(int));
4583#ifdef DEBUG
4584 printf("Copied offsets; freeing temporary memory\n");
4585#endif
4586 }
4587 if (match_block.end_offset_top > offsetcount)
4588 match_block.offset_overflow = TRUE;
4589
4590#ifdef DEBUG
4591 printf("Freeing temporary memory\n");
4592#endif
4593
4594 (pcre_free)(match_block.offset_vector);
4595 }
4596
4597 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
4598
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004599 if (match_block.offset_end < 2) rc = 0; else
4600 {
4601 offsets[0] = start_match - match_block.start_subject;
4602 offsets[1] = match_block.end_match_ptr - match_block.start_subject;
4603 }
Guido van Rossum50700601997-12-08 17:15:20 +00004604
Guido van Rossum57ba4f31997-12-02 20:40:28 +00004605#ifdef DEBUG
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004606 printf(">>>> returning %d\n", rc);
Guido van Rossum57ba4f31997-12-02 20:40:28 +00004607#endif
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004608 free_stack(&match_block);
4609 return rc;
4610 }
Guido van Rossum50700601997-12-08 17:15:20 +00004611 } /* End of (if setjmp(match_block.error_env)...) */
4612 /* Return an error code; pcremodule.c will preserve the exception */
4613 if (PyErr_Occurred()) return PCRE_ERROR_NOMEMORY;
4614
4615 free_stack(&match_block);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004616 }
4617while (!anchored &&
4618 match_block.errorcode == PCRE_ERROR_NOMATCH &&
4619 start_match++ < end_subject);
4620
4621#ifdef DEBUG
4622printf(">>>> returning %d\n", match_block.errorcode);
4623#endif
Guido van Rossum50700601997-12-08 17:15:20 +00004624
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004625return match_block.errorcode;
4626}
4627
4628/* End of pcre.c */
Guido van Rossum50700601997-12-08 17:15:20 +00004629
4630
4631
4632
4633
4634
4635
4636
4637
4638