blob: a6ba17c756f55aa258523b916b746804b3f53eb0 [file] [log] [blame]
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001
2/*************************************************
3* Perl-Compatible Regular Expressions *
4*************************************************/
5
6/* DO NOT EDIT THIS FILE! */
7
8/* This file is automatically written by the merge-files.py script
9included with the PCRE distribution for Python; it's produced from
10several C files, and code is removed in the process. If you want to
11modify the code or track down bugs, it will be much easier to work
12with the code in its original, multiple-file form. Don't edit this
13file by hand, or submit patches to it.
14
15The Python-specific PCRE distribution can be retrieved from
16 http://starship.skyport.net/crew/amk/regex/
17
Guido van Rossum58132c61997-12-17 00:24:13 +000018The unmodified original PCRE distribution is available at
19ftp://ftp.cus.cam.ac.uk/pub/software/programs/pcre/, and is originally
20written by: Philip Hazel <ph10@cam.ac.uk>
Guido van Rossum51b3aa31997-10-06 14:43:11 +000021
22Extensively modified by the Python String-SIG: <string-sig@python.org>
23Send bug reports to: <string-sig@python.org>
24(They'll figure out if it's a bug in PCRE or in the Python-specific
25changes.)
26
27 Copyright (c) 1997 University of Cambridge
28
29-----------------------------------------------------------------------------
30Permission is granted to anyone to use this software for any purpose on any
31computer system, and to redistribute it freely, subject to the following
32restrictions:
33
341. This software is distributed in the hope that it will be useful,
35 but WITHOUT ANY WARRANTY; without even the implied warranty of
36 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
37
382. The origin of this software must not be misrepresented, either by
39 explicit claim or by omission.
40
413. Altered versions must be plainly marked as such, and must not be
42 misrepresented as being the original software.
43-----------------------------------------------------------------------------
44*/
45
46
47#define FOR_PYTHON
Guido van Rossum51b3aa31997-10-06 14:43:11 +000048#include "Python.h"
Martin v. Löwisd4233b22002-03-15 09:16:40 +000049#include "pcre-int.h"
Guido van Rossum50700601997-12-08 17:15:20 +000050#include <ctype.h>
Guido van Rossum51b3aa31997-10-06 14:43:11 +000051#include "graminit.h"
52
53/*************************************************
54* Perl-Compatible Regular Expressions *
55*************************************************/
56
57/* This file is automatically written by the makechartables auxiliary
58program. If you edit it by hand, you might like to edit the Makefile to
59prevent its ever being regenerated. */
60
61/* This table is a lower casing table. */
62
63unsigned char pcre_lcc[] = {
64 0, 1, 2, 3, 4, 5, 6, 7,
65 8, 9, 10, 11, 12, 13, 14, 15,
66 16, 17, 18, 19, 20, 21, 22, 23,
67 24, 25, 26, 27, 28, 29, 30, 31,
68 32, 33, 34, 35, 36, 37, 38, 39,
69 40, 41, 42, 43, 44, 45, 46, 47,
70 48, 49, 50, 51, 52, 53, 54, 55,
71 56, 57, 58, 59, 60, 61, 62, 63,
72 64, 97, 98, 99,100,101,102,103,
73 104,105,106,107,108,109,110,111,
74 112,113,114,115,116,117,118,119,
75 120,121,122, 91, 92, 93, 94, 95,
76 96, 97, 98, 99,100,101,102,103,
77 104,105,106,107,108,109,110,111,
78 112,113,114,115,116,117,118,119,
79 120,121,122,123,124,125,126,127,
80 128,129,130,131,132,133,134,135,
81 136,137,138,139,140,141,142,143,
82 144,145,146,147,148,149,150,151,
83 152,153,154,155,156,157,158,159,
84 160,161,162,163,164,165,166,167,
85 168,169,170,171,172,173,174,175,
86 176,177,178,179,180,181,182,183,
87 184,185,186,187,188,189,190,191,
88 192,193,194,195,196,197,198,199,
89 200,201,202,203,204,205,206,207,
90 208,209,210,211,212,213,214,215,
91 216,217,218,219,220,221,222,223,
92 224,225,226,227,228,229,230,231,
93 232,233,234,235,236,237,238,239,
94 240,241,242,243,244,245,246,247,
95 248,249,250,251,252,253,254,255 };
96
Guido van Rossum50700601997-12-08 17:15:20 +000097/* This table is a case flipping table. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +000098
Guido van Rossum50700601997-12-08 17:15:20 +000099unsigned char pcre_fcc[] = {
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000100 0, 1, 2, 3, 4, 5, 6, 7,
101 8, 9, 10, 11, 12, 13, 14, 15,
102 16, 17, 18, 19, 20, 21, 22, 23,
103 24, 25, 26, 27, 28, 29, 30, 31,
104 32, 33, 34, 35, 36, 37, 38, 39,
105 40, 41, 42, 43, 44, 45, 46, 47,
106 48, 49, 50, 51, 52, 53, 54, 55,
107 56, 57, 58, 59, 60, 61, 62, 63,
Guido van Rossum50700601997-12-08 17:15:20 +0000108 64, 97, 98, 99,100,101,102,103,
109 104,105,106,107,108,109,110,111,
110 112,113,114,115,116,117,118,119,
111 120,121,122, 91, 92, 93, 94, 95,
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000112 96, 65, 66, 67, 68, 69, 70, 71,
113 72, 73, 74, 75, 76, 77, 78, 79,
114 80, 81, 82, 83, 84, 85, 86, 87,
115 88, 89, 90,123,124,125,126,127,
116 128,129,130,131,132,133,134,135,
117 136,137,138,139,140,141,142,143,
118 144,145,146,147,148,149,150,151,
119 152,153,154,155,156,157,158,159,
120 160,161,162,163,164,165,166,167,
121 168,169,170,171,172,173,174,175,
122 176,177,178,179,180,181,182,183,
123 184,185,186,187,188,189,190,191,
124 192,193,194,195,196,197,198,199,
125 200,201,202,203,204,205,206,207,
126 208,209,210,211,212,213,214,215,
127 216,217,218,219,220,221,222,223,
128 224,225,226,227,228,229,230,231,
129 232,233,234,235,236,237,238,239,
130 240,241,242,243,244,245,246,247,
131 248,249,250,251,252,253,254,255 };
132
Guido van Rossum50700601997-12-08 17:15:20 +0000133/* This table contains bit maps for digits, letters, 'word' chars, and
134white space. Each map is 32 bytes long and the bits run from the least
135significant end of each byte. */
136
137unsigned char pcre_cbits[] = {
138 0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
139 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
140 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
141 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
142
143 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
144 0xfe,0xff,0xff,0x07,0xfe,0xff,0xff,0x07,
145 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
146 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
147
148 0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
149 0xfe,0xff,0xff,0x87,0xfe,0xff,0xff,0x07,
150 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
151 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
152
153 0x00,0x3e,0x00,0x00,0x01,0x00,0x00,0x00,
154 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
155 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
156 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 };
157
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000158/* This table identifies various classes of character by individual bits:
Guido van Rossum50700601997-12-08 17:15:20 +0000159 0x01 white space character
160 0x02 letter
161 0x04 decimal digit
162 0x08 hexadecimal digit
163 0x10 alphanumeric or '_'
164 0x80 regular expression metacharacter or binary zero
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000165*/
166
167unsigned char pcre_ctypes[] = {
168 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
169 0x00,0x01,0x01,0x01,0x01,0x01,0x00,0x00, /* 8- 15 */
170 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
171 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
172 0x01,0x00,0x00,0x00,0x80,0x00,0x00,0x00, /* - ' */
173 0x80,0x80,0x80,0x80,0x00,0x00,0x80,0x00, /* ( - / */
Guido van Rossum50700601997-12-08 17:15:20 +0000174 0x3c,0x3c,0x3c,0x3c,0x3c,0x3c,0x3c,0x3c, /* 0 - 7 */
175 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x80, /* 8 - ? */
176 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* @ - G */
177 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* H - O */
178 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* P - W */
179 0x12,0x12,0x12,0x80,0x00,0x00,0x80,0x10, /* X - _ */
180 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* ` - g */
181 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* h - o */
182 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* p - w */
183 0x12,0x12,0x12,0x80,0x80,0x00,0x00,0x00, /* x -127 */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000184 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
185 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
186 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
187 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
188 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
189 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
190 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
191 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
192 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
193 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
194 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
195 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
196 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
197 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
198 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
199 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
200
Guido van Rossum50700601997-12-08 17:15:20 +0000201/* End of chartables.c */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000202/*************************************************
203* Perl-Compatible Regular Expressions *
204*************************************************/
205
206/*
207This is a library of functions to support regular expressions whose syntax
208and semantics are as close as possible to those of the Perl 5 language. See
209the file Tech.Notes for some information on the internals.
210
211Written by: Philip Hazel <ph10@cam.ac.uk>
212
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000213 Copyright (c) 1998 University of Cambridge
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000214
215-----------------------------------------------------------------------------
216Permission is granted to anyone to use this software for any purpose on any
217computer system, and to redistribute it freely, subject to the following
218restrictions:
219
2201. This software is distributed in the hope that it will be useful,
221 but WITHOUT ANY WARRANTY; without even the implied warranty of
222 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
223
2242. The origin of this software must not be misrepresented, either by
225 explicit claim or by omission.
226
2273. Altered versions must be plainly marked as such, and must not be
228 misrepresented as being the original software.
229-----------------------------------------------------------------------------
230*/
231
232
233/* Include the internals header, which itself includes Standard C headers plus
234the external pcre header. */
235
236
237
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000238
239/*************************************************
240* Create bitmap of starting chars *
241*************************************************/
242
243/* This function scans a compiled unanchored expression and attempts to build a
244bitmap of the set of initial characters. If it can't, it returns FALSE. As time
245goes by, we may be able to get more clever at doing this.
246
247Arguments:
248 code points to an expression
249 start_bits points to a 32-byte table, initialized to 0
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000250
251Returns: TRUE if table built, FALSE otherwise
252*/
253
254static BOOL
Guido van Rossum58132c61997-12-17 00:24:13 +0000255set_start_bits(const uschar *code, uschar *start_bits)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000256{
Guido van Rossum50700601997-12-08 17:15:20 +0000257register int c;
Guido van Rossum95864d31998-12-21 18:35:49 +0000258volatile int dummy;
Guido van Rossum50700601997-12-08 17:15:20 +0000259
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000260do
261 {
Guido van Rossum58132c61997-12-17 00:24:13 +0000262 const uschar *tcode = code + 3;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000263 BOOL try_next = TRUE;
264
265 while (try_next)
266 {
267 try_next = FALSE;
268
269 if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
270 {
Guido van Rossum50700601997-12-08 17:15:20 +0000271 if (!set_start_bits(tcode, start_bits)) return FALSE;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000272 }
273
274 else switch(*tcode)
275 {
276 default:
277 return FALSE;
278
279 /* BRAZERO does the bracket, but carries on. */
280
281 case OP_BRAZERO:
282 case OP_BRAMINZERO:
Guido van Rossum50700601997-12-08 17:15:20 +0000283 if (!set_start_bits(++tcode, start_bits)) return FALSE;
Guido van Rossum95864d31998-12-21 18:35:49 +0000284 dummy = 1;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000285 do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
286 tcode += 3;
287 try_next = TRUE;
288 break;
289
290 /* Single-char * or ? sets the bit and tries the next item */
291
292 case OP_STAR:
293 case OP_MINSTAR:
294 case OP_QUERY:
295 case OP_MINQUERY:
Guido van Rossum50700601997-12-08 17:15:20 +0000296 start_bits[tcode[1]/8] |= (1 << (tcode[1]&7));
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000297 tcode += 2;
298 try_next = TRUE;
299 break;
300
301 /* Single-char upto sets the bit and tries the next */
302
303 case OP_UPTO:
304 case OP_MINUPTO:
Guido van Rossum50700601997-12-08 17:15:20 +0000305 start_bits[tcode[3]/8] |= (1 << (tcode[3]&7));
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000306 tcode += 4;
307 try_next = TRUE;
308 break;
309
310 /* At least one single char sets the bit and stops */
311
312 case OP_EXACT: /* Fall through */
313 tcode++;
314
315 case OP_CHARS: /* Fall through */
316 tcode++;
317
318 case OP_PLUS:
319 case OP_MINPLUS:
Guido van Rossum50700601997-12-08 17:15:20 +0000320 start_bits[tcode[1]/8] |= (1 << (tcode[1]&7));
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000321 break;
322
323 /* Single character type sets the bits and stops */
324
325 case OP_NOT_DIGIT:
Guido van Rossum50700601997-12-08 17:15:20 +0000326 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_digit];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000327 break;
328
329 case OP_DIGIT:
Guido van Rossum50700601997-12-08 17:15:20 +0000330 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_digit];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000331 break;
332
333 case OP_NOT_WHITESPACE:
Guido van Rossum50700601997-12-08 17:15:20 +0000334 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_space];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000335 break;
336
337 case OP_WHITESPACE:
Guido van Rossum50700601997-12-08 17:15:20 +0000338 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_space];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000339 break;
340
341 case OP_NOT_WORDCHAR:
Guido van Rossum50700601997-12-08 17:15:20 +0000342 for (c = 0; c < 32; c++)
343 start_bits[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000344 break;
345
346 case OP_WORDCHAR:
Guido van Rossum50700601997-12-08 17:15:20 +0000347 for (c = 0; c < 32; c++)
348 start_bits[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000349 break;
350
351 /* One or more character type fudges the pointer and restarts, knowing
352 it will hit a single character type and stop there. */
353
354 case OP_TYPEPLUS:
355 case OP_TYPEMINPLUS:
356 tcode++;
357 try_next = TRUE;
358 break;
359
360 case OP_TYPEEXACT:
361 tcode += 3;
362 try_next = TRUE;
363 break;
364
365 /* Zero or more repeats of character types set the bits and then
366 try again. */
367
368 case OP_TYPEUPTO:
369 case OP_TYPEMINUPTO:
370 tcode += 2; /* Fall through */
371
372 case OP_TYPESTAR:
373 case OP_TYPEMINSTAR:
374 case OP_TYPEQUERY:
375 case OP_TYPEMINQUERY:
376 switch(tcode[1])
377 {
378 case OP_NOT_DIGIT:
Guido van Rossum50700601997-12-08 17:15:20 +0000379 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_digit];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000380 break;
381
382 case OP_DIGIT:
Guido van Rossum50700601997-12-08 17:15:20 +0000383 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_digit];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000384 break;
385
386 case OP_NOT_WHITESPACE:
Guido van Rossum50700601997-12-08 17:15:20 +0000387 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_space];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000388 break;
389
390 case OP_WHITESPACE:
Guido van Rossum50700601997-12-08 17:15:20 +0000391 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_space];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000392 break;
393
394 case OP_NOT_WORDCHAR:
Guido van Rossum50700601997-12-08 17:15:20 +0000395 for (c = 0; c < 32; c++)
396 start_bits[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000397 break;
398
399 case OP_WORDCHAR:
Guido van Rossum50700601997-12-08 17:15:20 +0000400 for (c = 0; c < 32; c++)
401 start_bits[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000402 break;
403 }
404
405 tcode += 2;
406 try_next = TRUE;
407 break;
408
409 /* Character class: set the bits and either carry on or not,
410 according to the repeat count. */
411
412 case OP_CLASS:
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000413 case OP_NEGCLASS:
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000414 {
Guido van Rossum50700601997-12-08 17:15:20 +0000415 tcode++;
416 for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
417 tcode += 32;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000418 switch (*tcode)
419 {
420 case OP_CRSTAR:
421 case OP_CRMINSTAR:
422 case OP_CRQUERY:
423 case OP_CRMINQUERY:
424 tcode++;
425 try_next = TRUE;
426 break;
427
428 case OP_CRRANGE:
429 case OP_CRMINRANGE:
430 if (((tcode[1] << 8) + tcode[2]) == 0)
431 {
432 tcode += 5;
433 try_next = TRUE;
434 }
435 break;
436 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000437 }
438 break; /* End of class handling */
439
440 } /* End of switch */
441 } /* End of try_next loop */
442
443 code += (code[1] << 8) + code[2]; /* Advance to next branch */
444 }
445while (*code == OP_ALT);
446return TRUE;
447}
448
449
450
451/*************************************************
452* Study a compiled expression *
453*************************************************/
454
455/* This function is handed a compiled expression that it must study to produce
456information that will speed up the matching. It returns a pcre_extra block
457which then gets handed back to pcre_exec().
458
459Arguments:
460 re points to the compiled expression
461 options contains option bits
462 errorptr points to where to place error messages;
463 set NULL unless error
464
465Returns: pointer to a pcre_extra block,
466 NULL on error or if no optimization possible
467*/
468
469pcre_extra *
Guido van Rossum58132c61997-12-17 00:24:13 +0000470pcre_study(const pcre *external_re, int options, const char **errorptr)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000471{
472BOOL caseless;
473uschar start_bits[32];
474real_pcre_extra *extra;
Guido van Rossum58132c61997-12-17 00:24:13 +0000475const real_pcre *re = (const real_pcre *)external_re;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000476
477*errorptr = NULL;
478
479if (re == NULL || re->magic_number != MAGIC_NUMBER)
480 {
481 *errorptr = "argument is not a compiled regular expression";
482 return NULL;
483 }
484
485if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
486 {
487 *errorptr = "unknown or incorrect option bit(s) set";
488 return NULL;
489 }
490
Guido van Rossum50700601997-12-08 17:15:20 +0000491/* Caseless can either be from the compiled regex or from options. */
492
493caseless = ((re->options | options) & PCRE_CASELESS) != 0;
494
Thomas Wouters7e474022000-07-16 12:04:32 +0000495/* For an anchored pattern, or an unanchored pattern that has a first char, or a
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000496multiline pattern that matches only at "line starts", no further processing at
497present. */
498
499if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
500 return NULL;
501
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000502/* See if we can find a fixed set of initial characters for the pattern. */
503
Guido van Rossum50700601997-12-08 17:15:20 +0000504memset(start_bits, 0, 32 * sizeof(uschar));
505if (!set_start_bits(re->code, start_bits)) return NULL;
506
507/* If this studying is caseless, scan the created bit map and duplicate the
508bits for any letters. */
509
510if (caseless)
511 {
512 register int c;
513 for (c = 0; c < 256; c++)
514 {
515 if ((start_bits[c/8] & (1 << (c&7))) != 0 &&
516 (pcre_ctypes[c] & ctype_letter) != 0)
517 {
518 int d = pcre_fcc[c];
519 start_bits[d/8] |= (1 << (d&7));
520 }
521 }
522 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000523
524/* Get an "extra" block and put the information therein. */
525
526extra = (real_pcre_extra *)(pcre_malloc)(sizeof(real_pcre_extra));
527
528if (extra == NULL)
529 {
530 *errorptr = "failed to get memory";
531 return NULL;
532 }
Guido van Rossum50700601997-12-08 17:15:20 +0000533
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000534extra->options = PCRE_STUDY_MAPPED | (caseless? PCRE_STUDY_CASELESS : 0);
Guido van Rossum50700601997-12-08 17:15:20 +0000535memcpy(extra->start_bits, start_bits, sizeof(start_bits));
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000536
537return (pcre_extra *)extra;
538}
539
Guido van Rossum50700601997-12-08 17:15:20 +0000540/* End of study.c */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000541/*************************************************
542* Perl-Compatible Regular Expressions *
543*************************************************/
544
545/*
546This is a library of functions to support regular expressions whose syntax
547and semantics are as close as possible to those of the Perl 5 language. See
548the file Tech.Notes for some information on the internals.
549
550Written by: Philip Hazel <ph10@cam.ac.uk>
551
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000552 Copyright (c) 1998 University of Cambridge
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000553
554-----------------------------------------------------------------------------
555Permission is granted to anyone to use this software for any purpose on any
556computer system, and to redistribute it freely, subject to the following
557restrictions:
558
5591. This software is distributed in the hope that it will be useful,
560 but WITHOUT ANY WARRANTY; without even the implied warranty of
561 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
562
5632. The origin of this software must not be misrepresented, either by
564 explicit claim or by omission.
565
5663. Altered versions must be plainly marked as such, and must not be
567 misrepresented as being the original software.
568-----------------------------------------------------------------------------
569*/
570
571
572/* Define DEBUG to get debugging output on stdout. */
573
574/* #define DEBUG */
575
Guido van Rossum557dea11997-12-22 22:46:52 +0000576/* Use a macro for debugging printing, 'cause that eliminates the the use
577of #ifdef inline, and there are *still* stupid compilers about that don't like
578indented pre-processor statements. I suppose it's only been 10 years... */
579
Jack Jansenc5601f42002-06-26 20:41:30 +0000580#undef DPRINTF
Guido van Rossum557dea11997-12-22 22:46:52 +0000581#ifdef DEBUG
582#define DPRINTF(p) printf p
583#else
584#define DPRINTF(p) /*nothing*/
585#endif
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000586
587/* Include the internals header, which itself includes Standard C headers plus
588the external pcre header. */
589
590
Guido van Rossum50700601997-12-08 17:15:20 +0000591
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000592
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000593#ifndef Py_eval_input
594/* For Python 1.4, graminit.h has to be explicitly included */
595#define Py_eval_input eval_input
Guido van Rossum50700601997-12-08 17:15:20 +0000596
597#endif /* FOR_PYTHON */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000598
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000599/* Allow compilation as C++ source code, should anybody want to do that. */
600
601#ifdef __cplusplus
602#define class pcre_class
603#endif
604
605
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000606/* Min and max values for the common repeats; for the maxima, 0 => infinity */
607
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000608static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
609static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000610
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000611/* Text forms of OP_ values and things, for debugging (not all used) */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000612
613#ifdef DEBUG
Guido van Rossum58132c61997-12-17 00:24:13 +0000614static const char *OP_names[] = {
615 "End", "\\A", "\\B", "\\b", "\\D", "\\d",
Guido van Rossum50700601997-12-08 17:15:20 +0000616 "\\S", "\\s", "\\W", "\\w", "Cut", "\\Z",
617 "localized \\B", "localized \\b", "localized \\W", "localized \\w",
618 "^", "$", "Any", "chars",
619 "not",
620 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000621 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
622 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
623 "*", "*?", "+", "+?", "?", "??", "{", "{",
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000624 "class", "negclass", "classL", "Ref",
Guido van Rossum50700601997-12-08 17:15:20 +0000625 "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", "Once",
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000626 "Brazero", "Braminzero", "Bra"
627};
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000628#endif
629
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000630/* Table for handling escaped characters in the range '0'-'z'. Positive returns
631are simple data values; negative values are for special things like \d and so
632on. Zero means further processing is needed (for things like \x), or the escape
633is invalid. */
634
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000635static const short int escapes[] = {
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000636 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
637 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
638 '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
639 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
640 0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
641 0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
642 '`', 7, -ESC_b, 0, -ESC_d, 0, '\f', 0, /* ` - g */
643 0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
644 0, 0, '\r', -ESC_s, '\t', 0, '\v', -ESC_w, /* p - w */
645 0, 0, 0 /* x - z */
646};
647
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000648/* Definition to allow mutual recursion */
649
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000650static BOOL
651compile_regex(int, int *, uschar **, const uschar **, const char **,
652 PyObject *);
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000653
654/* Structure for passing "static" information around between the functions
655doing the matching, so that they are thread-safe. */
656
657typedef struct match_data {
658 int errorcode; /* As it says */
659 int *offset_vector; /* Offset vector */
660 int offset_end; /* One past the end */
661 BOOL offset_overflow; /* Set if too many extractions */
662 BOOL caseless; /* Case-independent flag */
Guido van Rossum50700601997-12-08 17:15:20 +0000663 BOOL runtime_caseless; /* Caseless forced at run time */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000664 BOOL multiline; /* Multiline flag */
Guido van Rossum50700601997-12-08 17:15:20 +0000665 BOOL notbol; /* NOTBOL flag */
666 BOOL noteol; /* NOTEOL flag */
667 BOOL dotall; /* Dot matches any char */
668 BOOL endonly; /* Dollar not before final \n */
Guido van Rossum58132c61997-12-17 00:24:13 +0000669 const uschar *start_subject; /* Start of the subject string */
670 const uschar *end_subject; /* End of the subject string */
Guido van Rossum50700601997-12-08 17:15:20 +0000671 jmp_buf fail_env; /* Environment for longjump() break out */
Guido van Rossum58132c61997-12-17 00:24:13 +0000672 const uschar *end_match_ptr; /* Subject position at end match */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000673 int end_offset_top; /* Highwater mark at end of match */
Guido van Rossum50700601997-12-08 17:15:20 +0000674 jmp_buf error_env; /* For longjmp() if an error occurs deep inside a
675 matching operation */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000676 int length; /* Length of the allocated stacks */
677 int point; /* Point to add next item pushed onto stacks */
678 /* Pointers to the 6 stacks */
679 int *off_num, *offset_top, *r1, *r2;
Guido van Rossum58132c61997-12-17 00:24:13 +0000680 const uschar **eptr, **ecode;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000681} match_data;
682
683
684
685/*************************************************
Guido van Rossum50700601997-12-08 17:15:20 +0000686* Global variables *
687*************************************************/
688
689/* PCRE is thread-clean and doesn't use any global variables in the normal
690sense. However, it calls memory allocation and free functions via the two
691indirections below, which are can be changed by the caller, but are shared
692between all threads. */
693
694void *(*pcre_malloc)(size_t) = malloc;
695void (*pcre_free)(void *) = free;
696
697
698
699
700/*************************************************
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000701* Return version string *
702*************************************************/
703
Guido van Rossum58132c61997-12-17 00:24:13 +0000704const char *
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000705pcre_version(void)
706{
707return PCRE_VERSION;
708}
709
710
711
712
713/*************************************************
714* Return info about a compiled pattern *
715*************************************************/
716
717/* This function picks potentially useful data out of the private
718structure.
719
720Arguments:
721 external_re points to compiled code
722 optptr where to pass back the options
723 first_char where to pass back the first character,
724 or -1 if multiline and all branches start ^,
725 or -2 otherwise
726
727Returns: number of identifying extraction brackets
728 or negative values on error
729*/
730
731int
Guido van Rossum50700601997-12-08 17:15:20 +0000732pcre_info(const pcre *external_re, int *optptr, int *first_char)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000733{
Guido van Rossum58132c61997-12-17 00:24:13 +0000734const real_pcre *re = (real_pcre *)external_re;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000735if (re == NULL) return PCRE_ERROR_NULL;
736if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
737if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);
738if (first_char != NULL)
739 *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
740 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
741return re->top_bracket;
742}
743
744
745
746
747#ifdef DEBUG
748/*************************************************
749* Debugging function to print chars *
750*************************************************/
751
752/* Print a sequence of chars in printable format, stopping at the end of the
753subject if the requested.
754
755Arguments:
756 p points to characters
757 length number to print
758 is_subject TRUE if printing from within md->start_subject
759 md pointer to matching data block, if is_subject is TRUE
760
761Returns: nothing
762*/
763
Guido van Rossum557dea11997-12-22 22:46:52 +0000764static void
765pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000766{
767int c;
768if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
769while (length-- > 0)
770 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
771}
772#endif
773
774
775
776
777/*************************************************
778* Check subpattern for empty operand *
779*************************************************/
780
781/* This function checks a bracketed subpattern to see if any of the paths
782through it could match an empty string. This is used to diagnose an error if
783such a subpattern is followed by a quantifier with an unlimited upper bound.
784
785Argument:
786 code points to the opening bracket
787
788Returns: TRUE or FALSE
789*/
790
791static BOOL
792could_be_empty(uschar *code)
793{
794do {
795 uschar *cc = code + 3;
796
797 /* Scan along the opcodes for this branch; as soon as we find something
798 that matches a non-empty string, break out and advance to test the next
799 branch. If we get to the end of the branch, return TRUE for the whole
800 sub-expression. */
801
802 for (;;)
803 {
804 /* Test an embedded subpattern; if it could not be empty, break the
805 loop. Otherwise carry on in the branch. */
806
Guido van Rossum50700601997-12-08 17:15:20 +0000807 if ((int)(*cc) >= OP_BRA || (int)(*cc) == OP_ONCE)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000808 {
809 if (!could_be_empty(cc)) break;
810 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
811 cc += 3;
812 }
813
814 else switch (*cc)
815 {
816 /* Reached end of a branch: the subpattern may match the empty string */
817
818 case OP_ALT:
819 case OP_KET:
820 case OP_KETRMAX:
821 case OP_KETRMIN:
822 return TRUE;
823
Guido van Rossumd0f432b1998-02-20 21:45:14 +0000824 /* Skip over entire bracket groups with zero lower bound */
825
826 case OP_BRAZERO:
827 case OP_BRAMINZERO:
828 cc++;
829 /* Fall through */
830
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000831 /* Skip over assertive subpatterns */
832
833 case OP_ASSERT:
834 case OP_ASSERT_NOT:
835 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
836 cc += 3;
837 break;
838
839 /* Skip over things that don't match chars */
840
841 case OP_SOD:
842 case OP_EOD:
843 case OP_CIRC:
844 case OP_DOLL:
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000845 case OP_NOT_WORD_BOUNDARY:
846 case OP_WORD_BOUNDARY:
Guido van Rossum50700601997-12-08 17:15:20 +0000847 case OP_NOT_WORD_BOUNDARY_L:
848 case OP_WORD_BOUNDARY_L:
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000849 cc++;
850 break;
851
852 /* Skip over simple repeats with zero lower bound */
853
854 case OP_STAR:
855 case OP_MINSTAR:
856 case OP_QUERY:
857 case OP_MINQUERY:
Guido van Rossum50700601997-12-08 17:15:20 +0000858 case OP_NOTSTAR:
859 case OP_NOTMINSTAR:
860 case OP_NOTQUERY:
861 case OP_NOTMINQUERY:
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000862 case OP_TYPESTAR:
863 case OP_TYPEMINSTAR:
864 case OP_TYPEQUERY:
865 case OP_TYPEMINQUERY:
866 cc += 2;
867 break;
868
869 /* Skip over UPTOs (lower bound is zero) */
870
871 case OP_UPTO:
872 case OP_MINUPTO:
873 case OP_TYPEUPTO:
874 case OP_TYPEMINUPTO:
875 cc += 4;
876 break;
877
878 /* Check a class or a back reference for a zero minimum */
879
880 case OP_CLASS:
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000881 case OP_NEGCLASS:
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000882 case OP_REF:
Guido van Rossum50700601997-12-08 17:15:20 +0000883 case OP_CLASS_L:
884 switch(*cc)
885 {
886 case (OP_REF): cc += 2; break;
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000887 case (OP_CLASS): case (OP_NEGCLASS): cc += 1+32; break;
Guido van Rossum50700601997-12-08 17:15:20 +0000888 case (OP_CLASS_L): cc += 1+1+32; break;
889 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000890
891 switch (*cc)
892 {
893 case OP_CRSTAR:
894 case OP_CRMINSTAR:
895 case OP_CRQUERY:
896 case OP_CRMINQUERY:
897 cc++;
898 break;
899
900 case OP_CRRANGE:
901 case OP_CRMINRANGE:
902 if ((cc[1] << 8) + cc[2] != 0) goto NEXT_BRANCH;
903 cc += 3;
904 break;
905
906 default:
907 goto NEXT_BRANCH;
908 }
909 break;
910
911 /* Anything else matches at least one character */
912
913 default:
914 goto NEXT_BRANCH;
915 }
916 }
917
918 NEXT_BRANCH:
919 code += (code[1] << 8) + code[2];
920 }
921while (*code == OP_ALT);
922
923/* No branches match the empty string */
924
925return FALSE;
926}
927
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000928/* Determine the length of a group ID in an expression like
929 (?P<foo_123>...)
930Arguments:
931 ptr pattern position pointer (say that 3 times fast)
932 finalchar the character that will mark the end of the ID
933 errorptr points to the pointer to the error message
934*/
935
936static int
Guido van Rossum58132c61997-12-17 00:24:13 +0000937get_group_id(const uschar *ptr, char finalchar, const char **errorptr)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000938{
Guido van Rossum58132c61997-12-17 00:24:13 +0000939 const uschar *start = ptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000940
941 /* If the first character is not in \w, or is in \w but is a digit,
942 report an error */
943 if (!(pcre_ctypes[*ptr] & ctype_word) ||
944 (pcre_ctypes[*ptr++] & ctype_digit))
945 {
946 *errorptr = "(?P identifier must start with a letter or underscore";
947 return 0;
948 }
949
950 /* Increment ptr until we either hit a null byte, the desired
951 final character, or a non-word character */
952 for(; (*ptr != 0) && (*ptr != finalchar) &&
953 (pcre_ctypes[*ptr] & ctype_word); ptr++)
954 {
Guido van Rossumc3861071997-10-08 02:07:40 +0000955 /* Empty loop body */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000956 }
957 if (*ptr==finalchar)
958 return ptr-start;
959 if (*ptr==0)
960 {
961 *errorptr = "unterminated (?P identifier";
962 return 0;
963 }
964 *errorptr = "illegal character in (?P identifier";
965 return 0;
966}
967
968/*************************************************
969* Handle escapes *
970*************************************************/
971
972/* This function is called when a \ has been encountered. It either returns a
973positive value for a simple escape such as \n, or a negative value which
974encodes one of the more complicated things such as \d. On entry, ptr is
975pointing at the \. On exit, it is on the final character of the escape
976sequence.
977
978Arguments:
979 ptrptr points to the pattern position pointer
980 errorptr points to the pointer to the error message
Guido van Rossum50700601997-12-08 17:15:20 +0000981 bracount number of previous extracting brackets
982 options the options bits
983 isclass TRUE if inside a character class
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000984
985Returns: zero or positive => a data character
986 negative => a special escape sequence
987 on error, errorptr is set
988*/
989
990static int
Guido van Rossum58132c61997-12-17 00:24:13 +0000991check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
992 int options, BOOL isclass)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000993{
Guido van Rossum58132c61997-12-17 00:24:13 +0000994const uschar *ptr = *ptrptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000995int c = *(++ptr) & 255; /* Ensure > 0 on signed-char systems */
996int i;
997
Guido van Rossum50700601997-12-08 17:15:20 +0000998if (c == 0) *errorptr = ERR1;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000999
1000/* Digits or letters may have special meaning; all others are literals. */
1001
1002else if (c < '0' || c > 'z') {}
1003
1004/* Do an initial lookup in a table. A non-zero result is something that can be
1005returned immediately. Otherwise further processing may be required. */
1006
1007else if ((i = escapes[c - '0']) != 0) c = i;
1008
1009/* Escapes that need further processing, or are illegal. */
1010
Guido van Rossum50700601997-12-08 17:15:20 +00001011else
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001012 {
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001013
Guido van Rossum50700601997-12-08 17:15:20 +00001014 switch (c)
1015 {
1016 /* The handling of escape sequences consisting of a string of digits
1017 starting with one that is not zero is not straightforward. By experiment,
1018 the way Perl works seems to be as follows:
1019
1020 Outside a character class, the digits are read as a decimal number. If the
1021 number is less than 10, or if there are that many previous extracting
1022 left brackets, then it is a back reference. Otherwise, up to three octal
1023 digits are read to form an escaped byte. Thus \123 is likely to be octal
1024 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
1025 value is greater than 377, the least significant 8 bits are taken. Inside a
1026 character class, \ followed by a digit is always an octal number. */
1027
1028 case '1': case '2': case '3': case '4': case '5':
1029 case '6': case '7': case '8': case '9':
1030
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001031 {
1032 /* PYTHON: Try to compute an octal value for a character */
Guido van Rossum042ff9e1998-04-03 21:13:31 +00001033 for(c=0, i=0; ptr[i]!=0 && i<3; i++)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001034 {
1035 if (( pcre_ctypes[ ptr[i] ] & ctype_odigit) != 0)
Andrew M. Kuchling3b96d0b2000-08-02 13:41:18 +00001036 c = (c * 8 + ptr[i]-'0') & 255;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001037 else
Guido van Rossum042ff9e1998-04-03 21:13:31 +00001038 break; /* Non-octal character--break out of the loop */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001039 }
Guido van Rossum042ff9e1998-04-03 21:13:31 +00001040 /* It's a character if there were exactly 3 octal digits, or if
1041 we're inside a character class and there was at least one
1042 octal digit. */
1043 if ( (i == 3) || (isclass && i!=0) )
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001044 {
1045 ptr += i-1;
1046 break;
1047 }
1048 c = ptr[0]; /* Restore the first character after the \ */
1049 c -= '0'; i = 1;
1050 while (i<2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0)
1051 {
1052 c = c * 10 + ptr[1] - '0';
1053 ptr++; i++;
1054 }
1055 if (c > 255 - ESC_REF) *errorptr = "back reference too big";
1056 c = -(ESC_REF + c);
1057 }
1058 break;
1059
Guido van Rossum50700601997-12-08 17:15:20 +00001060 /* \0 always starts an octal number, but we may drop through to here with a
1061 larger first octal digit */
1062
1063 case '0':
1064 c -= '0';
1065 while(i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0 &&
1066 ptr[1] != '8' && ptr[1] != '9')
Andrew M. Kuchling94c34522000-06-01 03:02:48 +00001067 c = (c * 8 + *(++ptr) - '0') & 255;
Guido van Rossum50700601997-12-08 17:15:20 +00001068 break;
1069
1070 /* Special escapes not starting with a digit are straightforward */
1071
1072 case 'x':
1073 c = 0;
1074 while ( (pcre_ctypes[ptr[1]] & ctype_xdigit) != 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001075 {
Guido van Rossum50700601997-12-08 17:15:20 +00001076 ptr++;
1077 c = c * 16 + pcre_lcc[*ptr] -
1078 (((pcre_ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
1079 c &= 255;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001080 }
1081 break;
1082
1083
Guido van Rossum50700601997-12-08 17:15:20 +00001084 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1085 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
1086 for Perl compatibility, it is a literal. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001087
Guido van Rossum50700601997-12-08 17:15:20 +00001088 default:
1089 if ((options & PCRE_EXTRA) != 0) switch(c)
1090 {
1091 case 'X':
1092 c = -ESC_X; /* This could be a lookup if it ever got into Perl */
1093 break;
1094
1095 default:
1096 *errorptr = ERR3;
1097 break;
1098 }
1099 break;
1100 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001101 }
1102
1103*ptrptr = ptr;
1104return c;
1105}
1106
1107
1108
1109/*************************************************
Guido van Rossum50700601997-12-08 17:15:20 +00001110* Check for counted repeat *
1111*************************************************/
1112
1113/* This function is called when a '{' is encountered in a place where it might
1114start a quantifier. It looks ahead to see if it really is a quantifier or not.
1115It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
1116where the ddds are digits.
1117
1118Arguments:
1119 p pointer to the first char after '{'
1120
1121Returns: TRUE or FALSE
1122*/
1123
1124static BOOL
Guido van Rossum58132c61997-12-17 00:24:13 +00001125is_counted_repeat(const uschar *p)
Guido van Rossum50700601997-12-08 17:15:20 +00001126{
1127if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
1128while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
1129if (*p == '}') return TRUE;
1130
1131if (*p++ != ',') return FALSE;
1132if (*p == '}') return TRUE;
1133
1134if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
1135while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
1136return (*p == '}');
1137}
1138
1139
1140
1141/*************************************************
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001142* Read repeat counts *
1143*************************************************/
1144
Guido van Rossum50700601997-12-08 17:15:20 +00001145/* Read an item of the form {n,m} and return the values. This is called only
1146after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1147so the syntax is guaranteed to be correct, but we need to check the values.
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001148
1149Arguments:
1150 p pointer to first char after '{'
1151 minp pointer to int for min
1152 maxp pointer to int for max
1153 returned as -1 if no max
1154 errorptr points to pointer to error message
1155
1156Returns: pointer to '}' on success;
1157 current ptr on error, with errorptr set
1158*/
1159
Guido van Rossum58132c61997-12-17 00:24:13 +00001160static const uschar *
1161read_repeat_counts(const uschar *p, int *minp, int *maxp, const char **errorptr)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001162{
1163int min = 0;
1164int max = -1;
1165
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001166while ((pcre_ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
1167
1168if (*p == '}') max = min; else
1169 {
Guido van Rossum50700601997-12-08 17:15:20 +00001170 if (*(++p) != '}')
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001171 {
1172 max = 0;
1173 while((pcre_ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001174 if (max < min)
1175 {
Guido van Rossum50700601997-12-08 17:15:20 +00001176 *errorptr = ERR4;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001177 return p;
1178 }
1179 }
1180 }
1181
1182/* Do paranoid checks, then fill in the required variables, and pass back the
1183pointer to the terminating '}'. */
1184
Guido van Rossum50700601997-12-08 17:15:20 +00001185if (min > 65535 || max > 65535)
1186 *errorptr = ERR5;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001187else
1188 {
1189 *minp = min;
1190 *maxp = max;
1191 }
1192return p;
1193}
1194
1195
1196
1197/*************************************************
1198* Compile one branch *
1199*************************************************/
1200
1201/* Scan the pattern, compiling it into the code vector.
1202
1203Arguments:
Guido van Rossum50700601997-12-08 17:15:20 +00001204 options the option bits
1205 bracket points to number of brackets used
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001206 code points to the pointer to the current code point
1207 ptrptr points to the current pattern pointer
1208 errorptr points to pointer to error message
1209
1210Returns: TRUE on success
1211 FALSE, with *errorptr set on error
1212*/
1213
1214static BOOL
Guido van Rossum50700601997-12-08 17:15:20 +00001215compile_branch(int options, int *brackets, uschar **codeptr,
Guido van Rossum58132c61997-12-17 00:24:13 +00001216 const uschar **ptrptr, const char **errorptr, PyObject *dictionary)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001217{
1218int repeat_type, op_type;
1219int repeat_min, repeat_max;
1220int bravalue, length;
Guido van Rossumdda66961998-05-07 15:32:44 +00001221int greedy_default, greedy_non_default;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001222register int c;
1223register uschar *code = *codeptr;
Guido van Rossum58132c61997-12-17 00:24:13 +00001224const uschar *ptr = *ptrptr;
1225const uschar *oldptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001226uschar *previous = NULL;
Guido van Rossum50700601997-12-08 17:15:20 +00001227uschar class[32];
1228uschar *class_flag; /* Pointer to the single-byte flag for OP_CLASS_L */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001229
Guido van Rossumdda66961998-05-07 15:32:44 +00001230/* Set up the default and non-default settings for greediness */
1231
1232greedy_default = ((options & PCRE_UNGREEDY) != 0);
1233greedy_non_default = greedy_default ^ 1;
1234
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001235/* Switch on next character until the end of the branch */
1236
1237for (;; ptr++)
1238 {
Guido van Rossum50700601997-12-08 17:15:20 +00001239 BOOL negate_class;
1240 int class_charcount;
1241 int class_lastchar;
1242
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001243 c = *ptr;
Guido van Rossum50700601997-12-08 17:15:20 +00001244 if ((options & PCRE_EXTENDED) != 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001245 {
1246 if ((pcre_ctypes[c] & ctype_space) != 0) continue;
1247 if (c == '#')
1248 {
1249 while ((c = *(++ptr)) != 0 && c != '\n');
1250 continue;
1251 }
1252 }
1253
1254 switch(c)
1255 {
1256 /* The branch terminates at end of string, |, or ). */
1257
1258 case 0:
1259 case '|':
1260 case ')':
1261 *codeptr = code;
1262 *ptrptr = ptr;
1263 return TRUE;
1264
1265 /* Handle single-character metacharacters */
1266
1267 case '^':
1268 previous = NULL;
1269 *code++ = OP_CIRC;
1270 break;
1271
1272 case '$':
1273 previous = NULL;
1274 *code++ = OP_DOLL;
1275 break;
1276
1277 case '.':
1278 previous = code;
1279 *code++ = OP_ANY;
1280 break;
1281
Guido van Rossum50700601997-12-08 17:15:20 +00001282 /* Character classes. These always build a 32-byte bitmap of the permitted
1283 characters, except in the special case where there is only one character.
1284 For negated classes, we build the map as usual, then invert it at the end.
1285 */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001286
1287 case '[':
Guido van Rossum50700601997-12-08 17:15:20 +00001288 previous = code;
1289 if (options & PCRE_LOCALE)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001290 {
Guido van Rossum50700601997-12-08 17:15:20 +00001291 *code++ = OP_CLASS_L;
1292 /* Set the flag for localized classes (like \w) to 0 */
1293 class_flag = code;
1294 *class_flag = 0;
1295 }
1296 else
1297 {
1298 *code++ = OP_CLASS;
1299 class_flag = NULL;
1300 }
1301
Guido van Rossum042ff9e1998-04-03 21:13:31 +00001302 /* If the first character is '^', set the negation flag, and use a
1303 different opcode. This only matters if caseless matching is specified at
1304 runtime. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001305
Guido van Rossum50700601997-12-08 17:15:20 +00001306 if ((c = *(++ptr)) == '^')
1307 {
1308 negate_class = TRUE;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00001309 if (*(code-1)==OP_CLASS) *(code-1) = OP_NEGCLASS;
Guido van Rossum50700601997-12-08 17:15:20 +00001310 c = *(++ptr);
1311 }
1312 else negate_class = FALSE;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001313
Guido van Rossum50700601997-12-08 17:15:20 +00001314 /* Keep a count of chars so that we can optimize the case of just a single
1315 character. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001316
Guido van Rossum50700601997-12-08 17:15:20 +00001317 class_charcount = 0;
1318 class_lastchar = -1;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001319
Guido van Rossum50700601997-12-08 17:15:20 +00001320 /* Initialize the 32-char bit map to all zeros. We have to build the
1321 map in a temporary bit of store, in case the class contains only 1
1322 character, because in that case the compiled code doesn't use the
1323 bit map. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001324
Guido van Rossum50700601997-12-08 17:15:20 +00001325 memset(class, 0, 32 * sizeof(uschar));
1326
1327 /* Process characters until ] is reached. By writing this as a "do" it
1328 means that an initial ] is taken as a data character. */
1329
1330 do
1331 {
1332 if (c == 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001333 {
Guido van Rossum50700601997-12-08 17:15:20 +00001334 *errorptr = ERR6;
1335 goto FAILED;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001336 }
1337
Guido van Rossum50700601997-12-08 17:15:20 +00001338 /* Backslash may introduce a single character, or it may introduce one
1339 of the specials, which just set a flag. Escaped items are checked for
1340 validity in the pre-compiling pass. The sequence \b is a special case.
Guido van Rossum58132c61997-12-17 00:24:13 +00001341 Inside a class (and only there) it is treated as backspace. Elsewhere
Guido van Rossum50700601997-12-08 17:15:20 +00001342 it marks a word boundary. Other escapes have preset maps ready to
1343 or into the one we are building. We assume they have more than one
1344 character in them, so set class_count bigger than one. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001345
Guido van Rossum50700601997-12-08 17:15:20 +00001346 if (c == '\\')
1347 {
1348 c = check_escape(&ptr, errorptr, *brackets, options, TRUE);
1349 if (-c == ESC_b) c = '\b';
1350 else if (c < 0)
1351 {
1352 class_charcount = 10;
1353 switch (-c)
1354 {
1355 case ESC_d:
Guido van Rossum50700601997-12-08 17:15:20 +00001356 {
1357 for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_digit];
1358 }
1359 continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001360
Guido van Rossum50700601997-12-08 17:15:20 +00001361 case ESC_D:
Guido van Rossum50700601997-12-08 17:15:20 +00001362 {
1363 for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_digit];
1364 }
1365 continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001366
Guido van Rossum50700601997-12-08 17:15:20 +00001367 case ESC_w:
1368 if (options & PCRE_LOCALE)
1369 {
1370 *class_flag |= 1;
1371 }
1372 else
1373 {
1374 for (c = 0; c < 32; c++)
1375 class[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
1376 }
1377 continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001378
Guido van Rossum50700601997-12-08 17:15:20 +00001379 case ESC_W:
1380 if (options & PCRE_LOCALE)
1381 {
1382 *class_flag |= 2;
1383 }
1384 else
1385 {
1386 for (c = 0; c < 32; c++)
1387 class[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
1388 }
1389 continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001390
Guido van Rossum50700601997-12-08 17:15:20 +00001391 case ESC_s:
Guido van Rossum50700601997-12-08 17:15:20 +00001392 {
1393 for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_space];
1394 }
1395 continue;
1396
1397 case ESC_S:
Guido van Rossum50700601997-12-08 17:15:20 +00001398 {
1399 for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_space];
1400 }
1401 continue;
1402
1403 default:
1404 *errorptr = ERR7;
1405 goto FAILED;
1406 }
1407 }
1408 /* Fall through if single character */
1409 }
1410
1411 /* A single character may be followed by '-' to form a range. However,
1412 Perl does not permit ']' to be the end of the range. A '-' character
1413 here is treated as a literal. */
1414
1415 if (ptr[1] == '-' && ptr[2] != ']')
1416 {
1417 int d;
1418 ptr += 2;
1419 d = *ptr;
1420
1421 if (d == 0)
1422 {
1423 *errorptr = ERR6;
1424 goto FAILED;
1425 }
1426
1427 /* The second part of a range can be a single-character escape, but
1428 not any of the other escapes. */
1429
1430 if (d == '\\')
1431 {
1432 d = check_escape(&ptr, errorptr, *brackets, options, TRUE);
1433 if (d < 0)
1434 {
1435 if (d == -ESC_b) d = '\b'; else
1436 {
1437 *errorptr = ERR7;
1438 goto FAILED;
1439 }
1440 }
1441 }
1442
1443 if (d < c)
1444 {
1445 *errorptr = ERR8;
1446 goto FAILED;
1447 }
1448
1449 for (; c <= d; c++)
1450 {
1451 class[c/8] |= (1 << (c&7));
1452 if ((options & PCRE_CASELESS) != 0)
1453 {
1454 int uc = pcre_fcc[c]; /* flip case */
1455 class[uc/8] |= (1 << (uc&7));
1456 }
1457 class_charcount++; /* in case a one-char range */
1458 class_lastchar = c;
1459 }
1460 continue; /* Go get the next char in the class */
1461 }
1462
1463 /* Handle a lone single character - we can get here for a normal
1464 non-escape char, or after \ that introduces a single character. */
1465
1466 class [c/8] |= (1 << (c&7));
1467 if ((options & PCRE_CASELESS) != 0)
1468 {
1469 c = pcre_fcc[c]; /* flip case */
1470 class[c/8] |= (1 << (c&7));
1471 }
1472 class_charcount++;
1473 class_lastchar = c;
1474 }
1475
1476 /* Loop until ']' reached; the check for end of string happens inside the
1477 loop. This "while" is the end of the "do" above. */
1478
1479 while ((c = *(++ptr)) != ']');
1480
1481 /* If class_charcount is 1 and class_lastchar is not negative, we saw
1482 precisely one character. This doesn't need the whole 32-byte bit map.
1483 We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
1484 it's negative. */
1485
1486 if (class_charcount == 1 && class_lastchar >= 0)
1487 {
1488 if (negate_class)
1489 {
1490 code[-1] = OP_NOT;
1491 }
1492 else
1493 {
1494 code[-1] = OP_CHARS;
1495 *code++ = 1;
1496 }
1497 *code++ = class_lastchar;
1498 }
1499
1500 /* Otherwise, negate the 32-byte map if necessary, and copy it into
1501 the code vector. */
1502
1503 else
1504 {
1505 /* If this is a localized opcode, bump the code pointer up */
1506 if (class_flag) code++;
1507 if (negate_class)
1508 {
1509 if (class_flag) *class_flag = (*class_flag) ^ 63;
1510 for (c = 0; c < 32; c++) code[c] = ~class[c];
1511 }
1512 else
1513 memcpy(code, class, 32);
1514 code += 32;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001515 }
1516 break;
1517
1518 /* Various kinds of repeat */
1519
1520 case '{':
Guido van Rossum50700601997-12-08 17:15:20 +00001521 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001522 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
1523 if (*errorptr != NULL) goto FAILED;
1524 goto REPEAT;
1525
1526 case '*':
1527 repeat_min = 0;
1528 repeat_max = -1;
1529 goto REPEAT;
1530
1531 case '+':
1532 repeat_min = 1;
1533 repeat_max = -1;
1534 goto REPEAT;
1535
1536 case '?':
1537 repeat_min = 0;
1538 repeat_max = 1;
1539
1540 REPEAT:
1541 if (previous == NULL)
1542 {
Guido van Rossum50700601997-12-08 17:15:20 +00001543 *errorptr = ERR9;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001544 goto FAILED;
1545 }
1546
Guido van Rossumdda66961998-05-07 15:32:44 +00001547 /* If the next character is '?' this is a minimizing repeat, by default,
1548 but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001549 next character. */
1550
Guido van Rossumdda66961998-05-07 15:32:44 +00001551 if (ptr[1] == '?')
1552 { repeat_type = greedy_non_default; ptr++; }
1553 else repeat_type = greedy_default;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001554
Guido van Rossum50700601997-12-08 17:15:20 +00001555 /* If the maximum is zero then the minimum must also be zero; Perl allows
1556 this case, so we do too - by simply omitting the item altogether. */
1557
1558 if (repeat_max == 0) code = previous;
1559
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001560 /* If previous was a string of characters, chop off the last one and use it
1561 as the subject of the repeat. If there was only one character, we can
1562 abolish the previous item altogether. */
1563
Guido van Rossum50700601997-12-08 17:15:20 +00001564 else if (*previous == OP_CHARS)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001565 {
1566 int len = previous[1];
1567 if (len == 1)
1568 {
1569 c = previous[2];
1570 code = previous;
1571 }
1572 else
1573 {
1574 c = previous[len+1];
1575 previous[1]--;
1576 code--;
1577 }
1578 op_type = 0; /* Use single-char op codes */
1579 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
1580 }
1581
Guido van Rossum50700601997-12-08 17:15:20 +00001582 /* If previous was a single negated character ([^a] or similar), we use
1583 one of the special opcodes, replacing it. The code is shared with single-
1584 character repeats by adding a suitable offset into repeat_type. */
1585
1586 else if ((int)*previous == OP_NOT)
1587 {
1588 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
1589 c = previous[1];
1590 code = previous;
1591 goto OUTPUT_SINGLE_REPEAT;
1592 }
1593
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001594 /* If previous was a character type match (\d or similar), abolish it and
1595 create a suitable repeat item. The code is shared with single-character
1596 repeats by adding a suitable offset into repeat_type. */
1597
Guido van Rossum50700601997-12-08 17:15:20 +00001598 else if ((int)*previous < OP_CIRC || *previous == OP_ANY)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001599 {
1600 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
1601 c = *previous;
1602 code = previous;
1603
1604 OUTPUT_SINGLE_REPEAT:
1605 repeat_type += op_type; /* Combine both values for many cases */
1606
1607 /* A minimum of zero is handled either as the special case * or ?, or as
1608 an UPTO, with the maximum given. */
1609
1610 if (repeat_min == 0)
1611 {
1612 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
1613 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
1614 else
1615 {
1616 *code++ = OP_UPTO + repeat_type;
1617 *code++ = repeat_max >> 8;
1618 *code++ = (repeat_max & 255);
1619 }
1620 }
1621
1622 /* The case {1,} is handled as the special case + */
1623
1624 else if (repeat_min == 1 && repeat_max == -1)
1625 *code++ = OP_PLUS + repeat_type;
1626
1627 /* The case {n,n} is just an EXACT, while the general case {n,m} is
1628 handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
1629
1630 else
1631 {
1632 if (repeat_min != 1)
1633 {
1634 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
1635 *code++ = repeat_min >> 8;
1636 *code++ = (repeat_min & 255);
1637 }
1638
Thomas Wouters7e474022000-07-16 12:04:32 +00001639 /* If the minimum is 1 and the previous item was a character string,
1640 we either have to put back the item that got canceled if the string
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001641 length was 1, or add the character back onto the end of a longer
Guido van Rossumdda66961998-05-07 15:32:44 +00001642 string. For a character type nothing need be done; it will just get
1643 put back naturally. Note that the final character is always going to
1644 get added below. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001645
1646 else if (*previous == OP_CHARS)
1647 {
1648 if (code == previous) code += 2; else previous[1]++;
1649 }
1650
Guido van Rossumdda66961998-05-07 15:32:44 +00001651 /* For a single negated character we also have to put back the
Thomas Wouters7e474022000-07-16 12:04:32 +00001652 item that got canceled. */
Guido van Rossumdda66961998-05-07 15:32:44 +00001653
1654 else if (*previous == OP_NOT) code++;
1655
Guido van Rossum557dea11997-12-22 22:46:52 +00001656 /* If the maximum is unlimited, insert an OP_STAR. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001657
Guido van Rossum557dea11997-12-22 22:46:52 +00001658 if (repeat_max < 0)
1659 {
1660 *code++ = c;
1661 *code++ = OP_STAR + repeat_type;
1662 }
1663
1664 /* Else insert an UPTO if the max is greater than the min. */
1665
1666 else if (repeat_max != repeat_min)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001667 {
1668 *code++ = c;
1669 repeat_max -= repeat_min;
1670 *code++ = OP_UPTO + repeat_type;
1671 *code++ = repeat_max >> 8;
1672 *code++ = (repeat_max & 255);
1673 }
1674 }
1675
1676 /* The character or character type itself comes last in all cases. */
1677
1678 *code++ = c;
1679 }
1680
1681 /* If previous was a character class or a back reference, we put the repeat
1682 stuff after it. */
1683
Guido van Rossum042ff9e1998-04-03 21:13:31 +00001684 else if (*previous == OP_CLASS || *previous == OP_NEGCLASS ||
1685 *previous==OP_CLASS_L || *previous == OP_REF)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001686 {
1687 if (repeat_min == 0 && repeat_max == -1)
1688 *code++ = OP_CRSTAR + repeat_type;
1689 else if (repeat_min == 1 && repeat_max == -1)
1690 *code++ = OP_CRPLUS + repeat_type;
1691 else if (repeat_min == 0 && repeat_max == 1)
1692 *code++ = OP_CRQUERY + repeat_type;
1693 else
1694 {
1695 *code++ = OP_CRRANGE + repeat_type;
1696 *code++ = repeat_min >> 8;
1697 *code++ = repeat_min & 255;
1698 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
1699 *code++ = repeat_max >> 8;
1700 *code++ = repeat_max & 255;
1701 }
1702 }
1703
1704 /* If previous was a bracket group, we may have to replicate it in certain
1705 cases. If the maximum repeat count is unlimited, check that the bracket
1706 group cannot match the empty string, and diagnose an error if it can. */
1707
1708 else if ((int)*previous >= OP_BRA)
1709 {
1710 int i;
Guido van Rossum557dea11997-12-22 22:46:52 +00001711 int len = code - previous;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001712
1713 if (repeat_max == -1 && could_be_empty(previous))
1714 {
Guido van Rossum50700601997-12-08 17:15:20 +00001715 *errorptr = ERR10;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001716 goto FAILED;
1717 }
1718
1719 /* If the minimum is greater than zero, and the maximum is unlimited or
1720 equal to the minimum, the first copy remains where it is, and is
1721 replicated up to the minimum number of times. This case includes the +
1722 repeat, but of course no replication is needed in that case. */
1723
1724 if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))
1725 {
1726 for (i = 1; i < repeat_min; i++)
1727 {
Guido van Rossum557dea11997-12-22 22:46:52 +00001728 memcpy(code, previous, len);
1729 code += len;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001730 }
1731 }
1732
1733 /* If the minimum is zero, stick BRAZERO in front of the first copy.
1734 Then, if there is a fixed upper limit, replicated up to that many times,
1735 sticking BRAZERO in front of all the optional ones. */
1736
1737 else
1738 {
1739 if (repeat_min == 0)
1740 {
Guido van Rossum557dea11997-12-22 22:46:52 +00001741 memmove(previous+1, previous, len);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001742 code++;
1743 *previous++ = OP_BRAZERO + repeat_type;
1744 }
1745
1746 for (i = 1; i < repeat_min; i++)
1747 {
Guido van Rossum557dea11997-12-22 22:46:52 +00001748 memcpy(code, previous, len);
1749 code += len;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001750 }
1751
1752 for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)
1753 {
1754 *code++ = OP_BRAZERO + repeat_type;
Guido van Rossum557dea11997-12-22 22:46:52 +00001755 memcpy(code, previous, len);
1756 code += len;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001757 }
1758 }
1759
1760 /* If the maximum is unlimited, set a repeater in the final copy. */
1761
1762 if (repeat_max == -1) code[-3] = OP_KETRMAX + repeat_type;
1763 }
1764
1765 /* Else there's some kind of shambles */
1766
1767 else
1768 {
Guido van Rossum50700601997-12-08 17:15:20 +00001769 *errorptr = ERR11;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001770 goto FAILED;
1771 }
1772
1773 /* In all case we no longer have a previous item. */
1774
1775 previous = NULL;
1776 break;
1777
1778
1779 /* Start of nested bracket sub-expression, or comment or lookahead.
1780 First deal with special things that can come after a bracket; all are
1781 introduced by ?, and the appearance of any of them means that this is not a
1782 referencing group. They were checked for validity in the first pass over
1783 the string, so we don't have to check for syntax errors here. */
1784
1785 case '(':
1786 previous = code; /* Only real brackets can be repeated */
1787 if (*(++ptr) == '?')
1788 {
1789 bravalue = OP_BRA;
1790
1791 switch (*(++ptr))
1792 {
1793 case '#':
1794 case 'i':
Guido van Rossumbd49ac41997-12-10 23:05:53 +00001795 case 'L':
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001796 case 'm':
1797 case 's':
1798 case 'x':
1799 ptr++;
1800 while (*ptr != ')') ptr++;
1801 previous = NULL;
1802 continue;
1803
1804 case ':': /* Non-extracting bracket */
1805 ptr++;
1806 break;
1807
1808 case '=': /* Assertions can't be repeated */
1809 bravalue = OP_ASSERT;
1810 ptr++;
1811 previous = NULL;
1812 break;
1813
1814 case '!':
1815 bravalue = OP_ASSERT_NOT;
1816 ptr++;
1817 previous = NULL;
1818 break;
1819
1820 case ('P'):
1821 ptr++;
1822 if (*ptr=='<')
1823 {
1824 /* (?P<groupname>...) */
1825 int idlen;
1826 PyObject *string, *intobj;
1827
1828 ptr++;
1829 idlen = get_group_id(ptr, '>', errorptr);
1830 if (*errorptr) {
1831 goto FAILED;
1832 }
Guido van Rossum57ba4f31997-12-02 20:40:28 +00001833 string = PyString_FromStringAndSize((char*)ptr, idlen);
Guido van Rossum50700601997-12-08 17:15:20 +00001834 intobj = PyInt_FromLong( brackets[0] + 1 );
Guido van Rossum58132c61997-12-17 00:24:13 +00001835 if (intobj == NULL || string == NULL)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001836 {
1837 Py_XDECREF(string);
1838 Py_XDECREF(intobj);
1839 *errorptr = "exception raised";
1840 goto FAILED;
1841 }
1842 PyDict_SetItem(dictionary, string, intobj);
Guido van Rossum58132c61997-12-17 00:24:13 +00001843 Py_DECREF(string); Py_DECREF(intobj); /* XXX DECREF commented out! */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001844 ptr += idlen+1; /* Point to rest of expression */
1845 goto do_grouping_bracket;
1846 }
1847 if (*ptr=='=')
1848 {
1849 /* (?P=groupname) */
1850 int idlen, refnum;
1851 PyObject *string, *intobj;
1852
1853 ptr++;
1854 idlen = get_group_id(ptr, ')', errorptr);
1855 if (*errorptr) {
1856 goto FAILED;
1857 }
Guido van Rossum50700601997-12-08 17:15:20 +00001858 string = PyString_FromStringAndSize((char *)ptr, idlen);
Guido van Rossumc3861071997-10-08 02:07:40 +00001859 if (string==NULL) {
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001860 *errorptr = "exception raised";
1861 goto FAILED;
1862 }
1863 intobj = PyDict_GetItem(dictionary, string);
1864 if (intobj==NULL) {
Guido van Rossumc3861071997-10-08 02:07:40 +00001865 Py_DECREF(string);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001866 *errorptr = "?P= group identifier isn't defined";
1867 goto FAILED;
1868 }
1869
1870 refnum = PyInt_AsLong(intobj);
Guido van Rossum1eadb411997-12-15 17:33:24 +00001871 Py_DECREF(string);
Guido van Rossum58132c61997-12-17 00:24:13 +00001872 /* The caller doesn't own the reference to the value
1873 returned from PyDict_GetItem, so intobj is not
1874 DECREF'ed. */
1875
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001876 *code++ = OP_REF;
1877 *code++ = refnum;
1878 /* The continue will cause the top-level for() loop to
1879 be resumed, so ptr will be immediately incremented.
1880 Therefore, the following line adds just idlen, not
1881 idlen+1 */
1882 ptr += idlen;
1883 continue;
1884 }
1885 /* The character after ?P is neither < nor =, so
1886 report an error. Add more Python-extensions here. */
1887 *errorptr="unknown after (?P";
1888 goto FAILED;
Guido van Rossum50700601997-12-08 17:15:20 +00001889
1890 case '>': /* "Match once" brackets */
1891 if ((options & PCRE_EXTRA) != 0) /* Not yet standard */
1892 {
1893 bravalue = OP_ONCE;
1894 ptr++;
1895 previous = NULL;
1896 break;
1897 }
1898 /* Else fall through */
1899
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001900 default:
Guido van Rossum50700601997-12-08 17:15:20 +00001901 *errorptr = ERR12;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001902 goto FAILED;
1903 }
1904 }
1905
1906 /* Else we have a referencing group */
1907
1908 else
1909 {
1910 do_grouping_bracket:
Guido van Rossum50700601997-12-08 17:15:20 +00001911 if (++(*brackets) > EXTRACT_MAX)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001912 {
Guido van Rossum50700601997-12-08 17:15:20 +00001913 *errorptr = ERR13;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001914 goto FAILED;
1915 }
Guido van Rossum50700601997-12-08 17:15:20 +00001916 bravalue = OP_BRA + *brackets;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001917 }
1918
1919 /* Process nested bracketed re; at end pointer is on the bracket. We copy
1920 code into a non-register variable in order to be able to pass its address
1921 because some compilers complain otherwise. */
1922
1923 *code = bravalue;
1924 {
1925 uschar *mcode = code;
Guido van Rossum50700601997-12-08 17:15:20 +00001926 if (!compile_regex(options, brackets, &mcode, &ptr, errorptr, dictionary))
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001927 goto FAILED;
1928 code = mcode;
1929 }
1930
1931 if (*ptr != ')')
1932 {
Guido van Rossum50700601997-12-08 17:15:20 +00001933 *errorptr = ERR14;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001934 goto FAILED;
1935 }
1936 break;
1937
1938 /* Check \ for being a real metacharacter; if not, fall through and handle
1939 it as a data character at the start of a string. Escape items are checked
1940 for validity in the pre-compiling pass. */
1941
1942 case '\\':
1943 oldptr = ptr;
Guido van Rossum50700601997-12-08 17:15:20 +00001944 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001945
1946 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1947 are arranged to be the negation of the corresponding OP_values. For the
1948 back references, the values are ESC_REF plus the reference number. Only
1949 back references and those types that consume a character may be repeated.
1950 We can test for values between ESC_b and ESC_Z for the latter; this may
1951 have to change if any new ones are ever created. */
1952
1953 if (c < 0)
1954 {
1955 if (-c >= ESC_REF)
1956 {
Guido van Rossum50700601997-12-08 17:15:20 +00001957 int refnum = -c - ESC_REF;
1958 if (*brackets < refnum)
1959 {
1960 *errorptr = ERR15;
1961 goto FAILED;
1962 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001963 previous = code;
1964 *code++ = OP_REF;
1965 *code++ = refnum;
1966 }
1967 else
1968 {
Guido van Rossum50700601997-12-08 17:15:20 +00001969 previous = (-c > ESC_b && -c < ESC_X)? code : NULL;
1970 if ( (options & PCRE_LOCALE) != 0)
1971 {
1972 switch (c)
1973 {
1974 case (-ESC_b): c = -OP_WORD_BOUNDARY_L; break;
1975 case (-ESC_B): c = -OP_NOT_WORD_BOUNDARY_L; break;
1976 case (-ESC_w): c = -OP_WORDCHAR_L; break;
1977 case (-ESC_W): c = -OP_NOT_WORDCHAR_L; break;
1978 }
1979 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001980 *code++ = -c;
1981 }
1982 continue;
1983 }
1984
Guido van Rossum58132c61997-12-17 00:24:13 +00001985 /* Data character: Reset and fall through */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001986
1987 ptr = oldptr;
1988 c = '\\';
1989
1990 /* Handle a run of data characters until a metacharacter is encountered.
1991 The first character is guaranteed not to be whitespace or # when the
1992 extended flag is set. */
1993
Guido van Rossum50700601997-12-08 17:15:20 +00001994 NORMAL_CHAR:
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001995 default:
1996 previous = code;
1997 *code = OP_CHARS;
1998 code += 2;
1999 length = 0;
2000
2001 do
2002 {
Guido van Rossum50700601997-12-08 17:15:20 +00002003 if ((options & PCRE_EXTENDED) != 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002004 {
2005 if ((pcre_ctypes[c] & ctype_space) != 0) continue;
2006 if (c == '#')
2007 {
2008 while ((c = *(++ptr)) != 0 && c != '\n');
2009 if (c == 0) break;
2010 continue;
2011 }
2012 }
2013
2014 /* Backslash may introduce a data char or a metacharacter. Escaped items
2015 are checked for validity in the pre-compiling pass. Stop the string
2016 before a metaitem. */
2017
2018 if (c == '\\')
2019 {
2020 oldptr = ptr;
Guido van Rossum50700601997-12-08 17:15:20 +00002021 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002022 if (c < 0) { ptr = oldptr; break; }
2023 }
2024
2025 /* Ordinary character or single-char escape */
2026
2027 *code++ = c;
2028 length++;
2029 }
2030
2031 /* This "while" is the end of the "do" above. */
2032
2033 while (length < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
2034
2035 /* Compute the length and set it in the data vector, and advance to
2036 the next state. */
2037
2038 previous[1] = length;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00002039 if (length < 255) ptr--;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002040 break;
2041 }
2042 } /* end of big loop */
2043
2044/* Control never reaches here by falling through, only by a goto for all the
2045error states. Pass back the position in the pattern so that it can be displayed
2046to the user for diagnosing the error. */
2047
2048FAILED:
2049*ptrptr = ptr;
2050return FALSE;
2051}
2052
2053
2054
2055
2056/*************************************************
2057* Compile sequence of alternatives *
2058*************************************************/
2059
2060/* On entry, ptr is pointing past the bracket character, but on return
2061it points to the closing bracket, or vertical bar, or end of string.
2062The code variable is pointing at the byte into which the BRA operator has been
2063stored.
2064
2065Argument:
Guido van Rossum50700601997-12-08 17:15:20 +00002066 options the option bits
2067 brackets -> int containing the number of extracting brackets used
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002068 codeptr -> the address of the current code pointer
2069 ptrptr -> the address of the current pattern pointer
2070 errorptr -> pointer to error message
2071
2072Returns: TRUE on success
2073*/
2074
2075static BOOL
Guido van Rossum50700601997-12-08 17:15:20 +00002076compile_regex(int options, int *brackets, uschar **codeptr,
Guido van Rossum58132c61997-12-17 00:24:13 +00002077 const uschar **ptrptr, const char **errorptr, PyObject *dictionary)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002078{
Guido van Rossum58132c61997-12-17 00:24:13 +00002079const uschar *ptr = *ptrptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002080uschar *code = *codeptr;
2081uschar *start_bracket = code;
2082
2083for (;;)
2084 {
2085 int length;
2086 uschar *last_branch = code;
2087
2088 code += 3;
Guido van Rossum50700601997-12-08 17:15:20 +00002089 if (!compile_branch(options, brackets, &code, &ptr, errorptr, dictionary))
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002090 {
2091 *ptrptr = ptr;
2092 return FALSE;
2093 }
2094
2095 /* Fill in the length of the last branch */
2096
2097 length = code - last_branch;
2098 last_branch[1] = length >> 8;
2099 last_branch[2] = length & 255;
2100
2101 /* Reached end of expression, either ')' or end of pattern. Insert a
2102 terminating ket and the length of the whole bracketed item, and return,
2103 leaving the pointer at the terminating char. */
2104
2105 if (*ptr != '|')
2106 {
2107 length = code - start_bracket;
2108 *code++ = OP_KET;
2109 *code++ = length >> 8;
2110 *code++ = length & 255;
2111 *codeptr = code;
2112 *ptrptr = ptr;
2113 return TRUE;
2114 }
2115
2116 /* Another branch follows; insert an "or" node and advance the pointer. */
2117
2118 *code = OP_ALT;
2119 ptr++;
2120 }
2121/* Control never reaches here */
2122}
2123
2124
2125
2126/*************************************************
2127* Check for anchored expression *
2128*************************************************/
2129
2130/* Try to find out if this is an anchored regular expression. Consider each
2131alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
2132all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
2133it's anchored. However, if this is a multiline pattern, then only OP_SOD
2134counts, since OP_CIRC can match in the middle.
2135
2136A branch is also implicitly anchored if it starts with .* because that will try
2137the rest of the pattern at all possible matching points, so there is no point
2138trying them again.
2139
2140Argument: points to start of expression (the bracket)
2141Returns: TRUE or FALSE
2142*/
2143
2144static BOOL
Guido van Rossum58132c61997-12-17 00:24:13 +00002145is_anchored(register const uschar *code, BOOL multiline)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002146{
2147do {
2148 int op = (int)code[3];
Guido van Rossum50700601997-12-08 17:15:20 +00002149 if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002150 { if (!is_anchored(code+3, multiline)) return FALSE; }
2151 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
2152 { if (code[4] != OP_ANY) return FALSE; }
2153 else if (op != OP_SOD && (multiline || op != OP_CIRC)) return FALSE;
2154 code += (code[1] << 8) + code[2];
2155 }
2156while (*code == OP_ALT);
2157return TRUE;
2158}
2159
2160
2161
2162/*************************************************
2163* Check for start with \n line expression *
2164*************************************************/
2165
2166/* This is called for multiline expressions to try to find out if every branch
2167starts with ^ so that "first char" processing can be done to speed things up.
2168
2169Argument: points to start of expression (the bracket)
2170Returns: TRUE or FALSE
2171*/
2172
2173static BOOL
Guido van Rossum58132c61997-12-17 00:24:13 +00002174is_startline(const uschar *code)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002175{
2176do {
2177 if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
2178 { if (!is_startline(code+3)) return FALSE; }
2179 else if (code[3] != OP_CIRC) return FALSE;
2180 code += (code[1] << 8) + code[2];
2181 }
2182while (*code == OP_ALT);
2183return TRUE;
2184}
2185
2186
2187
2188/*************************************************
2189* Check for fixed first char *
2190*************************************************/
2191
2192/* Try to find out if there is a fixed first character. This is called for
2193unanchored expressions, as it speeds up their processing quite considerably.
2194Consider each alternative branch. If they all start with the same char, or with
2195a bracket all of whose alternatives start with the same char (recurse ad lib),
2196then we return that char, otherwise -1.
2197
2198Argument: points to start of expression (the bracket)
2199Returns: -1 or the fixed first char
2200*/
2201
2202static int
2203find_firstchar(uschar *code)
2204{
2205register int c = -1;
2206do
2207 {
2208 register int charoffset = 4;
2209
2210 if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
2211 {
2212 register int d;
2213 if ((d = find_firstchar(code+3)) < 0) return -1;
2214 if (c < 0) c = d; else if (c != d) return -1;
2215 }
2216
2217 else switch(code[3])
2218 {
2219 default:
2220 return -1;
2221
2222 case OP_EXACT: /* Fall through */
2223 charoffset++;
2224
2225 case OP_CHARS: /* Fall through */
2226 charoffset++;
2227
2228 case OP_PLUS:
2229 case OP_MINPLUS:
2230 if (c < 0) c = code[charoffset]; else if (c != code[charoffset]) return -1;
2231 break;
2232 }
2233 code += (code[1] << 8) + code[2];
2234 }
2235while (*code == OP_ALT);
2236return c;
2237}
2238
2239
2240
2241/*************************************************
2242* Compile a Regular Expression *
2243*************************************************/
2244
2245/* This function takes a string and returns a pointer to a block of store
2246holding a compiled version of the expression.
2247
2248Arguments:
2249 pattern the regular expression
2250 options various option bits
2251 errorptr pointer to pointer to error text
2252 erroroffset ptr offset in pattern where error was detected
2253
2254Returns: pointer to compiled data block, or NULL on error,
2255 with errorptr and erroroffset set
2256*/
2257
2258pcre *
Guido van Rossum58132c61997-12-17 00:24:13 +00002259pcre_compile(const char *pattern, int options, const char **errorptr,
Guido van Rossum50700601997-12-08 17:15:20 +00002260 int *erroroffset, PyObject *dictionary)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002261{
2262real_pcre *re;
2263int spaces = 0;
2264int length = 3; /* For initial BRA plus length */
2265int runlength;
2266int c, size;
Guido van Rossum50700601997-12-08 17:15:20 +00002267int bracount = 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002268int brastack[200];
Guido van Rossum50700601997-12-08 17:15:20 +00002269int top_backref = 0;
Guido van Rossum58132c61997-12-17 00:24:13 +00002270unsigned int brastackptr = 0;
2271uschar *code;
2272const uschar *ptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002273
2274#ifdef DEBUG
2275uschar *code_base, *code_end;
2276#endif
2277
Guido van Rossum50700601997-12-08 17:15:20 +00002278/* We can't pass back an error message if errorptr is NULL; I guess the best we
2279can do is just return NULL. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002280
Guido van Rossum50700601997-12-08 17:15:20 +00002281if (errorptr == NULL) return NULL;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002282*errorptr = NULL;
Guido van Rossum50700601997-12-08 17:15:20 +00002283
2284/* However, we can give a message for this error */
2285
2286if (erroroffset == NULL)
2287 {
2288 *errorptr = ERR16;
2289 return NULL;
2290 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002291*erroroffset = 0;
2292
2293if ((options & ~PUBLIC_OPTIONS) != 0)
2294 {
Guido van Rossum50700601997-12-08 17:15:20 +00002295 *errorptr = ERR17;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002296 return NULL;
2297 }
2298
Guido van Rossum557dea11997-12-22 22:46:52 +00002299DPRINTF(("------------------------------------------------------------------\n"));
2300DPRINTF(("%s\n", pattern));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002301
2302/* The first thing to do is to make a pass over the pattern to compute the
2303amount of store required to hold the compiled code. This does not have to be
2304perfect as long as errors are overestimates. At the same time we can detect any
2305internal flag settings. Make an attempt to correct for any counted white space
2306if an "extended" flag setting appears late in the pattern. We can't be so
2307clever for #-comments. */
2308
Guido van Rossum58132c61997-12-17 00:24:13 +00002309ptr = (const uschar *)(pattern - 1);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002310while ((c = *(++ptr)) != 0)
2311 {
Guido van Rossum50700601997-12-08 17:15:20 +00002312 int min, max;
2313 int class_charcount;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002314
2315 if ((pcre_ctypes[c] & ctype_space) != 0)
2316 {
Guido van Rossum50700601997-12-08 17:15:20 +00002317 if ((options & PCRE_EXTENDED) != 0) continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002318 spaces++;
2319 }
2320
Guido van Rossum50700601997-12-08 17:15:20 +00002321 if (c == '#' && (options & PCRE_EXTENDED) != 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002322 {
2323 while ((c = *(++ptr)) != 0 && c != '\n');
2324 continue;
2325 }
2326
2327 switch(c)
2328 {
2329 /* A backslashed item may be an escaped "normal" character or a
2330 character type. For a "normal" character, put the pointers and
2331 character back so that tests for whitespace etc. in the input
2332 are done correctly. */
2333
2334 case '\\':
2335 {
Guido van Rossum58132c61997-12-17 00:24:13 +00002336 const uschar *save_ptr = ptr;
Guido van Rossum50700601997-12-08 17:15:20 +00002337 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002338 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2339 if (c >= 0)
2340 {
2341 ptr = save_ptr;
2342 c = '\\';
2343 goto NORMAL_CHAR;
2344 }
2345 }
2346 length++;
2347
2348 /* A back reference needs an additional char, plus either one or 5
Guido van Rossum50700601997-12-08 17:15:20 +00002349 bytes for a repeat. We also need to keep the value of the highest
2350 back reference. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002351
2352 if (c <= -ESC_REF)
2353 {
Guido van Rossum50700601997-12-08 17:15:20 +00002354 int refnum = -c - ESC_REF;
2355 if (refnum > top_backref) top_backref = refnum;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002356 length++; /* For single back reference */
Guido van Rossum50700601997-12-08 17:15:20 +00002357 if (ptr[1] == '{' && is_counted_repeat(ptr+2))
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002358 {
2359 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
2360 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2361 if ((min == 0 && (max == 1 || max == -1)) ||
2362 (min == 1 && max == -1))
2363 length++;
2364 else length += 5;
2365 if (ptr[1] == '?') ptr++;
2366 }
2367 }
2368 continue;
2369
2370 case '^':
2371 case '.':
2372 case '$':
2373 case '*': /* These repeats won't be after brackets; */
2374 case '+': /* those are handled separately */
2375 case '?':
2376 length++;
2377 continue;
2378
2379 /* This covers the cases of repeats after a single char, metachar, class,
2380 or back reference. */
2381
2382 case '{':
Guido van Rossum50700601997-12-08 17:15:20 +00002383 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002384 ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);
2385 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2386 if ((min == 0 && (max == 1 || max == -1)) ||
2387 (min == 1 && max == -1))
2388 length++;
2389 else
2390 {
2391 length--; /* Uncount the original char or metachar */
2392 if (min == 1) length++; else if (min > 0) length += 4;
2393 if (max > 0) length += 4; else length += 2;
2394 }
2395 if (ptr[1] == '?') ptr++;
2396 continue;
2397
2398 /* An alternation contains an offset to the next branch or ket. */
2399 case '|':
2400 length += 3;
2401 continue;
2402
Guido van Rossum50700601997-12-08 17:15:20 +00002403 /* A character class uses 33 characters. Don't worry about character types
2404 that aren't allowed in classes - they'll get picked up during the compile.
2405 A character class that contains only one character uses 2 or 3 bytes,
2406 depending on whether it is negated or not. Notice this where we can. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002407
2408 case '[':
Guido van Rossum50700601997-12-08 17:15:20 +00002409 class_charcount = 0;
2410 if (*(++ptr) == '^') ptr++;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002411 do
2412 {
Guido van Rossum50700601997-12-08 17:15:20 +00002413 if (*ptr == '\\')
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002414 {
Guido van Rossum557dea11997-12-22 22:46:52 +00002415 int ch = check_escape(&ptr, errorptr, bracount, options, TRUE);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002416 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
Guido van Rossum557dea11997-12-22 22:46:52 +00002417 if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002418 }
Guido van Rossum50700601997-12-08 17:15:20 +00002419 else class_charcount++;
2420 ptr++;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002421 }
2422 while (*ptr != 0 && *ptr != ']');
2423
Guido van Rossum50700601997-12-08 17:15:20 +00002424 /* Repeats for negated single chars are handled by the general code */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002425
Guido van Rossum50700601997-12-08 17:15:20 +00002426 if (class_charcount == 1) length += 3; else
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002427 {
Guido van Rossum50700601997-12-08 17:15:20 +00002428 length += 33;
2429 if (options & PCRE_LOCALE) length++; /* Add a byte for the localization flag */
2430
2431 /* A repeat needs either 1 or 5 bytes. */
2432
Guido van Rossum557dea11997-12-22 22:46:52 +00002433 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))
Guido van Rossum50700601997-12-08 17:15:20 +00002434 {
2435 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
2436 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2437 if ((min == 0 && (max == 1 || max == -1)) ||
2438 (min == 1 && max == -1))
2439 length++;
2440 else length += 5;
2441 if (ptr[1] == '?') ptr++;
2442 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002443 }
2444 continue;
2445
2446 /* Brackets may be genuine groups or special things */
2447
2448 case '(':
2449
2450 /* Handle special forms of bracket, which all start (? */
2451
2452 if (ptr[1] == '?') switch (c = ptr[2])
2453 {
2454 /* Skip over comments entirely */
2455 case '#':
2456 ptr += 3;
2457 while (*ptr != 0 && *ptr != ')') ptr++;
2458 if (*ptr == 0)
2459 {
Guido van Rossum50700601997-12-08 17:15:20 +00002460 *errorptr = ERR18;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002461 goto PCRE_ERROR_RETURN;
2462 }
2463 continue;
2464
2465 /* Non-referencing groups and lookaheads just move the pointer on, and
Guido van Rossum50700601997-12-08 17:15:20 +00002466 then behave like a non-special bracket, except that they don't increment
2467 the count of extracting brackets. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002468
2469 case ':':
2470 case '=':
2471 case '!':
2472 ptr += 2;
2473 break;
2474
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002475 case ('P'):
2476 {
2477 int idlen;
2478 switch (*ptr++) {
2479 case ('<'):
2480 idlen = get_group_id(ptr++, '>', errorptr);
2481 if (*errorptr) goto PCRE_ERROR_RETURN;
2482 ptr += idlen+1;
2483 break;
2484 case ('='):
2485 idlen = get_group_id(ptr++, ')', errorptr);
2486 if (*errorptr) goto PCRE_ERROR_RETURN;
2487 ptr += idlen+1;
2488 length++;
2489 break;
2490 }
2491 }
2492 break;
2493
Guido van Rossum50700601997-12-08 17:15:20 +00002494 /* Ditto for the "once only" bracket, allowed only if the extra bit
2495 is set. */
2496
2497 case '>':
2498 if ((options & PCRE_EXTRA) != 0)
2499 {
2500 ptr += 2;
2501 break;
2502 }
Guido van Rossumdda66961998-05-07 15:32:44 +00002503 /* Else fall through */
Guido van Rossum50700601997-12-08 17:15:20 +00002504
2505 /* Else loop setting valid options until ) is met. Anything else is an
2506 error. */
2507
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002508 default:
2509 ptr += 2;
2510 for (;; ptr++)
2511 {
2512 if ((c = *ptr) == 'i')
2513 {
2514 options |= PCRE_CASELESS;
2515 continue;
2516 }
Guido van Rossumbd49ac41997-12-10 23:05:53 +00002517 else if ((c = *ptr) == 'L')
Guido van Rossum50700601997-12-08 17:15:20 +00002518 {
2519 options |= PCRE_LOCALE;
2520 continue;
2521 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002522 else if ((c = *ptr) == 'm')
2523 {
2524 options |= PCRE_MULTILINE;
2525 continue;
2526 }
Guido van Rossum50700601997-12-08 17:15:20 +00002527 else if (c == 's')
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002528 {
2529 options |= PCRE_DOTALL;
2530 continue;
2531 }
2532 else if (c == 'x')
2533 {
2534 options |= PCRE_EXTENDED;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002535 length -= spaces; /* Already counted spaces */
2536 continue;
2537 }
2538 else if (c == ')') break;
2539
Guido van Rossum50700601997-12-08 17:15:20 +00002540 *errorptr = ERR12;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002541 goto PCRE_ERROR_RETURN;
2542 }
2543 continue; /* End of this bracket handling */
2544 }
2545
Guido van Rossum50700601997-12-08 17:15:20 +00002546 /* Extracting brackets must be counted so we can process escapes in a
2547 Perlish way. */
2548
2549 else bracount++;
2550
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002551 /* Non-special forms of bracket. Save length for computing whole length
2552 at end if there's a repeat that requires duplication of the group. */
2553
2554 if (brastackptr >= sizeof(brastack)/sizeof(int))
2555 {
Guido van Rossum50700601997-12-08 17:15:20 +00002556 *errorptr = ERR19;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002557 goto PCRE_ERROR_RETURN;
2558 }
2559
2560 brastack[brastackptr++] = length;
2561 length += 3;
2562 continue;
2563
2564 /* Handle ket. Look for subsequent max/min; for certain sets of values we
Guido van Rossum557dea11997-12-22 22:46:52 +00002565 have to replicate this bracket up to that many times. If brastackptr is
2566 0 this is an unmatched bracket which will generate an error, but take care
2567 not to try to access brastack[-1]. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002568
2569 case ')':
2570 length += 3;
2571 {
Guido van Rossum557dea11997-12-22 22:46:52 +00002572 int minval = 1;
2573 int maxval = 1;
2574 int duplength = (brastackptr > 0)? length - brastack[--brastackptr] : 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002575
2576 /* Leave ptr at the final char; for read_repeat_counts this happens
2577 automatically; for the others we need an increment. */
2578
Guido van Rossum50700601997-12-08 17:15:20 +00002579 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002580 {
Guido van Rossum557dea11997-12-22 22:46:52 +00002581 ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002582 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2583 }
Guido van Rossum557dea11997-12-22 22:46:52 +00002584 else if (c == '*') { minval = 0; maxval = -1; ptr++; }
2585 else if (c == '+') { maxval = -1; ptr++; }
2586 else if (c == '?') { minval = 0; ptr++; }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002587
Guido van Rossum557dea11997-12-22 22:46:52 +00002588 /* If there is a minimum > 1 we have to replicate up to minval-1 times;
2589 if there is a limited maximum we have to replicate up to maxval-1 times
2590 and allow for a BRAZERO item before each optional copy, as we also have
2591 to do before the first copy if the minimum is zero. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002592
Guido van Rossum557dea11997-12-22 22:46:52 +00002593 if (minval == 0) length++;
2594 else if (minval > 1) length += (minval - 1) * duplength;
2595 if (maxval > minval) length += (maxval - minval) * (duplength + 1);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002596 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002597 continue;
2598
2599 /* Non-special character. For a run of such characters the length required
2600 is the number of characters + 2, except that the maximum run length is 255.
2601 We won't get a skipped space or a non-data escape or the start of a #
2602 comment as the first character, so the length can't be zero. */
2603
2604 NORMAL_CHAR:
2605 default:
2606 length += 2;
2607 runlength = 0;
2608 do
2609 {
2610 if ((pcre_ctypes[c] & ctype_space) != 0)
2611 {
Guido van Rossum50700601997-12-08 17:15:20 +00002612 if ((options & PCRE_EXTENDED) != 0) continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002613 spaces++;
2614 }
2615
Guido van Rossum50700601997-12-08 17:15:20 +00002616 if (c == '#' && (options & PCRE_EXTENDED) != 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002617 {
2618 while ((c = *(++ptr)) != 0 && c != '\n');
2619 continue;
2620 }
2621
2622 /* Backslash may introduce a data char or a metacharacter; stop the
2623 string before the latter. */
2624
2625 if (c == '\\')
2626 {
Guido van Rossum58132c61997-12-17 00:24:13 +00002627 const uschar *saveptr = ptr;
Guido van Rossum50700601997-12-08 17:15:20 +00002628 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002629 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2630 if (c < 0) { ptr = saveptr; break; }
2631 }
2632
2633 /* Ordinary character or single-char escape */
2634
2635 runlength++;
2636 }
2637
2638 /* This "while" is the end of the "do" above. */
2639
2640 while (runlength < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
2641
2642 ptr--;
2643 length += runlength;
2644 continue;
2645 }
2646 }
2647
2648length += 4; /* For final KET and END */
2649
2650if (length > 65539)
2651 {
Guido van Rossum50700601997-12-08 17:15:20 +00002652 *errorptr = ERR20;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002653 return NULL;
2654 }
2655
2656/* Compute the size of data block needed and get it, either from malloc or
Guido van Rossum557dea11997-12-22 22:46:52 +00002657externally provided function. We specify "code[0]" in the offsetof() expression
2658rather than just "code", because it has been reported that one broken compiler
2659fails on "code" because it is also an independent variable. It should make no
2660difference to the value of the offsetof(). */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002661
Guido van Rossum557dea11997-12-22 22:46:52 +00002662size = length + offsetof(real_pcre, code[0]);
Guido van Rossum50700601997-12-08 17:15:20 +00002663re = (real_pcre *)(pcre_malloc)(size+50);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002664
2665if (re == NULL)
2666 {
Guido van Rossum50700601997-12-08 17:15:20 +00002667 *errorptr = ERR21;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002668 return NULL;
2669 }
2670
Guido van Rossum557dea11997-12-22 22:46:52 +00002671/* Put in the magic number and the options. */
2672
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002673re->magic_number = MAGIC_NUMBER;
2674re->options = options;
2675
2676/* Set up a starting, non-extracting bracket, then compile the expression. On
2677error, *errorptr will be set non-NULL, so we don't need to look at the result
2678of the function here. */
2679
Guido van Rossum58132c61997-12-17 00:24:13 +00002680ptr = (const uschar *)pattern;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002681code = re->code;
2682*code = OP_BRA;
Guido van Rossum50700601997-12-08 17:15:20 +00002683bracount = 0;
2684(void)compile_regex(options, &bracount, &code, &ptr, errorptr, dictionary);
2685re->top_bracket = bracount;
2686re->top_backref = top_backref;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002687
2688/* If not reached end of pattern on success, there's an excess bracket. */
2689
Guido van Rossum50700601997-12-08 17:15:20 +00002690if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
2691
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002692/* Fill in the terminating state and check for disastrous overflow, but
2693if debugging, leave the test till after things are printed out. */
2694
2695*code++ = OP_END;
2696
Guido van Rossum50700601997-12-08 17:15:20 +00002697
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002698#ifndef DEBUG
Guido van Rossum50700601997-12-08 17:15:20 +00002699if (code - re->code > length) *errorptr = ERR23;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002700#endif
2701
2702/* Failed to compile */
2703
2704if (*errorptr != NULL)
2705 {
2706 (pcre_free)(re);
2707 PCRE_ERROR_RETURN:
Guido van Rossum58132c61997-12-17 00:24:13 +00002708 *erroroffset = ptr - (const uschar *)pattern;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002709 return NULL;
2710 }
2711
2712/* If the anchored option was not passed, set flag if we can determine that it
2713is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if
2714we can determine what the first character has to be, because that speeds up
2715unanchored matches no end. In the case of multiline matches, an alternative is
2716to set the PCRE_STARTLINE flag if all branches start with ^. */
2717
2718if ((options & PCRE_ANCHORED) == 0)
2719 {
2720 if (is_anchored(re->code, (options & PCRE_MULTILINE) != 0))
2721 re->options |= PCRE_ANCHORED;
2722 else
2723 {
Guido van Rossum557dea11997-12-22 22:46:52 +00002724 int ch = find_firstchar(re->code);
2725 if (ch >= 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002726 {
Guido van Rossum557dea11997-12-22 22:46:52 +00002727 re->first_char = ch;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002728 re->options |= PCRE_FIRSTSET;
2729 }
2730 else if (is_startline(re->code))
2731 re->options |= PCRE_STARTLINE;
2732 }
2733 }
2734
2735/* Print out the compiled data for debugging */
2736
2737#ifdef DEBUG
2738
Guido van Rossum50700601997-12-08 17:15:20 +00002739printf("Length = %d top_bracket = %d top_backref=%d\n",
2740 length, re->top_bracket, re->top_backref);
2741
2742if (re->options != 0)
2743 {
Guido van Rossumdda66961998-05-07 15:32:44 +00002744 printf("%s%s%s%s%s%s%s%s\n",
Guido van Rossum50700601997-12-08 17:15:20 +00002745 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
2746 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
2747 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
2748 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
2749 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
2750 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
Guido van Rossumdda66961998-05-07 15:32:44 +00002751 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
2752 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
Guido van Rossum50700601997-12-08 17:15:20 +00002753 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002754
2755if ((re->options & PCRE_FIRSTSET) != 0)
2756 {
2757 if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
2758 else printf("First char = \\x%02x\n", re->first_char);
2759 }
2760
2761code_end = code;
2762code_base = code = re->code;
2763
2764while (code < code_end)
2765 {
2766 int charlength;
2767
2768 printf("%3d ", code - code_base);
2769
2770 if (*code >= OP_BRA)
2771 {
2772 printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
2773 code += 2;
2774 }
2775
2776 else switch(*code)
2777 {
2778 case OP_CHARS:
2779 charlength = *(++code);
2780 printf("%3d ", charlength);
2781 while (charlength-- > 0)
2782 if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
2783 break;
2784
2785 case OP_KETRMAX:
2786 case OP_KETRMIN:
2787 case OP_ALT:
2788 case OP_KET:
2789 case OP_ASSERT:
2790 case OP_ASSERT_NOT:
Guido van Rossum50700601997-12-08 17:15:20 +00002791 case OP_ONCE:
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002792 printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2793 code += 2;
2794 break;
2795
2796 case OP_STAR:
2797 case OP_MINSTAR:
2798 case OP_PLUS:
2799 case OP_MINPLUS:
2800 case OP_QUERY:
2801 case OP_MINQUERY:
2802 case OP_TYPESTAR:
2803 case OP_TYPEMINSTAR:
2804 case OP_TYPEPLUS:
2805 case OP_TYPEMINPLUS:
2806 case OP_TYPEQUERY:
2807 case OP_TYPEMINQUERY:
2808 if (*code >= OP_TYPESTAR)
2809 printf(" %s", OP_names[code[1]]);
2810 else if (isprint(c = code[1])) printf(" %c", c);
2811 else printf(" \\x%02x", c);
2812 printf("%s", OP_names[*code++]);
2813 break;
2814
2815 case OP_EXACT:
2816 case OP_UPTO:
2817 case OP_MINUPTO:
2818 if (isprint(c = code[3])) printf(" %c{", c);
2819 else printf(" \\x%02x{", c);
Guido van Rossum557dea11997-12-22 22:46:52 +00002820 if (*code != OP_EXACT) printf("0,");
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002821 printf("%d}", (code[1] << 8) + code[2]);
2822 if (*code == OP_MINUPTO) printf("?");
2823 code += 3;
2824 break;
2825
2826 case OP_TYPEEXACT:
2827 case OP_TYPEUPTO:
2828 case OP_TYPEMINUPTO:
2829 printf(" %s{", OP_names[code[3]]);
2830 if (*code != OP_TYPEEXACT) printf(",");
2831 printf("%d}", (code[1] << 8) + code[2]);
2832 if (*code == OP_TYPEMINUPTO) printf("?");
2833 code += 3;
2834 break;
2835
Guido van Rossum50700601997-12-08 17:15:20 +00002836 case OP_NOT:
2837 if (isprint(c = *(++code))) printf(" [^%c]", c);
2838 else printf(" [^\\x%02x]", c);
2839 break;
2840
2841 case OP_NOTSTAR:
2842 case OP_NOTMINSTAR:
2843 case OP_NOTPLUS:
2844 case OP_NOTMINPLUS:
2845 case OP_NOTQUERY:
2846 case OP_NOTMINQUERY:
2847 if (isprint(c = code[1])) printf(" [^%c]", c);
2848 else printf(" [^\\x%02x]", c);
2849 printf("%s", OP_names[*code++]);
2850 break;
2851
2852 case OP_NOTEXACT:
2853 case OP_NOTUPTO:
2854 case OP_NOTMINUPTO:
2855 if (isprint(c = code[3])) printf(" [^%c]{", c);
2856 else printf(" [^\\x%02x]{", c);
2857 if (*code != OP_NOTEXACT) printf(",");
2858 printf("%d}", (code[1] << 8) + code[2]);
2859 if (*code == OP_NOTMINUPTO) printf("?");
2860 code += 3;
2861 break;
2862
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002863 case OP_REF:
2864 printf(" \\%d", *(++code));
Guido van Rossum557dea11997-12-22 22:46:52 +00002865 code ++;
2866 goto CLASS_REF_REPEAT;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002867
2868 case OP_CLASS:
Guido van Rossum042ff9e1998-04-03 21:13:31 +00002869 case OP_NEGCLASS:
Guido van Rossum50700601997-12-08 17:15:20 +00002870 case OP_CLASS_L:
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002871 {
2872 int i, min, max;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002873
Guido van Rossum50700601997-12-08 17:15:20 +00002874 if (*code==OP_CLASS_L)
2875 {
2876 code++;
2877 printf("Locflag = %i ", *code++);
Guido van Rossum042ff9e1998-04-03 21:13:31 +00002878 printf(" [");
Guido van Rossum50700601997-12-08 17:15:20 +00002879 }
2880 else
Guido van Rossum042ff9e1998-04-03 21:13:31 +00002881 {
2882 if (*code++ == OP_CLASS) printf(" [");
2883 else printf(" ^[");
2884 }
2885
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002886
Guido van Rossum50700601997-12-08 17:15:20 +00002887 for (i = 0; i < 256; i++)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002888 {
Guido van Rossum50700601997-12-08 17:15:20 +00002889 if ((code[i/8] & (1 << (i&7))) != 0)
2890 {
2891 int j;
2892 for (j = i+1; j < 256; j++)
2893 if ((code[j/8] & (1 << (j&7))) == 0) break;
2894 if (i == '-' || i == ']') printf("\\");
2895 if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
2896 if (--j > i)
2897 {
2898 printf("-");
2899 if (j == '-' || j == ']') printf("\\");
2900 if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
2901 }
2902 i = j;
2903 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002904 }
2905 printf("]");
Guido van Rossum50700601997-12-08 17:15:20 +00002906 code += 32;
2907 /* code ++;*/
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002908
Guido van Rossum557dea11997-12-22 22:46:52 +00002909 CLASS_REF_REPEAT:
2910
Guido van Rossum50700601997-12-08 17:15:20 +00002911 switch(*code)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002912 {
2913 case OP_CRSTAR:
2914 case OP_CRMINSTAR:
2915 case OP_CRPLUS:
2916 case OP_CRMINPLUS:
2917 case OP_CRQUERY:
2918 case OP_CRMINQUERY:
2919 printf("%s", OP_names[*code]);
2920 break;
2921
2922 case OP_CRRANGE:
2923 case OP_CRMINRANGE:
2924 min = (code[1] << 8) + code[2];
2925 max = (code[3] << 8) + code[4];
2926 if (max == 0) printf("{%d,}", min);
2927 else printf("{%d,%d}", min, max);
2928 if (*code == OP_CRMINRANGE) printf("?");
2929 code += 4;
2930 break;
2931
2932 default:
2933 code--;
2934 }
2935 }
2936 break;
2937
2938 /* Anything else is just a one-node item */
2939
2940 default:
2941 printf(" %s", OP_names[*code]);
2942 break;
2943 }
2944
2945 code++;
2946 printf("\n");
2947 }
2948printf("------------------------------------------------------------------\n");
2949
2950/* This check is done here in the debugging case so that the code that
2951was compiled can be seen. */
2952
2953if (code - re->code > length)
2954 {
Guido van Rossum50700601997-12-08 17:15:20 +00002955 printf("length=%i, code length=%i\n", length, code-re->code);
2956 *errorptr = ERR23;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002957 (pcre_free)(re);
2958 *erroroffset = ptr - (uschar *)pattern;
2959 return NULL;
2960 }
2961#endif
2962
2963return (pcre *)re;
2964}
2965
2966
2967
2968/*************************************************
2969* Match a character type *
2970*************************************************/
2971
2972/* Not used in all the places it might be as it's sometimes faster
2973to put the code inline.
2974
2975Arguments:
2976 type the character type
2977 c the character
Guido van Rossum50700601997-12-08 17:15:20 +00002978 dotall the dotall flag
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002979
2980Returns: TRUE if character is of the type
2981*/
2982
2983static BOOL
2984match_type(int type, int c, BOOL dotall)
2985{
2986
2987#ifdef DEBUG
2988if (isprint(c)) printf("matching subject %c against ", c);
2989 else printf("matching subject \\x%02x against ", c);
2990printf("%s\n", OP_names[type]);
2991#endif
2992
2993switch(type)
2994 {
2995 case OP_ANY: return dotall || c != '\n';
2996 case OP_NOT_DIGIT: return (pcre_ctypes[c] & ctype_digit) == 0;
2997 case OP_DIGIT: return (pcre_ctypes[c] & ctype_digit) != 0;
2998 case OP_NOT_WHITESPACE: return (pcre_ctypes[c] & ctype_space) == 0;
2999 case OP_WHITESPACE: return (pcre_ctypes[c] & ctype_space) != 0;
3000 case OP_NOT_WORDCHAR: return (pcre_ctypes[c] & ctype_word) == 0;
3001 case OP_WORDCHAR: return (pcre_ctypes[c] & ctype_word) != 0;
Guido van Rossum58132c61997-12-17 00:24:13 +00003002 case OP_NOT_WORDCHAR_L: return (c!='_' && !isalnum(c));
3003 case OP_WORDCHAR_L: return (c=='_' || isalnum(c));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003004 }
3005return FALSE;
3006}
3007
3008
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003009
3010/*************************************************
3011* Match a back-reference *
3012*************************************************/
3013
3014/* If a back reference hasn't been set, the match fails.
3015
3016Arguments:
3017 number reference number
3018 eptr points into the subject
3019 length length to be matched
3020 md points to match data block
3021
3022Returns: TRUE if matched
3023*/
3024
3025static BOOL
Guido van Rossum58132c61997-12-17 00:24:13 +00003026match_ref(int number, register const uschar *eptr, int length, match_data *md)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003027{
Guido van Rossum58132c61997-12-17 00:24:13 +00003028const uschar *p = md->start_subject + md->offset_vector[number];
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003029
3030#ifdef DEBUG
3031if (eptr >= md->end_subject)
3032 printf("matching subject <null>");
3033else
3034 {
3035 printf("matching subject ");
3036 pchars(eptr, length, TRUE, md);
3037 }
3038printf(" against backref ");
3039pchars(p, length, FALSE, md);
3040printf("\n");
3041#endif
3042
3043/* Always fail if not enough characters left */
3044
3045if (length > md->end_subject - p) return FALSE;
3046
Guido van Rossum58132c61997-12-17 00:24:13 +00003047/* Separate the caseless case for speed */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003048
3049if (md->caseless)
3050 { while (length-- > 0) if (pcre_lcc[*p++] != pcre_lcc[*eptr++]) return FALSE; }
3051else
3052 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
3053
3054return TRUE;
3055}
3056
3057static int free_stack(match_data *md)
3058{
3059/* Free any stack space that was allocated by the call to match(). */
Andrew M. Kuchlingfc9d2252000-02-18 19:16:45 +00003060if (md->off_num) PyMem_DEL(md->off_num);
3061if (md->offset_top) PyMem_DEL(md->offset_top);
3062if (md->r1) PyMem_DEL(md->r1);
3063if (md->r2) PyMem_DEL(md->r2);
3064if (md->eptr) PyMem_DEL((char *)md->eptr);
3065if (md->ecode) PyMem_DEL((char *)md->ecode);
Guido van Rossumc3861071997-10-08 02:07:40 +00003066return 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003067}
3068
3069static int grow_stack(match_data *md)
3070{
Guido van Rossum50700601997-12-08 17:15:20 +00003071 if (md->length != 0)
3072 {
3073 md->length = md->length + md->length/2;
3074 }
3075 else
3076 {
3077 int string_len = md->end_subject - md->start_subject + 1;
3078 if (string_len < 80) {md->length = string_len; }
3079 else {md->length = 80;}
3080 }
3081 PyMem_RESIZE(md->offset_top, int, md->length);
Tim Petersaf3e8de2002-04-12 07:22:56 +00003082 /* Can't realloc a pointer-to-const; cast const away. */
3083 md->eptr = (const uschar **)PyMem_Realloc((void *)md->eptr,
3084 sizeof(uschar *) * md->length);
3085 md->ecode = (const uschar **)PyMem_Realloc((void *)md->ecode,
3086 sizeof(uschar *) * md->length);
Guido van Rossum50700601997-12-08 17:15:20 +00003087 PyMem_RESIZE(md->off_num, int, md->length);
3088 PyMem_RESIZE(md->r1, int, md->length);
3089 PyMem_RESIZE(md->r2, int, md->length);
3090 if (md->offset_top == NULL || md->eptr == NULL || md->ecode == NULL ||
3091 md->off_num == NULL || md->r1 == NULL || md->r2 == NULL)
3092 {
Guido van Rossumdda66961998-05-07 15:32:44 +00003093 PyErr_NoMemory();
Guido van Rossum50700601997-12-08 17:15:20 +00003094 longjmp(md->error_env, 1);
3095 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003096 return 0;
3097}
3098
Guido van Rossum50700601997-12-08 17:15:20 +00003099
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003100/*************************************************
3101* Match from current position *
3102*************************************************/
3103
3104/* On entry ecode points to the first opcode, and eptr to the first character.
3105
3106Arguments:
3107 eptr pointer in subject
3108 ecode position in code
3109 offset_top current top pointer
3110 md pointer to "static" info for the match
3111
3112Returns: TRUE if matched
3113*/
3114
3115static BOOL
Guido van Rossum58132c61997-12-17 00:24:13 +00003116match(register const uschar *eptr, register const uschar *ecode, int offset_top,
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003117 match_data *md)
3118{
3119 int save_stack_position = md->point;
3120match_loop:
3121
3122#define SUCCEED goto succeed
3123#define FAIL goto fail
3124
3125for (;;)
3126 {
3127 int min, max, ctype;
3128 register int i;
3129 register int c;
Guido van Rossum58132c61997-12-17 00:24:13 +00003130 BOOL minimize = FALSE;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003131
3132 /* Opening bracket. Check the alternative branches in turn, failing if none
3133 match. We have to set the start offset if required and there is space
3134 in the offset vector so that it is available for subsequent back references
3135 if the bracket matches. However, if the bracket fails, we must put back the
3136 previous value of both offsets in case they were set by a previous copy of
3137 the same bracket. Don't worry about setting the flag for the error case here;
3138 that is handled in the code for KET. */
3139
3140 if ((int)*ecode >= OP_BRA)
3141 {
3142 int number = (*ecode - OP_BRA) << 1;
Guido van Rossum58132c61997-12-17 00:24:13 +00003143 int save_offset1 = 0, save_offset2 = 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003144
Guido van Rossum557dea11997-12-22 22:46:52 +00003145 DPRINTF(("start bracket %d\n", number/2));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003146
3147 if (number > 0 && number < md->offset_end)
3148 {
3149 save_offset1 = md->offset_vector[number];
3150 save_offset2 = md->offset_vector[number+1];
3151 md->offset_vector[number] = eptr - md->start_subject;
3152
Guido van Rossum557dea11997-12-22 22:46:52 +00003153 DPRINTF(("saving %d %d\n", save_offset1, save_offset2));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003154 }
3155
3156 /* Recurse for all the alternatives. */
3157
3158 do
3159 {
3160 if (match(eptr, ecode+3, offset_top, md)) SUCCEED;
3161 ecode += (ecode[1] << 8) + ecode[2];
3162 }
3163 while (*ecode == OP_ALT);
3164
Guido van Rossum557dea11997-12-22 22:46:52 +00003165 DPRINTF(("bracket %d failed\n", number/2));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003166
3167 if (number > 0 && number < md->offset_end)
3168 {
3169 md->offset_vector[number] = save_offset1;
3170 md->offset_vector[number+1] = save_offset2;
3171 }
3172
3173 FAIL;
3174 }
3175
3176 /* Other types of node can be handled by a switch */
3177
3178 switch(*ecode)
3179 {
3180 case OP_END:
3181 md->end_match_ptr = eptr; /* Record where we ended */
3182 md->end_offset_top = offset_top; /* and how many extracts were taken */
3183 SUCCEED;
3184
Guido van Rossum50700601997-12-08 17:15:20 +00003185 /* The equivalent of Prolog's "cut" - if the rest doesn't match, the
3186 whole thing doesn't match, so we have to get out via a longjmp(). */
3187
3188 case OP_CUT:
3189 if (match(eptr, ecode+1, offset_top, md)) SUCCEED;
3190 longjmp(md->fail_env, 1);
3191
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003192 /* Assertion brackets. Check the alternative branches in turn - the
3193 matching won't pass the KET for an assertion. If any one branch matches,
3194 the assertion is true. */
3195
3196 case OP_ASSERT:
3197 do
3198 {
3199 if (match(eptr, ecode+3, offset_top, md)) break;
3200 ecode += (ecode[1] << 8) + ecode[2];
3201 }
3202 while (*ecode == OP_ALT);
3203 if (*ecode == OP_KET) FAIL;
3204
3205 /* Continue from after the assertion, updating the offsets high water
3206 mark, since extracts may have been taken during the assertion. */
3207
3208 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3209 ecode += 3;
3210 offset_top = md->end_offset_top;
3211 continue;
3212
3213 /* Negative assertion: all branches must fail to match */
3214
3215 case OP_ASSERT_NOT:
3216 do
3217 {
3218 if (match(eptr, ecode+3, offset_top, md)) FAIL;
3219 ecode += (ecode[1] << 8) + ecode[2];
3220 }
3221 while (*ecode == OP_ALT);
3222 ecode += 3;
3223 continue;
3224
Guido van Rossum50700601997-12-08 17:15:20 +00003225 /* "Once" brackets are like assertion brackets except that after a match,
3226 the point in the subject string is not moved back. Thus there can never be
3227 a move back into the brackets. Check the alternative branches in turn - the
3228 matching won't pass the KET for this kind of subpattern. If any one branch
3229 matches, we carry on, leaving the subject pointer. */
3230
3231 case OP_ONCE:
3232 do
3233 {
3234 if (match(eptr, ecode+3, offset_top, md)) break;
3235 ecode += (ecode[1] << 8) + ecode[2];
3236 }
3237 while (*ecode == OP_ALT);
Guido van Rossum557dea11997-12-22 22:46:52 +00003238 if (*ecode == OP_KET) FAIL;
Guido van Rossum50700601997-12-08 17:15:20 +00003239
3240 /* Continue as from after the assertion, updating the offsets high water
3241 mark, since extracts may have been taken. */
3242
3243 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3244 ecode += 3;
3245 offset_top = md->end_offset_top;
3246 eptr = md->end_match_ptr;
3247 continue;
3248
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003249 /* An alternation is the end of a branch; scan along to find the end of the
3250 bracketed group and go to there. */
3251
3252 case OP_ALT:
3253 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3254 break;
3255
3256 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
3257 that it may occur zero times. It may repeat infinitely, or not at all -
3258 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
3259 repeat limits are compiled as a number of copies, with the optional ones
3260 preceded by BRAZERO or BRAMINZERO. */
3261
3262 case OP_BRAZERO:
3263 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003264 const uschar *next = ecode+1;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003265 if (match(eptr, next, offset_top, md)) SUCCEED;
3266 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3267 ecode = next + 3;
3268 }
3269 break;
3270
3271 case OP_BRAMINZERO:
3272 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003273 const uschar *next = ecode+1;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003274 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3275 if (match(eptr, next+3, offset_top, md)) SUCCEED;
3276 ecode++;
3277 }
3278 break;;
3279
3280 /* End of a group, repeated or non-repeating. If we are at the end of
3281 an assertion "group", stop matching and SUCCEED, but record the
3282 current high water mark for use by positive assertions. */
3283
3284 case OP_KET:
3285 case OP_KETRMIN:
3286 case OP_KETRMAX:
3287 {
Guido van Rossum50700601997-12-08 17:15:20 +00003288 int number;
Guido van Rossum58132c61997-12-17 00:24:13 +00003289 const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003290
Guido van Rossum50700601997-12-08 17:15:20 +00003291 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT || *prev == OP_ONCE)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003292 {
Guido van Rossum50700601997-12-08 17:15:20 +00003293 md->end_match_ptr = eptr; /* For ONCE */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003294 md->end_offset_top = offset_top;
3295 SUCCEED;
3296 }
3297
3298 /* In all other cases we have to check the group number back at the
3299 start and if necessary complete handling an extraction by setting the
3300 final offset and bumping the high water mark. */
3301
3302 number = (*prev - OP_BRA) << 1;
3303
Guido van Rossum557dea11997-12-22 22:46:52 +00003304 DPRINTF(("end bracket %d\n", number/2));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003305
3306 if (number > 0)
3307 {
3308 if (number >= md->offset_end) md->offset_overflow = TRUE; else
3309 {
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003310 md->offset_vector[number+1] = eptr - md->start_subject;
3311 if (offset_top <= number) offset_top = number + 2;
3312 }
3313 }
3314
3315 /* For a non-repeating ket, just advance to the next node and continue at
3316 this level. */
3317
3318 if (*ecode == OP_KET)
3319 {
3320 ecode += 3;
3321 break;
3322 }
3323
3324 /* The repeating kets try the rest of the pattern or restart from the
3325 preceding bracket, in the appropriate order. */
3326
3327 if (*ecode == OP_KETRMIN)
3328 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003329 const uschar *ptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003330 if (match(eptr, ecode+3, offset_top, md)) goto succeed;
3331 /* Handle alternation inside the BRA...KET; push the additional
Guido van Rossum58132c61997-12-17 00:24:13 +00003332 alternatives onto the stack */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003333 ptr=prev;
3334 do {
3335 ptr += (ptr[1]<<8)+ ptr[2];
3336 if (*ptr==OP_ALT)
3337 {
Guido van Rossum50700601997-12-08 17:15:20 +00003338 if (md->length == md->point)
3339 {
3340 grow_stack(md);
3341 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003342 md->offset_top[md->point] = offset_top;
3343 md->eptr[md->point] = eptr;
3344 md->ecode[md->point] = ptr+3;
3345 md->r1[md->point] = 0;
3346 md->r2[md->point] = 0;
3347 md->off_num[md->point] = 0;
3348 md->point++;
3349 }
3350 } while (*ptr==OP_ALT);
3351 ecode=prev+3; goto match_loop;
3352 }
3353 else /* OP_KETRMAX */
3354 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003355 const uschar *ptr;
3356 /*int points_pushed=0;*/
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003357
3358 /* Push one failure point, that will resume matching at the code after
3359 the KETRMAX opcode. */
Guido van Rossum50700601997-12-08 17:15:20 +00003360 if (md->length == md->point)
3361 {
3362 grow_stack(md);
3363 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003364 md->offset_top[md->point] = offset_top;
3365 md->eptr[md->point] = eptr;
3366 md->ecode[md->point] = ecode+3;
3367 md->r1[md->point] = md->offset_vector[number];
3368 md->r2[md->point] = md->offset_vector[number+1];
3369 md->off_num[md->point] = number;
3370 md->point++;
3371
3372 md->offset_vector[number] = eptr - md->start_subject;
3373 /* Handle alternation inside the BRA...KET; push each of the
Guido van Rossum58132c61997-12-17 00:24:13 +00003374 additional alternatives onto the stack */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003375 ptr=prev;
3376 do {
3377 ptr += (ptr[1]<<8)+ ptr[2];
3378 if (*ptr==OP_ALT)
3379 {
Guido van Rossum50700601997-12-08 17:15:20 +00003380 if (md->length == md->point)
3381 if (md->length == md->point)
3382 {
3383 grow_stack(md);
3384 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003385 md->offset_top[md->point] = offset_top;
3386 md->eptr[md->point] = eptr;
3387 md->ecode[md->point] = ptr+3;
3388 md->r1[md->point] = 0;
3389 md->r2[md->point] = 0;
3390 md->off_num[md->point] = 0;
3391 md->point++;
Guido van Rossum58132c61997-12-17 00:24:13 +00003392 /*points_pushed++;*/
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003393 }
3394 } while (*ptr==OP_ALT);
3395 /* Jump to the first (or only) alternative and resume trying to match */
3396 ecode=prev+3; goto match_loop;
3397 }
3398 }
Guido van Rossum58132c61997-12-17 00:24:13 +00003399
Guido van Rossum50700601997-12-08 17:15:20 +00003400 /* Start of subject unless notbol, or after internal newline if multiline */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003401
3402 case OP_CIRC:
Guido van Rossum50700601997-12-08 17:15:20 +00003403 if (md->notbol && eptr == md->start_subject) FAIL;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003404 if (md->multiline)
3405 {
3406 if (eptr != md->start_subject && eptr[-1] != '\n') FAIL;
3407 ecode++;
3408 break;
3409 }
3410 /* ... else fall through */
3411
3412 /* Start of subject assertion */
3413
3414 case OP_SOD:
3415 if (eptr != md->start_subject) FAIL;
3416 ecode++;
3417 break;
3418
Guido van Rossum50700601997-12-08 17:15:20 +00003419 /* Assert before internal newline if multiline, or before
3420 a terminating newline unless endonly is set, else end of subject unless
3421 noteol is set. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003422
3423 case OP_DOLL:
Guido van Rossum50700601997-12-08 17:15:20 +00003424 if (md->noteol && eptr >= md->end_subject) FAIL;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003425 if (md->multiline)
3426 {
3427 if (eptr < md->end_subject && *eptr != '\n') FAIL;
3428 ecode++;
3429 break;
3430 }
Guido van Rossum50700601997-12-08 17:15:20 +00003431 else if (!md->endonly)
3432 {
3433 if (eptr < md->end_subject - 1 ||
3434 (eptr == md->end_subject - 1 && *eptr != '\n')) FAIL;
3435 ecode++;
3436 break;
3437 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003438 /* ... else fall through */
3439
3440 /* End of subject assertion */
3441
3442 case OP_EOD:
3443 if (eptr < md->end_subject) FAIL;
3444 ecode++;
3445 break;
3446
3447 /* Word boundary assertions */
3448
3449 case OP_NOT_WORD_BOUNDARY:
3450 case OP_WORD_BOUNDARY:
3451 {
3452 BOOL prev_is_word = (eptr != md->start_subject) &&
3453 ((pcre_ctypes[eptr[-1]] & ctype_word) != 0);
3454 BOOL cur_is_word = (eptr < md->end_subject) &&
3455 ((pcre_ctypes[*eptr] & ctype_word) != 0);
3456 if ((*ecode++ == OP_WORD_BOUNDARY)?
3457 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3458 FAIL;
3459 }
3460 break;
3461
Guido van Rossum50700601997-12-08 17:15:20 +00003462 case OP_NOT_WORD_BOUNDARY_L:
3463 case OP_WORD_BOUNDARY_L:
3464 {
3465 BOOL prev_is_word = (eptr != md->start_subject) &&
Guido van Rossum58132c61997-12-17 00:24:13 +00003466 (isalnum(eptr[-1]) || eptr[-1]=='_');
Guido van Rossum50700601997-12-08 17:15:20 +00003467 BOOL cur_is_word = (eptr < md->end_subject) &&
Guido van Rossum58132c61997-12-17 00:24:13 +00003468 (isalnum(*eptr) || *eptr=='_');
Guido van Rossum50700601997-12-08 17:15:20 +00003469 if ((*ecode++ == OP_WORD_BOUNDARY_L)?
3470 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3471 FAIL;
3472 }
3473 break;
3474
3475
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003476 /* Match a single character type; inline for speed */
3477
3478 case OP_ANY:
3479 if (!md->dotall && eptr < md->end_subject && *eptr == '\n') FAIL;
3480 if (eptr++ >= md->end_subject) FAIL;
3481 ecode++;
3482 break;
3483
3484 case OP_NOT_DIGIT:
3485 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) != 0)
3486 FAIL;
3487 ecode++;
3488 break;
3489
3490 case OP_DIGIT:
3491 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) == 0)
3492 FAIL;
3493 ecode++;
3494 break;
3495
3496 case OP_NOT_WHITESPACE:
3497 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) != 0)
3498 FAIL;
3499 ecode++;
3500 break;
3501
3502 case OP_WHITESPACE:
3503 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) == 0)
3504 FAIL;
3505 ecode++;
3506 break;
3507
3508 case OP_NOT_WORDCHAR:
3509 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) != 0)
3510 FAIL;
3511 ecode++;
3512 break;
3513
3514 case OP_WORDCHAR:
3515 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) == 0)
3516 FAIL;
3517 ecode++;
3518 break;
3519
Guido van Rossum50700601997-12-08 17:15:20 +00003520 case OP_NOT_WORDCHAR_L:
Guido van Rossum58132c61997-12-17 00:24:13 +00003521 if (eptr >= md->end_subject || (*eptr=='_' || isalnum(*eptr) ))
Guido van Rossum557dea11997-12-22 22:46:52 +00003522 FAIL;
Guido van Rossum50700601997-12-08 17:15:20 +00003523 eptr++;
3524 ecode++;
3525 break;
3526
3527 case OP_WORDCHAR_L:
Guido van Rossum58132c61997-12-17 00:24:13 +00003528 if (eptr >= md->end_subject || (*eptr!='_' && !isalnum(*eptr) ))
Guido van Rossum557dea11997-12-22 22:46:52 +00003529 FAIL;
Guido van Rossum50700601997-12-08 17:15:20 +00003530 eptr++;
3531 ecode++;
3532 break;
3533
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003534 /* Match a back reference, possibly repeatedly. Look past the end of the
3535 item to see if there is repeat information following. The code is similar
3536 to that for character classes, but repeated for efficiency. Then obey
3537 similar code to character type repeats - written out again for speed.
3538 However, if the referenced string is the empty string, always treat
3539 it as matched, any number of times (otherwise there could be infinite
3540 loops). */
3541
3542 case OP_REF:
3543 {
3544 int length;
3545 int number = ecode[1] << 1; /* Doubled reference number */
3546 ecode += 2; /* Advance past the item */
3547
3548 if (number >= offset_top || md->offset_vector[number] < 0)
3549 {
3550 md->errorcode = PCRE_ERROR_BADREF;
3551 FAIL;
3552 }
3553
3554 length = md->offset_vector[number+1] - md->offset_vector[number];
3555
3556 switch (*ecode)
3557 {
3558 case OP_CRSTAR:
3559 case OP_CRMINSTAR:
3560 case OP_CRPLUS:
3561 case OP_CRMINPLUS:
3562 case OP_CRQUERY:
3563 case OP_CRMINQUERY:
3564 c = *ecode++ - OP_CRSTAR;
3565 minimize = (c & 1) != 0;
3566 min = rep_min[c]; /* Pick up values from tables; */
3567 max = rep_max[c]; /* zero for max => infinity */
3568 if (max == 0) max = INT_MAX;
3569 break;
3570
3571 case OP_CRRANGE:
3572 case OP_CRMINRANGE:
3573 minimize = (*ecode == OP_CRMINRANGE);
3574 min = (ecode[1] << 8) + ecode[2];
3575 max = (ecode[3] << 8) + ecode[4];
3576 if (max == 0) max = INT_MAX;
3577 ecode += 5;
3578 break;
3579
3580 default: /* No repeat follows */
3581 if (!match_ref(number, eptr, length, md)) FAIL;
3582 eptr += length;
3583 continue; /* With the main loop */
3584 }
3585
3586 /* If the length of the reference is zero, just continue with the
3587 main loop. */
3588
3589 if (length == 0) continue;
3590
3591 /* First, ensure the minimum number of matches are present. We get back
3592 the length of the reference string explicitly rather than passing the
3593 address of eptr, so that eptr can be a register variable. */
3594
3595 for (i = 1; i <= min; i++)
3596 {
3597 if (!match_ref(number, eptr, length, md)) FAIL;
3598 eptr += length;
3599 }
3600
3601 /* If min = max, continue at the same level without recursion.
3602 They are not both allowed to be zero. */
3603
3604 if (min == max) continue;
3605
3606 /* If minimizing, keep trying and advancing the pointer */
3607
3608 if (minimize)
3609 {
3610 for (i = min;; i++)
3611 {
3612 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3613 if (i >= max || !match_ref(number, eptr, length, md))
3614 FAIL;
3615 eptr += length;
3616 }
3617 /* Control never gets here */
3618 }
3619
3620 /* If maximizing, find the longest string and work backwards */
3621
3622 else
3623 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003624 const uschar *pp = eptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003625 for (i = min; i < max; i++)
3626 {
3627 if (!match_ref(number, eptr, length, md)) break;
3628 eptr += length;
3629 }
3630 while (eptr >= pp)
3631 {
3632 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3633 eptr -= length;
3634 }
3635 FAIL;
3636 }
3637 }
3638 /* Control never gets here */
3639
3640 /* Match a character class, possibly repeatedly. Look past the end of the
3641 item to see if there is repeat information following. Then obey similar
Guido van Rossum50700601997-12-08 17:15:20 +00003642 code to character type repeats - written out again for speed. If caseless
3643 matching was set at runtime but not at compile time, we have to check both
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003644 versions of a character, and we have to behave differently for positive and
3645 negative classes. This is the only time where OP_CLASS and OP_NEGCLASS are
3646 treated differently. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003647
3648 case OP_CLASS:
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003649 case OP_NEGCLASS:
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003650 {
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003651 BOOL nasty_case = *ecode == OP_NEGCLASS && md->runtime_caseless;
Guido van Rossum58132c61997-12-17 00:24:13 +00003652 const uschar *data = ecode + 1; /* Save for matching */
3653 ecode += 33; /* Advance past the item */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003654
3655 switch (*ecode)
3656 {
3657 case OP_CRSTAR:
3658 case OP_CRMINSTAR:
3659 case OP_CRPLUS:
3660 case OP_CRMINPLUS:
3661 case OP_CRQUERY:
3662 case OP_CRMINQUERY:
3663 c = *ecode++ - OP_CRSTAR;
3664 minimize = (c & 1) != 0;
3665 min = rep_min[c]; /* Pick up values from tables; */
3666 max = rep_max[c]; /* zero for max => infinity */
3667 if (max == 0) max = INT_MAX;
3668 break;
3669
3670 case OP_CRRANGE:
3671 case OP_CRMINRANGE:
3672 minimize = (*ecode == OP_CRMINRANGE);
3673 min = (ecode[1] << 8) + ecode[2];
3674 max = (ecode[3] << 8) + ecode[4];
3675 if (max == 0) max = INT_MAX;
3676 ecode += 5;
3677 break;
3678
3679 default: /* No repeat follows */
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003680 min = max = 1;
3681 break;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003682 }
3683
3684 /* First, ensure the minimum number of matches are present. */
3685
3686 for (i = 1; i <= min; i++)
Guido van Rossum50700601997-12-08 17:15:20 +00003687 {
3688 if (eptr >= md->end_subject) FAIL;
3689 c = *eptr++;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003690
3691 /* Either not runtime caseless, or it was a positive class. For
3692 runtime caseless, continue if either case is in the map. */
3693
3694 if (!nasty_case)
Guido van Rossum50700601997-12-08 17:15:20 +00003695 {
Guido van Rossum50700601997-12-08 17:15:20 +00003696 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003697 if (md->runtime_caseless)
3698 {
3699 c = pcre_fcc[c];
3700 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3701 }
Guido van Rossum50700601997-12-08 17:15:20 +00003702 }
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003703
3704 /* Runtime caseless and it was a negative class. Continue only if
3705 both cases are in the map. */
3706
3707 else
3708 {
3709 if ((data[c/8] & (1 << (c&7))) == 0) FAIL;
3710 c = pcre_fcc[c];
3711 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3712 }
3713
3714 FAIL;
Guido van Rossum50700601997-12-08 17:15:20 +00003715 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003716
3717 /* If max == min we can continue with the main loop without the
3718 need to recurse. */
3719
3720 if (min == max) continue;
3721
3722 /* If minimizing, keep testing the rest of the expression and advancing
3723 the pointer while it matches the class. */
3724
3725 if (minimize)
3726 {
3727 for (i = min;; i++)
3728 {
3729 if (match(eptr, ecode, offset_top, md)) SUCCEED;
Guido van Rossum50700601997-12-08 17:15:20 +00003730 if (i >= max || eptr >= md->end_subject) FAIL;
3731 c = *eptr++;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003732
3733 /* Either not runtime caseless, or it was a positive class. For
3734 runtime caseless, continue if either case is in the map. */
3735
3736 if (!nasty_case)
Guido van Rossum50700601997-12-08 17:15:20 +00003737 {
Guido van Rossum50700601997-12-08 17:15:20 +00003738 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003739 if (md->runtime_caseless)
3740 {
3741 c = pcre_fcc[c];
3742 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3743 }
Guido van Rossum50700601997-12-08 17:15:20 +00003744 }
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003745
3746 /* Runtime caseless and it was a negative class. Continue only if
3747 both cases are in the map. */
3748
3749 else
3750 {
3751 if ((data[c/8] & (1 << (c&7))) == 0) return FALSE;
3752 c = pcre_fcc[c];
3753 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3754 }
3755
Guido van Rossum50700601997-12-08 17:15:20 +00003756 FAIL;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003757 }
3758 /* Control never gets here */
3759 }
3760
3761 /* If maximizing, find the longest possible run, then work backwards. */
3762
3763 else
3764 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003765 const uschar *pp = eptr;
Guido van Rossum50700601997-12-08 17:15:20 +00003766 for (i = min; i < max; eptr++, i++)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003767 {
Guido van Rossum50700601997-12-08 17:15:20 +00003768 if (eptr >= md->end_subject) break;
3769 c = *eptr;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003770
3771 /* Either not runtime caseless, or it was a positive class. For
3772 runtime caseless, continue if either case is in the map. */
3773
3774 if (!nasty_case)
Guido van Rossum50700601997-12-08 17:15:20 +00003775 {
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003776 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3777 if (md->runtime_caseless)
3778 {
3779 c = pcre_fcc[c];
3780 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3781 }
3782 }
3783
3784 /* Runtime caseless and it was a negative class. Continue only if
3785 both cases are in the map. */
3786
3787 else
3788 {
3789 if ((data[c/8] & (1 << (c&7))) == 0) break;
Guido van Rossum50700601997-12-08 17:15:20 +00003790 c = pcre_fcc[c];
3791 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3792 }
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003793
Guido van Rossum50700601997-12-08 17:15:20 +00003794 break;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003795 }
Guido van Rossum50700601997-12-08 17:15:20 +00003796
3797 while (eptr >= pp)
3798 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
3799 FAIL;
3800 }
3801 }
3802 /* Control never gets here */
3803
3804 /* OP_CLASS_L opcode: handles localized character classes */
3805
3806 case OP_CLASS_L:
3807 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003808 const uschar *data = ecode + 1; /* Save for matching */
3809 const uschar locale_flag = *data;
Guido van Rossum50700601997-12-08 17:15:20 +00003810 ecode++; data++; /* The localization support adds an extra byte */
3811
3812 ecode += 33; /* Advance past the item */
3813
3814 switch (*ecode)
3815 {
3816 case OP_CRSTAR:
3817 case OP_CRMINSTAR:
3818 case OP_CRPLUS:
3819 case OP_CRMINPLUS:
3820 case OP_CRQUERY:
3821 case OP_CRMINQUERY:
3822 c = *ecode++ - OP_CRSTAR;
3823 minimize = (c & 1) != 0;
3824 min = rep_min[c]; /* Pick up values from tables; */
3825 max = rep_max[c]; /* zero for max => infinity */
3826 if (max == 0) max = INT_MAX;
3827 break;
3828
3829 case OP_CRRANGE:
3830 case OP_CRMINRANGE:
3831 minimize = (*ecode == OP_CRMINRANGE);
3832 min = (ecode[1] << 8) + ecode[2];
3833 max = (ecode[3] << 8) + ecode[4];
3834 if (max == 0) max = INT_MAX;
3835 ecode += 5;
3836 break;
3837
3838 default: /* No repeat follows */
3839 if (eptr >= md->end_subject) FAIL;
3840 c = *eptr++;
3841 if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */
Guido van Rossum58132c61997-12-17 00:24:13 +00003842 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3843 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003844#if 0
3845 if ( (locale_flag & 4) && isdigit(c) ) continue; /* Locale \d */
3846 if ( (locale_flag & 8) && !isdigit(c) ) continue; /* Locale \D */
3847 if ( (locale_flag & 16) && isspace(c) ) continue; /* Locale \s */
3848 if ( (locale_flag & 32) && !isspace(c) ) continue; /* Locale \S */
3849#endif
3850
3851 if (md->runtime_caseless)
3852 {
3853 c = pcre_fcc[c];
3854 if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */
3855
Guido van Rossum58132c61997-12-17 00:24:13 +00003856 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3857 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003858 }
3859 FAIL;
3860 }
3861
3862 /* First, ensure the minimum number of matches are present. */
3863
3864 for (i = 1; i <= min; i++)
3865 {
3866 if (eptr >= md->end_subject) FAIL;
3867 c = *eptr++;
3868 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum58132c61997-12-17 00:24:13 +00003869 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3870 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003871
3872 if (md->runtime_caseless)
3873 {
3874 c = pcre_fcc[c];
3875 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum58132c61997-12-17 00:24:13 +00003876 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3877 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003878 }
3879 FAIL;
3880 }
3881
3882 /* If max == min we can continue with the main loop without the
3883 need to recurse. */
3884
3885 if (min == max) continue;
3886
3887 /* If minimizing, keep testing the rest of the expression and advancing
3888 the pointer while it matches the class. */
3889
3890 if (minimize)
3891 {
3892 for (i = min;; i++)
3893 {
3894 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3895 if (i >= max || eptr >= md->end_subject) FAIL;
3896 c = *eptr++;
3897 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum58132c61997-12-17 00:24:13 +00003898 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3899 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003900
3901 if (md->runtime_caseless)
3902 {
3903 c = pcre_fcc[c];
3904 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum58132c61997-12-17 00:24:13 +00003905 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3906 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003907 }
3908 FAIL;
3909 }
3910 /* Control never gets here */
3911 }
3912
3913 /* If maximizing, find the longest possible run, then work backwards. */
3914
3915 else
3916 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003917 const uschar *pp = eptr;
Guido van Rossum50700601997-12-08 17:15:20 +00003918 for (i = min; i < max; eptr++, i++)
3919 {
3920 if (eptr >= md->end_subject) break;
3921 c = *eptr;
3922 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum58132c61997-12-17 00:24:13 +00003923 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3924 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003925 if (md->runtime_caseless)
3926 {
3927 c = pcre_fcc[c];
3928 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum58132c61997-12-17 00:24:13 +00003929 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3930 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003931 }
3932 break;
3933 }
3934
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003935 while (eptr >= pp)
3936 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
3937 FAIL;
3938 }
3939 }
3940 /* Control never gets here */
3941
3942 /* Match a run of characters */
3943
3944 case OP_CHARS:
3945 {
3946 register int length = ecode[1];
3947 ecode += 2;
3948
Guido van Rossum557dea11997-12-22 22:46:52 +00003949#ifdef DEBUG /* Sigh. Some compilers never learn. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003950 if (eptr >= md->end_subject)
3951 printf("matching subject <null> against pattern ");
3952 else
3953 {
3954 printf("matching subject ");
3955 pchars(eptr, length, TRUE, md);
3956 printf(" against pattern ");
3957 }
3958 pchars(ecode, length, FALSE, md);
3959 printf("\n");
Guido van Rossum57ba4f31997-12-02 20:40:28 +00003960#endif
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003961
3962 if (length > md->end_subject - eptr) FAIL;
3963 if (md->caseless)
3964 {
3965 while (length-- > 0) if (pcre_lcc[*ecode++] != pcre_lcc[*eptr++]) FAIL;
3966 }
3967 else
3968 {
3969 while (length-- > 0) if (*ecode++ != *eptr++) FAIL;
3970 }
3971 }
3972 break;
3973
3974 /* Match a single character repeatedly; different opcodes share code. */
3975
3976 case OP_EXACT:
3977 min = max = (ecode[1] << 8) + ecode[2];
3978 ecode += 3;
3979 goto REPEATCHAR;
3980
3981 case OP_UPTO:
3982 case OP_MINUPTO:
3983 min = 0;
3984 max = (ecode[1] << 8) + ecode[2];
3985 minimize = *ecode == OP_MINUPTO;
3986 ecode += 3;
3987 goto REPEATCHAR;
3988
3989 case OP_STAR:
3990 case OP_MINSTAR:
3991 case OP_PLUS:
3992 case OP_MINPLUS:
3993 case OP_QUERY:
3994 case OP_MINQUERY:
3995 c = *ecode++ - OP_STAR;
3996 minimize = (c & 1) != 0;
3997 min = rep_min[c]; /* Pick up values from tables; */
3998 max = rep_max[c]; /* zero for max => infinity */
3999 if (max == 0) max = INT_MAX;
4000
4001 /* Common code for all repeated single-character matches. We can give
4002 up quickly if there are fewer than the minimum number of characters left in
4003 the subject. */
4004
4005 REPEATCHAR:
4006 if (min > md->end_subject - eptr) FAIL;
4007 c = *ecode++;
4008
4009 /* The code is duplicated for the caseless and caseful cases, for speed,
4010 since matching characters is likely to be quite common. First, ensure the
4011 minimum number of matches are present. If min = max, continue at the same
4012 level without recursing. Otherwise, if minimizing, keep trying the rest of
4013 the expression and advancing one matching character if failing, up to the
4014 maximum. Alternatively, if maximizing, find the maximum number of
4015 characters and work backwards. */
4016
Guido van Rossum557dea11997-12-22 22:46:52 +00004017 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
4018 max, eptr));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004019
4020 if (md->caseless)
4021 {
4022 c = pcre_lcc[c];
4023 for (i = 1; i <= min; i++) if (c != pcre_lcc[*eptr++]) FAIL;
4024 if (min == max) continue;
4025 if (minimize)
4026 {
4027 for (i = min;; i++)
4028 {
4029 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4030 if (i >= max || eptr >= md->end_subject || c != pcre_lcc[*eptr++])
4031 FAIL;
4032 }
4033 /* Control never gets here */
4034 }
4035 else
4036 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004037 const uschar *pp = eptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004038 for (i = min; i < max; i++)
4039 {
4040 if (eptr >= md->end_subject || c != pcre_lcc[*eptr]) break;
4041 eptr++;
4042 }
4043 while (eptr >= pp)
4044 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4045 FAIL;
4046 }
Guido van Rossum50700601997-12-08 17:15:20 +00004047 /* Control never gets here */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004048 }
4049
4050 /* Caseful comparisons */
4051
4052 else
4053 {
4054 for (i = 1; i <= min; i++) if (c != *eptr++) FAIL;
4055 if (min == max) continue;
4056 if (minimize)
4057 {
4058 for (i = min;; i++)
4059 {
4060 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4061 if (i >= max || eptr >= md->end_subject || c != *eptr++) FAIL;
4062 }
4063 /* Control never gets here */
4064 }
4065 else
4066 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004067 const uschar *pp = eptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004068 for (i = min; i < max; i++)
4069 {
4070 if (eptr >= md->end_subject || c != *eptr) break;
4071 eptr++;
4072 }
4073 while (eptr >= pp)
4074 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4075 FAIL;
4076 }
4077 }
4078 /* Control never gets here */
4079
Guido van Rossum50700601997-12-08 17:15:20 +00004080 /* Match a negated single character */
4081
4082 case OP_NOT:
Guido van Rossum557dea11997-12-22 22:46:52 +00004083 if (eptr >= md->end_subject) FAIL;
Guido van Rossum50700601997-12-08 17:15:20 +00004084 ecode++;
4085 if (md->caseless)
4086 {
4087 if (pcre_lcc[*ecode++] == pcre_lcc[*eptr++]) FAIL;
4088 }
4089 else
4090 {
4091 if (*ecode++ == *eptr++) FAIL;
4092 }
4093 break;
4094
4095 /* Match a negated single character repeatedly. This is almost a repeat of
4096 the code for a repeated single character, but I haven't found a nice way of
4097 commoning these up that doesn't require a test of the positive/negative
4098 option for each character match. Maybe that wouldn't add very much to the
4099 time taken, but character matching *is* what this is all about... */
4100
4101 case OP_NOTEXACT:
4102 min = max = (ecode[1] << 8) + ecode[2];
4103 ecode += 3;
4104 goto REPEATNOTCHAR;
4105
4106 case OP_NOTUPTO:
4107 case OP_NOTMINUPTO:
4108 min = 0;
4109 max = (ecode[1] << 8) + ecode[2];
4110 minimize = *ecode == OP_NOTMINUPTO;
4111 ecode += 3;
4112 goto REPEATNOTCHAR;
4113
4114 case OP_NOTSTAR:
4115 case OP_NOTMINSTAR:
4116 case OP_NOTPLUS:
4117 case OP_NOTMINPLUS:
4118 case OP_NOTQUERY:
4119 case OP_NOTMINQUERY:
4120 c = *ecode++ - OP_NOTSTAR;
4121 minimize = (c & 1) != 0;
4122 min = rep_min[c]; /* Pick up values from tables; */
4123 max = rep_max[c]; /* zero for max => infinity */
4124 if (max == 0) max = INT_MAX;
4125
4126 /* Common code for all repeated single-character matches. We can give
4127 up quickly if there are fewer than the minimum number of characters left in
4128 the subject. */
4129
4130 REPEATNOTCHAR:
4131 if (min > md->end_subject - eptr) FAIL;
4132 c = *ecode++;
4133
4134 /* The code is duplicated for the caseless and caseful cases, for speed,
4135 since matching characters is likely to be quite common. First, ensure the
4136 minimum number of matches are present. If min = max, continue at the same
4137 level without recursing. Otherwise, if minimizing, keep trying the rest of
4138 the expression and advancing one matching character if failing, up to the
4139 maximum. Alternatively, if maximizing, find the maximum number of
4140 characters and work backwards. */
4141
Guido van Rossum557dea11997-12-22 22:46:52 +00004142 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
4143 max, eptr));
Guido van Rossum50700601997-12-08 17:15:20 +00004144
4145 if (md->caseless)
4146 {
4147 c = pcre_lcc[c];
4148 for (i = 1; i <= min; i++) if (c == pcre_lcc[*eptr++]) FAIL;
4149 if (min == max) continue;
4150 if (minimize)
4151 {
4152 for (i = min;; i++)
4153 {
4154 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4155 if (i >= max || eptr >= md->end_subject || c == pcre_lcc[*eptr++])
4156 FAIL;
4157 }
4158 /* Control never gets here */
4159 }
4160 else
4161 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004162 const uschar *pp = eptr;
Guido van Rossum50700601997-12-08 17:15:20 +00004163 for (i = min; i < max; i++)
4164 {
4165 if (eptr >= md->end_subject || c == pcre_lcc[*eptr]) break;
4166 eptr++;
4167 }
4168 while (eptr >= pp)
4169 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4170 FAIL;
4171 }
4172 /* Control never gets here */
4173 }
4174
4175 /* Caseful comparisons */
4176
4177 else
4178 {
4179 for (i = 1; i <= min; i++) if (c == *eptr++) FAIL;
4180 if (min == max) continue;
4181 if (minimize)
4182 {
4183 for (i = min;; i++)
4184 {
4185 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4186 if (i >= max || eptr >= md->end_subject || c == *eptr++) FAIL;
4187 }
4188 /* Control never gets here */
4189 }
4190 else
4191 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004192 const uschar *pp = eptr;
Guido van Rossum50700601997-12-08 17:15:20 +00004193 for (i = min; i < max; i++)
4194 {
4195 if (eptr >= md->end_subject || c == *eptr) break;
4196 eptr++;
4197 }
4198 while (eptr >= pp)
4199 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4200 FAIL;
4201 }
4202 }
4203 /* Control never gets here */
4204
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004205 /* Match a single character type repeatedly; several different opcodes
4206 share code. This is very similar to the code for single characters, but we
4207 repeat it in the interests of efficiency. */
4208
4209 case OP_TYPEEXACT:
4210 min = max = (ecode[1] << 8) + ecode[2];
4211 minimize = TRUE;
4212 ecode += 3;
4213 goto REPEATTYPE;
4214
4215 case OP_TYPEUPTO:
4216 case OP_TYPEMINUPTO:
4217 min = 0;
4218 max = (ecode[1] << 8) + ecode[2];
4219 minimize = *ecode == OP_TYPEMINUPTO;
4220 ecode += 3;
4221 goto REPEATTYPE;
4222
4223 case OP_TYPESTAR:
4224 case OP_TYPEMINSTAR:
4225 case OP_TYPEPLUS:
4226 case OP_TYPEMINPLUS:
4227 case OP_TYPEQUERY:
4228 case OP_TYPEMINQUERY:
4229 c = *ecode++ - OP_TYPESTAR;
4230 minimize = (c & 1) != 0;
4231 min = rep_min[c]; /* Pick up values from tables; */
4232 max = rep_max[c]; /* zero for max => infinity */
4233 if (max == 0) max = INT_MAX;
4234
4235 /* Common code for all repeated single character type matches */
4236
4237 REPEATTYPE:
4238 ctype = *ecode++; /* Code for the character type */
4239
4240 /* First, ensure the minimum number of matches are present. Use inline
4241 code for maximizing the speed, and do the type test once at the start
4242 (i.e. keep it out of the loop). Also test that there are at least the
4243 minimum number of characters before we start. */
4244
4245 if (min > md->end_subject - eptr) FAIL;
4246 if (min > 0) switch(ctype)
4247 {
4248 case OP_ANY:
4249 if (!md->dotall)
4250 { for (i = 1; i <= min; i++) if (*eptr++ == '\n') FAIL; }
4251 else eptr += min;
4252 break;
4253
4254 case OP_NOT_DIGIT:
4255 for (i = 1; i <= min; i++)
4256 if ((pcre_ctypes[*eptr++] & ctype_digit) != 0) FAIL;
4257 break;
4258
4259 case OP_DIGIT:
4260 for (i = 1; i <= min; i++)
4261 if ((pcre_ctypes[*eptr++] & ctype_digit) == 0) FAIL;
4262 break;
4263
4264 case OP_NOT_WHITESPACE:
4265 for (i = 1; i <= min; i++)
4266 if ((pcre_ctypes[*eptr++] & ctype_space) != 0) FAIL;
4267 break;
4268
4269 case OP_WHITESPACE:
4270 for (i = 1; i <= min; i++)
4271 if ((pcre_ctypes[*eptr++] & ctype_space) == 0) FAIL;
4272 break;
4273
4274 case OP_NOT_WORDCHAR:
4275 for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) != 0)
4276 FAIL;
4277 break;
4278
4279 case OP_WORDCHAR:
4280 for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) == 0)
4281 FAIL;
4282 break;
Guido van Rossum50700601997-12-08 17:15:20 +00004283
4284 case OP_NOT_WORDCHAR_L:
Guido van Rossum58132c61997-12-17 00:24:13 +00004285 for (i = 1; i <= min; i++, eptr++) if (*eptr=='_' || isalnum(*eptr))
Guido van Rossum557dea11997-12-22 22:46:52 +00004286 FAIL;
Guido van Rossum50700601997-12-08 17:15:20 +00004287 break;
4288
4289 case OP_WORDCHAR_L:
Guido van Rossum58132c61997-12-17 00:24:13 +00004290 for (i = 1; i <= min; i++, eptr++) if (*eptr!='_' && !isalnum(*eptr))
Guido van Rossum557dea11997-12-22 22:46:52 +00004291 FAIL;
Guido van Rossum50700601997-12-08 17:15:20 +00004292 break;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004293 }
4294
4295 /* If min = max, continue at the same level without recursing */
4296
4297 if (min == max) continue;
4298
4299 /* If minimizing, we have to test the rest of the pattern before each
4300 subsequent match, so inlining isn't much help; just use the function. */
4301
4302 if (minimize)
4303 {
4304 for (i = min;; i++)
4305 {
4306 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4307 if (i >= max || eptr >= md->end_subject ||
4308 !match_type(ctype, *eptr++, md->dotall))
4309 FAIL;
4310 }
4311 /* Control never gets here */
4312 }
4313
4314 /* If maximizing it is worth using inline code for speed, doing the type
4315 test once at the start (i.e. keep it out of the loop). */
4316
4317 else
4318 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004319 const uschar *pp = eptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004320 switch(ctype)
4321 {
4322 case OP_ANY:
4323 if (!md->dotall)
4324 {
4325 for (i = min; i < max; i++)
4326 {
4327 if (eptr >= md->end_subject || *eptr == '\n') break;
4328 eptr++;
4329 }
4330 }
4331 else
4332 {
4333 c = max - min;
4334 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
4335 eptr += c;
4336 }
4337 break;
4338
4339 case OP_NOT_DIGIT:
4340 for (i = min; i < max; i++)
4341 {
4342 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) != 0)
4343 break;
4344 eptr++;
4345 }
4346 break;
4347
4348 case OP_DIGIT:
4349 for (i = min; i < max; i++)
4350 {
4351 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) == 0)
4352 break;
4353 eptr++;
4354 }
4355 break;
4356
4357 case OP_NOT_WHITESPACE:
4358 for (i = min; i < max; i++)
4359 {
4360 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) != 0)
4361 break;
4362 eptr++;
4363 }
4364 break;
4365
4366 case OP_WHITESPACE:
4367 for (i = min; i < max; i++)
4368 {
4369 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) == 0)
4370 break;
4371 eptr++;
4372 }
4373 break;
4374
4375 case OP_NOT_WORDCHAR:
4376 for (i = min; i < max; i++)
4377 {
4378 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) != 0)
4379 break;
4380 eptr++;
4381 }
4382 break;
4383
4384 case OP_WORDCHAR:
4385 for (i = min; i < max; i++)
4386 {
Guido van Rossum50700601997-12-08 17:15:20 +00004387 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) == 0)
4388 break;
4389 eptr++;
4390 }
4391 break;
4392 case OP_NOT_WORDCHAR_L:
4393 for (i = min; i < max; i++)
4394 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004395 if (eptr >= md->end_subject || (*eptr=='_' || isalnum(*eptr) ) )
Guido van Rossum50700601997-12-08 17:15:20 +00004396 break;
4397 eptr++;
4398 }
4399 break;
4400
4401 case OP_WORDCHAR_L:
4402 for (i = min; i < max; i++)
4403 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004404 if (eptr >= md->end_subject || (*eptr!='_' && !isalnum(*eptr) ) )
Guido van Rossum50700601997-12-08 17:15:20 +00004405 break;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004406 eptr++;
4407 }
4408 break;
4409 }
4410
4411 while (eptr >= pp)
4412 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4413 FAIL;
4414 }
4415 /* Control never gets here */
4416
4417 /* There's been some horrible disaster. */
4418
4419 default:
Guido van Rossum557dea11997-12-22 22:46:52 +00004420 DPRINTF(("Unknown opcode %d\n", *ecode));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004421 md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
4422 FAIL;
4423 }
4424
4425 /* Do not stick any code in here without much thought; it is assumed
4426 that "continue" in the code above comes out to here to repeat the main
4427 loop. */
4428
4429 } /* End of main loop */
4430/* Control never reaches here */
4431
4432fail:
4433 if (md->point > save_stack_position)
4434 {
4435 /* If there are still points remaining on the stack, pop the next one off */
Guido van Rossumc3861071997-10-08 02:07:40 +00004436 int off_num;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004437
4438 md->point--;
4439 offset_top = md->offset_top[md->point];
4440 eptr = md->eptr[md->point];
4441 ecode = md->ecode[md->point];
4442 off_num = md->off_num[md->point];
4443 md->offset_vector[off_num] = md->r1[md->point];
4444 md->offset_vector[off_num+1] = md->r2[md->point];
4445 goto match_loop;
4446 }
4447 /* Failure, and nothing left on the stack, so end this function call */
4448
4449 /* Restore the top of the stack to where it was before this function
4450 call. This lets us use one stack for everything; recursive calls
4451 can push and pop information, and may increase the stack. When
4452 the call returns, the parent function can resume pushing and
4453 popping wherever it was. */
4454
4455 md->point = save_stack_position;
4456 return FALSE;
4457
4458succeed:
4459 return TRUE;
4460}
4461
4462
Guido van Rossum50700601997-12-08 17:15:20 +00004463
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004464/*************************************************
Guido van Rossum557dea11997-12-22 22:46:52 +00004465* Segregate setjmp() *
4466*************************************************/
4467
4468/* The -Wall option of gcc gives warnings for all local variables when setjmp()
4469is used, even if the coding conforms to the rules of ANSI C. To avoid this, we
4470hide it in a separate function. This is called only when PCRE_EXTRA is set,
4471since it's needed only for the extension \X option, and with any luck, a good
4472compiler will spot the tail recursion and compile it efficiently.
4473
4474Arguments:
4475 eptr pointer in subject
4476 ecode position in code
4477 offset_top current top pointer
4478 md pointer to "static" info for the match
4479
4480Returns: TRUE if matched
4481*/
4482
4483static BOOL
4484match_with_setjmp(const uschar *eptr, const uschar *ecode, int offset_top,
4485 match_data *match_block)
4486{
4487return setjmp(match_block->fail_env) == 0 &&
4488 match(eptr, ecode, offset_top, match_block);
4489}
4490
4491
4492
4493/*************************************************
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004494* Execute a Regular Expression *
4495*************************************************/
4496
4497/* This function applies a compiled re to a subject string and picks out
4498portions of the string if it matches. Two elements in the vector are set for
4499each substring: the offsets to the start and end of the substring.
4500
4501Arguments:
Guido van Rossum50700601997-12-08 17:15:20 +00004502 external_re points to the compiled expression
4503 external_extra points to "hints" from pcre_study() or is NULL
4504 subject points to the subject string
4505 length length of subject string (may contain binary zeros)
4506 options option bits
4507 offsets points to a vector of ints to be filled in with offsets
4508 offsetcount the number of elements in the vector
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004509
Guido van Rossum50700601997-12-08 17:15:20 +00004510Returns: > 0 => success; value is the number of elements filled in
4511 = 0 => success, but offsets is not big enough
4512 -1 => failed to match
4513 < -1 => some kind of unexpected problem
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004514*/
4515
4516int
Guido van Rossum50700601997-12-08 17:15:20 +00004517pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
Guido van Rossum816671c1998-03-10 04:55:29 +00004518 const char *subject, int length, int start_pos, int options,
4519 int *offsets, int offsetcount)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004520{
Guido van Rossum58132c61997-12-17 00:24:13 +00004521 /* The "volatile" directives are to make gcc -Wall stop complaining
4522 that these variables can be clobbered by the longjmp. Hopefully
4523 they won't cost too much performance. */
Guido van Rossum042ff9e1998-04-03 21:13:31 +00004524volatile int resetcount, ocount;
4525volatile int first_char = -1;
Tim Peters54925f92000-07-05 22:56:52 +00004526const uschar * volatile start_bits = NULL;
4527const uschar * volatile start_match = (const uschar *)subject + start_pos;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004528match_data match_block;
Guido van Rossum58132c61997-12-17 00:24:13 +00004529const uschar *end_subject;
4530const real_pcre *re = (const real_pcre *)external_re;
4531const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00004532volatile BOOL using_temporary_offsets = FALSE;
4533volatile BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4534volatile BOOL startline = (re->options & PCRE_STARTLINE) != 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004535
4536if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4537
4538if (re == NULL || subject == NULL ||
4539 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4540if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
4541
Guido van Rossum58132c61997-12-17 00:24:13 +00004542match_block.start_subject = (const uschar *)subject;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004543match_block.end_subject = match_block.start_subject + length;
4544end_subject = match_block.end_subject;
4545
Guido van Rossum50700601997-12-08 17:15:20 +00004546match_block.caseless = ((re->options | options) & PCRE_CASELESS) != 0;
4547match_block.runtime_caseless = match_block.caseless &&
4548 (re->options & PCRE_CASELESS) == 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004549
Guido van Rossum50700601997-12-08 17:15:20 +00004550match_block.multiline = ((re->options | options) & PCRE_MULTILINE) != 0;
4551match_block.dotall = ((re->options | options) & PCRE_DOTALL) != 0;
4552match_block.endonly = ((re->options | options) & PCRE_DOLLAR_ENDONLY) != 0;
4553
4554match_block.notbol = (options & PCRE_NOTBOL) != 0;
4555match_block.noteol = (options & PCRE_NOTEOL) != 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004556
4557match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
4558
4559/* Set the stack state to empty */
4560 match_block.off_num = match_block.offset_top = NULL;
4561 match_block.r1 = match_block.r2 = NULL;
4562 match_block.eptr = match_block.ecode = NULL;
4563 match_block.point = match_block.length = 0;
4564
Guido van Rossum50700601997-12-08 17:15:20 +00004565/* If the expression has got more back references than the offsets supplied can
4566hold, we get a temporary bit of working store to use during the matching.
Guido van Rossum557dea11997-12-22 22:46:52 +00004567Otherwise, we can use the vector supplied, rounding down its size to a multiple
4568of 2. */
Guido van Rossum50700601997-12-08 17:15:20 +00004569
Guido van Rossum557dea11997-12-22 22:46:52 +00004570ocount = offsetcount & (-2);
4571if (re->top_backref > 0 && re->top_backref >= ocount/2)
Guido van Rossum50700601997-12-08 17:15:20 +00004572 {
4573 ocount = re->top_backref * 2 + 2;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00004574 match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
Guido van Rossum50700601997-12-08 17:15:20 +00004575 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
Guido van Rossum557dea11997-12-22 22:46:52 +00004576 using_temporary_offsets = TRUE;
4577 DPRINTF(("Got memory to hold back references\n"));
Guido van Rossum50700601997-12-08 17:15:20 +00004578 }
4579else match_block.offset_vector = offsets;
4580
4581match_block.offset_end = ocount;
4582match_block.offset_overflow = FALSE;
4583
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004584/* Compute the minimum number of offsets that we need to reset each time. Doing
4585this makes a huge difference to execution time when there aren't many brackets
4586in the pattern. */
4587
4588resetcount = 2 + re->top_bracket * 2;
Guido van Rossum50700601997-12-08 17:15:20 +00004589if (resetcount > offsetcount) resetcount = ocount;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004590
4591/* If MULTILINE is set at exec time but was not set at compile time, and the
4592anchored flag is set, we must re-check because a setting provoked by ^ in the
4593pattern is not right in multi-line mode. Calling is_anchored() again here does
4594the right check, because multiline is now set. If it now yields FALSE, the
4595expression must have had ^ starting some of its branches. Check to see if
4596that is true for *all* branches, and if so, set the startline flag. */
4597
Guido van Rossum557dea11997-12-22 22:46:52 +00004598if (match_block.multiline && anchored && (re->options & PCRE_MULTILINE) == 0 &&
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004599 !is_anchored(re->code, match_block.multiline))
4600 {
4601 anchored = FALSE;
4602 if (is_startline(re->code)) startline = TRUE;
4603 }
4604
4605/* Set up the first character to match, if available. The first_char value is
4606never set for an anchored regular expression, but the anchoring may be forced
4607at run time, so we have to test for anchoring. The first char may be unset for
4608an unanchored pattern, of course. If there's no first char and the pattern was
4609studied, the may be a bitmap of possible first characters. However, we can
4610use this only if the caseless state of the studying was correct. */
4611
4612if (!anchored)
4613 {
4614 if ((re->options & PCRE_FIRSTSET) != 0)
4615 {
4616 first_char = re->first_char;
4617 if (match_block.caseless) first_char = pcre_lcc[first_char];
4618 }
4619 else
4620 if (!startline && extra != NULL &&
4621 (extra->options & PCRE_STUDY_MAPPED) != 0 &&
4622 ((extra->options & PCRE_STUDY_CASELESS) != 0) == match_block.caseless)
4623 start_bits = extra->start_bits;
4624 }
4625
4626/* Loop for unanchored matches; for anchored regexps the loop runs just once. */
4627
4628do
4629 {
Guido van Rossum557dea11997-12-22 22:46:52 +00004630 int rc;
Guido van Rossum50700601997-12-08 17:15:20 +00004631 register int *iptr = match_block.offset_vector;
4632 register int *iend = iptr + resetcount;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004633
4634 /* Reset the maximum number of extractions we might see. */
4635
4636 while (iptr < iend) *iptr++ = -1;
4637
4638 /* Advance to a unique first char if possible */
4639
4640 if (first_char >= 0)
4641 {
4642 if (match_block.caseless)
4643 while (start_match < end_subject && pcre_lcc[*start_match] != first_char)
4644 start_match++;
4645 else
4646 while (start_match < end_subject && *start_match != first_char)
4647 start_match++;
4648 }
4649
4650 /* Or to just after \n for a multiline match if possible */
4651
4652 else if (startline)
4653 {
4654 if (start_match > match_block.start_subject)
4655 {
4656 while (start_match < end_subject && start_match[-1] != '\n')
4657 start_match++;
4658 }
4659 }
4660
4661 /* Or to a non-unique first char */
4662
4663 else if (start_bits != NULL)
4664 {
4665 while (start_match < end_subject)
4666 {
4667 register int c = *start_match;
Guido van Rossum50700601997-12-08 17:15:20 +00004668 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004669 }
4670 }
4671
Guido van Rossum557dea11997-12-22 22:46:52 +00004672#ifdef DEBUG /* Sigh. Some compilers never learn. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004673 printf(">>>> Match against: ");
4674 pchars(start_match, end_subject - start_match, TRUE, &match_block);
4675 printf("\n");
Guido van Rossum57ba4f31997-12-02 20:40:28 +00004676#endif
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004677
4678 /* When a match occurs, substrings will be set for all internal extractions;
4679 we just need to set up the whole thing as substring 0 before returning. If
Guido van Rossum50700601997-12-08 17:15:20 +00004680 there were too many extractions, set the return code to zero. In the case
4681 where we had to get some local store to hold offsets for backreferences, copy
4682 those back references that we can. In this case there need not be overflow
4683 if certain parts of the pattern were not used.
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004684
Guido van Rossum50700601997-12-08 17:15:20 +00004685 Before starting the match, we have to set up a longjmp() target to enable
Guido van Rossum557dea11997-12-22 22:46:52 +00004686 the "cut" operation to fail a match completely without backtracking. This
4687 is done in a separate function to avoid compiler warnings. We need not do
4688 it unless PCRE_EXTRA is set, since only in that case is the "cut" operation
4689 enabled. */
Guido van Rossum50700601997-12-08 17:15:20 +00004690
4691 /* To handle errors such as running out of memory for the failure
4692 stack, we need to save this location via setjmp(), so
4693 error-handling code can call longjmp() to jump out of deeply-nested code. */
4694 if (setjmp(match_block.error_env)==0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004695 {
Guido van Rossum50700601997-12-08 17:15:20 +00004696
Guido van Rossum557dea11997-12-22 22:46:52 +00004697 if ((re->options & PCRE_EXTRA) != 0)
Guido van Rossum50700601997-12-08 17:15:20 +00004698 {
Guido van Rossum557dea11997-12-22 22:46:52 +00004699 if (!match_with_setjmp(start_match, re->code, 2, &match_block))
4700 continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004701 }
Guido van Rossum557dea11997-12-22 22:46:52 +00004702 else if (!match(start_match, re->code, 2, &match_block)) continue;
4703
4704 /* Copy the offset information from temporary store if necessary */
4705
4706 if (using_temporary_offsets)
4707 {
4708 if (offsetcount >= 4)
4709 {
4710 memcpy(offsets + 2, match_block.offset_vector + 2,
4711 (offsetcount - 2) * sizeof(int));
4712 DPRINTF(("Copied offsets from temporary memory\n"));
4713 }
4714 if (match_block.end_offset_top > offsetcount)
4715 match_block.offset_overflow = TRUE;
4716
4717 DPRINTF(("Freeing temporary memory\n"));
4718 (pcre_free)(match_block.offset_vector);
4719 }
4720
4721 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
4722
4723 if (match_block.offset_end < 2) rc = 0; else
4724 {
4725 offsets[0] = start_match - match_block.start_subject;
4726 offsets[1] = match_block.end_match_ptr - match_block.start_subject;
4727 }
4728
4729 DPRINTF((">>>> returning %d\n", rc));
4730 free_stack(&match_block);
4731 return rc;
Guido van Rossum50700601997-12-08 17:15:20 +00004732 } /* End of (if setjmp(match_block.error_env)...) */
Guido van Rossum042ff9e1998-04-03 21:13:31 +00004733 free_stack(&match_block);
4734
Guido van Rossum50700601997-12-08 17:15:20 +00004735 /* Return an error code; pcremodule.c will preserve the exception */
4736 if (PyErr_Occurred()) return PCRE_ERROR_NOMEMORY;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004737 }
4738while (!anchored &&
4739 match_block.errorcode == PCRE_ERROR_NOMATCH &&
4740 start_match++ < end_subject);
4741
Guido van Rossum557dea11997-12-22 22:46:52 +00004742if (using_temporary_offsets)
4743 {
4744 DPRINTF(("Freeing temporary memory\n"));
4745 (pcre_free)(match_block.offset_vector);
4746 }
4747
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004748#ifdef DEBUG
4749printf(">>>> returning %d\n", match_block.errorcode);
4750#endif
Guido van Rossum50700601997-12-08 17:15:20 +00004751
Barry Warsawb80667d1999-01-27 21:41:08 +00004752 free_stack(&match_block);
Guido van Rossum557dea11997-12-22 22:46:52 +00004753 return match_block.errorcode;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004754}
4755
4756/* End of pcre.c */