blob: 5f21005360dc78a39945e72f64b239c1b0dd3a36 [file] [log] [blame]
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001
2/*************************************************
3* Perl-Compatible Regular Expressions *
4*************************************************/
5
6/* DO NOT EDIT THIS FILE! */
7
8/* This file is automatically written by the merge-files.py script
9included with the PCRE distribution for Python; it's produced from
10several C files, and code is removed in the process. If you want to
11modify the code or track down bugs, it will be much easier to work
12with the code in its original, multiple-file form. Don't edit this
13file by hand, or submit patches to it.
14
15The Python-specific PCRE distribution can be retrieved from
16 http://starship.skyport.net/crew/amk/regex/
17
Guido van Rossum58132c61997-12-17 00:24:13 +000018The unmodified original PCRE distribution is available at
19ftp://ftp.cus.cam.ac.uk/pub/software/programs/pcre/, and is originally
20written by: Philip Hazel <ph10@cam.ac.uk>
Guido van Rossum51b3aa31997-10-06 14:43:11 +000021
22Extensively modified by the Python String-SIG: <string-sig@python.org>
23Send bug reports to: <string-sig@python.org>
24(They'll figure out if it's a bug in PCRE or in the Python-specific
25changes.)
26
27 Copyright (c) 1997 University of Cambridge
28
29-----------------------------------------------------------------------------
30Permission is granted to anyone to use this software for any purpose on any
31computer system, and to redistribute it freely, subject to the following
32restrictions:
33
341. This software is distributed in the hope that it will be useful,
35 but WITHOUT ANY WARRANTY; without even the implied warranty of
36 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
37
382. The origin of this software must not be misrepresented, either by
39 explicit claim or by omission.
40
413. Altered versions must be plainly marked as such, and must not be
42 misrepresented as being the original software.
43-----------------------------------------------------------------------------
44*/
45
46
47#define FOR_PYTHON
Guido van Rossum51b3aa31997-10-06 14:43:11 +000048#include "Python.h"
Martin v. Löwisd4233b22002-03-15 09:16:40 +000049#include "pcre-int.h"
Guido van Rossum50700601997-12-08 17:15:20 +000050#include <ctype.h>
Guido van Rossum51b3aa31997-10-06 14:43:11 +000051#include "graminit.h"
52
53/*************************************************
54* Perl-Compatible Regular Expressions *
55*************************************************/
56
57/* This file is automatically written by the makechartables auxiliary
58program. If you edit it by hand, you might like to edit the Makefile to
59prevent its ever being regenerated. */
60
61/* This table is a lower casing table. */
62
63unsigned char pcre_lcc[] = {
64 0, 1, 2, 3, 4, 5, 6, 7,
65 8, 9, 10, 11, 12, 13, 14, 15,
66 16, 17, 18, 19, 20, 21, 22, 23,
67 24, 25, 26, 27, 28, 29, 30, 31,
68 32, 33, 34, 35, 36, 37, 38, 39,
69 40, 41, 42, 43, 44, 45, 46, 47,
70 48, 49, 50, 51, 52, 53, 54, 55,
71 56, 57, 58, 59, 60, 61, 62, 63,
72 64, 97, 98, 99,100,101,102,103,
73 104,105,106,107,108,109,110,111,
74 112,113,114,115,116,117,118,119,
75 120,121,122, 91, 92, 93, 94, 95,
76 96, 97, 98, 99,100,101,102,103,
77 104,105,106,107,108,109,110,111,
78 112,113,114,115,116,117,118,119,
79 120,121,122,123,124,125,126,127,
80 128,129,130,131,132,133,134,135,
81 136,137,138,139,140,141,142,143,
82 144,145,146,147,148,149,150,151,
83 152,153,154,155,156,157,158,159,
84 160,161,162,163,164,165,166,167,
85 168,169,170,171,172,173,174,175,
86 176,177,178,179,180,181,182,183,
87 184,185,186,187,188,189,190,191,
88 192,193,194,195,196,197,198,199,
89 200,201,202,203,204,205,206,207,
90 208,209,210,211,212,213,214,215,
91 216,217,218,219,220,221,222,223,
92 224,225,226,227,228,229,230,231,
93 232,233,234,235,236,237,238,239,
94 240,241,242,243,244,245,246,247,
95 248,249,250,251,252,253,254,255 };
96
Guido van Rossum50700601997-12-08 17:15:20 +000097/* This table is a case flipping table. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +000098
Guido van Rossum50700601997-12-08 17:15:20 +000099unsigned char pcre_fcc[] = {
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000100 0, 1, 2, 3, 4, 5, 6, 7,
101 8, 9, 10, 11, 12, 13, 14, 15,
102 16, 17, 18, 19, 20, 21, 22, 23,
103 24, 25, 26, 27, 28, 29, 30, 31,
104 32, 33, 34, 35, 36, 37, 38, 39,
105 40, 41, 42, 43, 44, 45, 46, 47,
106 48, 49, 50, 51, 52, 53, 54, 55,
107 56, 57, 58, 59, 60, 61, 62, 63,
Guido van Rossum50700601997-12-08 17:15:20 +0000108 64, 97, 98, 99,100,101,102,103,
109 104,105,106,107,108,109,110,111,
110 112,113,114,115,116,117,118,119,
111 120,121,122, 91, 92, 93, 94, 95,
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000112 96, 65, 66, 67, 68, 69, 70, 71,
113 72, 73, 74, 75, 76, 77, 78, 79,
114 80, 81, 82, 83, 84, 85, 86, 87,
115 88, 89, 90,123,124,125,126,127,
116 128,129,130,131,132,133,134,135,
117 136,137,138,139,140,141,142,143,
118 144,145,146,147,148,149,150,151,
119 152,153,154,155,156,157,158,159,
120 160,161,162,163,164,165,166,167,
121 168,169,170,171,172,173,174,175,
122 176,177,178,179,180,181,182,183,
123 184,185,186,187,188,189,190,191,
124 192,193,194,195,196,197,198,199,
125 200,201,202,203,204,205,206,207,
126 208,209,210,211,212,213,214,215,
127 216,217,218,219,220,221,222,223,
128 224,225,226,227,228,229,230,231,
129 232,233,234,235,236,237,238,239,
130 240,241,242,243,244,245,246,247,
131 248,249,250,251,252,253,254,255 };
132
Guido van Rossum50700601997-12-08 17:15:20 +0000133/* This table contains bit maps for digits, letters, 'word' chars, and
134white space. Each map is 32 bytes long and the bits run from the least
135significant end of each byte. */
136
137unsigned char pcre_cbits[] = {
138 0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
139 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
140 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
141 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
142
143 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
144 0xfe,0xff,0xff,0x07,0xfe,0xff,0xff,0x07,
145 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
146 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
147
148 0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
149 0xfe,0xff,0xff,0x87,0xfe,0xff,0xff,0x07,
150 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
151 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
152
153 0x00,0x3e,0x00,0x00,0x01,0x00,0x00,0x00,
154 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
155 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
156 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 };
157
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000158/* This table identifies various classes of character by individual bits:
Guido van Rossum50700601997-12-08 17:15:20 +0000159 0x01 white space character
160 0x02 letter
161 0x04 decimal digit
162 0x08 hexadecimal digit
163 0x10 alphanumeric or '_'
164 0x80 regular expression metacharacter or binary zero
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000165*/
166
167unsigned char pcre_ctypes[] = {
168 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
169 0x00,0x01,0x01,0x01,0x01,0x01,0x00,0x00, /* 8- 15 */
170 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
171 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
172 0x01,0x00,0x00,0x00,0x80,0x00,0x00,0x00, /* - ' */
173 0x80,0x80,0x80,0x80,0x00,0x00,0x80,0x00, /* ( - / */
Guido van Rossum50700601997-12-08 17:15:20 +0000174 0x3c,0x3c,0x3c,0x3c,0x3c,0x3c,0x3c,0x3c, /* 0 - 7 */
175 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x80, /* 8 - ? */
176 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* @ - G */
177 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* H - O */
178 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* P - W */
179 0x12,0x12,0x12,0x80,0x00,0x00,0x80,0x10, /* X - _ */
180 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* ` - g */
181 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* h - o */
182 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* p - w */
183 0x12,0x12,0x12,0x80,0x80,0x00,0x00,0x00, /* x -127 */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000184 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
185 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
186 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
187 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
188 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
189 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
190 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
191 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
192 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
193 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
194 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
195 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
196 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
197 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
198 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
199 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
200
Guido van Rossum50700601997-12-08 17:15:20 +0000201/* End of chartables.c */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000202/*************************************************
203* Perl-Compatible Regular Expressions *
204*************************************************/
205
206/*
207This is a library of functions to support regular expressions whose syntax
208and semantics are as close as possible to those of the Perl 5 language. See
209the file Tech.Notes for some information on the internals.
210
211Written by: Philip Hazel <ph10@cam.ac.uk>
212
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000213 Copyright (c) 1998 University of Cambridge
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000214
215-----------------------------------------------------------------------------
216Permission is granted to anyone to use this software for any purpose on any
217computer system, and to redistribute it freely, subject to the following
218restrictions:
219
2201. This software is distributed in the hope that it will be useful,
221 but WITHOUT ANY WARRANTY; without even the implied warranty of
222 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
223
2242. The origin of this software must not be misrepresented, either by
225 explicit claim or by omission.
226
2273. Altered versions must be plainly marked as such, and must not be
228 misrepresented as being the original software.
229-----------------------------------------------------------------------------
230*/
231
232
233/* Include the internals header, which itself includes Standard C headers plus
234the external pcre header. */
235
236
237
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000238
239/*************************************************
240* Create bitmap of starting chars *
241*************************************************/
242
243/* This function scans a compiled unanchored expression and attempts to build a
244bitmap of the set of initial characters. If it can't, it returns FALSE. As time
245goes by, we may be able to get more clever at doing this.
246
247Arguments:
248 code points to an expression
249 start_bits points to a 32-byte table, initialized to 0
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000250
251Returns: TRUE if table built, FALSE otherwise
252*/
253
254static BOOL
Guido van Rossum58132c61997-12-17 00:24:13 +0000255set_start_bits(const uschar *code, uschar *start_bits)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000256{
Guido van Rossum50700601997-12-08 17:15:20 +0000257register int c;
Guido van Rossum95864d31998-12-21 18:35:49 +0000258volatile int dummy;
Guido van Rossum50700601997-12-08 17:15:20 +0000259
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000260do
261 {
Guido van Rossum58132c61997-12-17 00:24:13 +0000262 const uschar *tcode = code + 3;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000263 BOOL try_next = TRUE;
264
265 while (try_next)
266 {
267 try_next = FALSE;
268
269 if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
270 {
Guido van Rossum50700601997-12-08 17:15:20 +0000271 if (!set_start_bits(tcode, start_bits)) return FALSE;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000272 }
273
274 else switch(*tcode)
275 {
276 default:
277 return FALSE;
278
279 /* BRAZERO does the bracket, but carries on. */
280
281 case OP_BRAZERO:
282 case OP_BRAMINZERO:
Guido van Rossum50700601997-12-08 17:15:20 +0000283 if (!set_start_bits(++tcode, start_bits)) return FALSE;
Guido van Rossum95864d31998-12-21 18:35:49 +0000284 dummy = 1;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000285 do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
286 tcode += 3;
287 try_next = TRUE;
288 break;
289
290 /* Single-char * or ? sets the bit and tries the next item */
291
292 case OP_STAR:
293 case OP_MINSTAR:
294 case OP_QUERY:
295 case OP_MINQUERY:
Guido van Rossum50700601997-12-08 17:15:20 +0000296 start_bits[tcode[1]/8] |= (1 << (tcode[1]&7));
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000297 tcode += 2;
298 try_next = TRUE;
299 break;
300
301 /* Single-char upto sets the bit and tries the next */
302
303 case OP_UPTO:
304 case OP_MINUPTO:
Guido van Rossum50700601997-12-08 17:15:20 +0000305 start_bits[tcode[3]/8] |= (1 << (tcode[3]&7));
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000306 tcode += 4;
307 try_next = TRUE;
308 break;
309
310 /* At least one single char sets the bit and stops */
311
312 case OP_EXACT: /* Fall through */
313 tcode++;
314
315 case OP_CHARS: /* Fall through */
316 tcode++;
317
318 case OP_PLUS:
319 case OP_MINPLUS:
Guido van Rossum50700601997-12-08 17:15:20 +0000320 start_bits[tcode[1]/8] |= (1 << (tcode[1]&7));
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000321 break;
322
323 /* Single character type sets the bits and stops */
324
325 case OP_NOT_DIGIT:
Guido van Rossum50700601997-12-08 17:15:20 +0000326 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_digit];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000327 break;
328
329 case OP_DIGIT:
Guido van Rossum50700601997-12-08 17:15:20 +0000330 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_digit];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000331 break;
332
333 case OP_NOT_WHITESPACE:
Guido van Rossum50700601997-12-08 17:15:20 +0000334 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_space];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000335 break;
336
337 case OP_WHITESPACE:
Guido van Rossum50700601997-12-08 17:15:20 +0000338 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_space];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000339 break;
340
341 case OP_NOT_WORDCHAR:
Guido van Rossum50700601997-12-08 17:15:20 +0000342 for (c = 0; c < 32; c++)
343 start_bits[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000344 break;
345
346 case OP_WORDCHAR:
Guido van Rossum50700601997-12-08 17:15:20 +0000347 for (c = 0; c < 32; c++)
348 start_bits[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000349 break;
350
351 /* One or more character type fudges the pointer and restarts, knowing
352 it will hit a single character type and stop there. */
353
354 case OP_TYPEPLUS:
355 case OP_TYPEMINPLUS:
356 tcode++;
357 try_next = TRUE;
358 break;
359
360 case OP_TYPEEXACT:
361 tcode += 3;
362 try_next = TRUE;
363 break;
364
365 /* Zero or more repeats of character types set the bits and then
366 try again. */
367
368 case OP_TYPEUPTO:
369 case OP_TYPEMINUPTO:
370 tcode += 2; /* Fall through */
371
372 case OP_TYPESTAR:
373 case OP_TYPEMINSTAR:
374 case OP_TYPEQUERY:
375 case OP_TYPEMINQUERY:
376 switch(tcode[1])
377 {
378 case OP_NOT_DIGIT:
Guido van Rossum50700601997-12-08 17:15:20 +0000379 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_digit];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000380 break;
381
382 case OP_DIGIT:
Guido van Rossum50700601997-12-08 17:15:20 +0000383 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_digit];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000384 break;
385
386 case OP_NOT_WHITESPACE:
Guido van Rossum50700601997-12-08 17:15:20 +0000387 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_space];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000388 break;
389
390 case OP_WHITESPACE:
Guido van Rossum50700601997-12-08 17:15:20 +0000391 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_space];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000392 break;
393
394 case OP_NOT_WORDCHAR:
Guido van Rossum50700601997-12-08 17:15:20 +0000395 for (c = 0; c < 32; c++)
396 start_bits[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000397 break;
398
399 case OP_WORDCHAR:
Guido van Rossum50700601997-12-08 17:15:20 +0000400 for (c = 0; c < 32; c++)
401 start_bits[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000402 break;
403 }
404
405 tcode += 2;
406 try_next = TRUE;
407 break;
408
409 /* Character class: set the bits and either carry on or not,
410 according to the repeat count. */
411
412 case OP_CLASS:
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000413 case OP_NEGCLASS:
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000414 {
Guido van Rossum50700601997-12-08 17:15:20 +0000415 tcode++;
416 for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
417 tcode += 32;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000418 switch (*tcode)
419 {
420 case OP_CRSTAR:
421 case OP_CRMINSTAR:
422 case OP_CRQUERY:
423 case OP_CRMINQUERY:
424 tcode++;
425 try_next = TRUE;
426 break;
427
428 case OP_CRRANGE:
429 case OP_CRMINRANGE:
430 if (((tcode[1] << 8) + tcode[2]) == 0)
431 {
432 tcode += 5;
433 try_next = TRUE;
434 }
435 break;
436 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000437 }
438 break; /* End of class handling */
439
440 } /* End of switch */
441 } /* End of try_next loop */
442
443 code += (code[1] << 8) + code[2]; /* Advance to next branch */
444 }
445while (*code == OP_ALT);
446return TRUE;
447}
448
449
450
451/*************************************************
452* Study a compiled expression *
453*************************************************/
454
455/* This function is handed a compiled expression that it must study to produce
456information that will speed up the matching. It returns a pcre_extra block
457which then gets handed back to pcre_exec().
458
459Arguments:
460 re points to the compiled expression
461 options contains option bits
462 errorptr points to where to place error messages;
463 set NULL unless error
464
465Returns: pointer to a pcre_extra block,
466 NULL on error or if no optimization possible
467*/
468
469pcre_extra *
Guido van Rossum58132c61997-12-17 00:24:13 +0000470pcre_study(const pcre *external_re, int options, const char **errorptr)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000471{
472BOOL caseless;
473uschar start_bits[32];
474real_pcre_extra *extra;
Guido van Rossum58132c61997-12-17 00:24:13 +0000475const real_pcre *re = (const real_pcre *)external_re;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000476
477*errorptr = NULL;
478
479if (re == NULL || re->magic_number != MAGIC_NUMBER)
480 {
481 *errorptr = "argument is not a compiled regular expression";
482 return NULL;
483 }
484
485if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
486 {
487 *errorptr = "unknown or incorrect option bit(s) set";
488 return NULL;
489 }
490
Guido van Rossum50700601997-12-08 17:15:20 +0000491/* Caseless can either be from the compiled regex or from options. */
492
493caseless = ((re->options | options) & PCRE_CASELESS) != 0;
494
Thomas Wouters7e474022000-07-16 12:04:32 +0000495/* For an anchored pattern, or an unanchored pattern that has a first char, or a
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000496multiline pattern that matches only at "line starts", no further processing at
497present. */
498
499if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
500 return NULL;
501
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000502/* See if we can find a fixed set of initial characters for the pattern. */
503
Guido van Rossum50700601997-12-08 17:15:20 +0000504memset(start_bits, 0, 32 * sizeof(uschar));
505if (!set_start_bits(re->code, start_bits)) return NULL;
506
507/* If this studying is caseless, scan the created bit map and duplicate the
508bits for any letters. */
509
510if (caseless)
511 {
512 register int c;
513 for (c = 0; c < 256; c++)
514 {
515 if ((start_bits[c/8] & (1 << (c&7))) != 0 &&
516 (pcre_ctypes[c] & ctype_letter) != 0)
517 {
518 int d = pcre_fcc[c];
519 start_bits[d/8] |= (1 << (d&7));
520 }
521 }
522 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000523
524/* Get an "extra" block and put the information therein. */
525
526extra = (real_pcre_extra *)(pcre_malloc)(sizeof(real_pcre_extra));
527
528if (extra == NULL)
529 {
530 *errorptr = "failed to get memory";
531 return NULL;
532 }
Guido van Rossum50700601997-12-08 17:15:20 +0000533
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000534extra->options = PCRE_STUDY_MAPPED | (caseless? PCRE_STUDY_CASELESS : 0);
Guido van Rossum50700601997-12-08 17:15:20 +0000535memcpy(extra->start_bits, start_bits, sizeof(start_bits));
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000536
537return (pcre_extra *)extra;
538}
539
Guido van Rossum50700601997-12-08 17:15:20 +0000540/* End of study.c */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000541/*************************************************
542* Perl-Compatible Regular Expressions *
543*************************************************/
544
545/*
546This is a library of functions to support regular expressions whose syntax
547and semantics are as close as possible to those of the Perl 5 language. See
548the file Tech.Notes for some information on the internals.
549
550Written by: Philip Hazel <ph10@cam.ac.uk>
551
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000552 Copyright (c) 1998 University of Cambridge
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000553
554-----------------------------------------------------------------------------
555Permission is granted to anyone to use this software for any purpose on any
556computer system, and to redistribute it freely, subject to the following
557restrictions:
558
5591. This software is distributed in the hope that it will be useful,
560 but WITHOUT ANY WARRANTY; without even the implied warranty of
561 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
562
5632. The origin of this software must not be misrepresented, either by
564 explicit claim or by omission.
565
5663. Altered versions must be plainly marked as such, and must not be
567 misrepresented as being the original software.
568-----------------------------------------------------------------------------
569*/
570
571
572/* Define DEBUG to get debugging output on stdout. */
573
574/* #define DEBUG */
575
Guido van Rossum557dea11997-12-22 22:46:52 +0000576/* Use a macro for debugging printing, 'cause that eliminates the the use
577of #ifdef inline, and there are *still* stupid compilers about that don't like
578indented pre-processor statements. I suppose it's only been 10 years... */
579
580#ifdef DEBUG
581#define DPRINTF(p) printf p
582#else
583#define DPRINTF(p) /*nothing*/
584#endif
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000585
586/* Include the internals header, which itself includes Standard C headers plus
587the external pcre header. */
588
589
Guido van Rossum50700601997-12-08 17:15:20 +0000590
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000591
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000592#ifndef Py_eval_input
593/* For Python 1.4, graminit.h has to be explicitly included */
594#define Py_eval_input eval_input
Guido van Rossum50700601997-12-08 17:15:20 +0000595
596#endif /* FOR_PYTHON */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000597
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000598/* Allow compilation as C++ source code, should anybody want to do that. */
599
600#ifdef __cplusplus
601#define class pcre_class
602#endif
603
604
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000605/* Min and max values for the common repeats; for the maxima, 0 => infinity */
606
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000607static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
608static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000609
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000610/* Text forms of OP_ values and things, for debugging (not all used) */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000611
612#ifdef DEBUG
Guido van Rossum58132c61997-12-17 00:24:13 +0000613static const char *OP_names[] = {
614 "End", "\\A", "\\B", "\\b", "\\D", "\\d",
Guido van Rossum50700601997-12-08 17:15:20 +0000615 "\\S", "\\s", "\\W", "\\w", "Cut", "\\Z",
616 "localized \\B", "localized \\b", "localized \\W", "localized \\w",
617 "^", "$", "Any", "chars",
618 "not",
619 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000620 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
621 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
622 "*", "*?", "+", "+?", "?", "??", "{", "{",
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000623 "class", "negclass", "classL", "Ref",
Guido van Rossum50700601997-12-08 17:15:20 +0000624 "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", "Once",
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000625 "Brazero", "Braminzero", "Bra"
626};
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000627#endif
628
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000629/* Table for handling escaped characters in the range '0'-'z'. Positive returns
630are simple data values; negative values are for special things like \d and so
631on. Zero means further processing is needed (for things like \x), or the escape
632is invalid. */
633
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000634static const short int escapes[] = {
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000635 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
636 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
637 '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
638 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
639 0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
640 0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
641 '`', 7, -ESC_b, 0, -ESC_d, 0, '\f', 0, /* ` - g */
642 0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
643 0, 0, '\r', -ESC_s, '\t', 0, '\v', -ESC_w, /* p - w */
644 0, 0, 0 /* x - z */
645};
646
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000647/* Definition to allow mutual recursion */
648
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000649static BOOL
650compile_regex(int, int *, uschar **, const uschar **, const char **,
651 PyObject *);
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000652
653/* Structure for passing "static" information around between the functions
654doing the matching, so that they are thread-safe. */
655
656typedef struct match_data {
657 int errorcode; /* As it says */
658 int *offset_vector; /* Offset vector */
659 int offset_end; /* One past the end */
660 BOOL offset_overflow; /* Set if too many extractions */
661 BOOL caseless; /* Case-independent flag */
Guido van Rossum50700601997-12-08 17:15:20 +0000662 BOOL runtime_caseless; /* Caseless forced at run time */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000663 BOOL multiline; /* Multiline flag */
Guido van Rossum50700601997-12-08 17:15:20 +0000664 BOOL notbol; /* NOTBOL flag */
665 BOOL noteol; /* NOTEOL flag */
666 BOOL dotall; /* Dot matches any char */
667 BOOL endonly; /* Dollar not before final \n */
Guido van Rossum58132c61997-12-17 00:24:13 +0000668 const uschar *start_subject; /* Start of the subject string */
669 const uschar *end_subject; /* End of the subject string */
Guido van Rossum50700601997-12-08 17:15:20 +0000670 jmp_buf fail_env; /* Environment for longjump() break out */
Guido van Rossum58132c61997-12-17 00:24:13 +0000671 const uschar *end_match_ptr; /* Subject position at end match */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000672 int end_offset_top; /* Highwater mark at end of match */
Guido van Rossum50700601997-12-08 17:15:20 +0000673 jmp_buf error_env; /* For longjmp() if an error occurs deep inside a
674 matching operation */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000675 int length; /* Length of the allocated stacks */
676 int point; /* Point to add next item pushed onto stacks */
677 /* Pointers to the 6 stacks */
678 int *off_num, *offset_top, *r1, *r2;
Guido van Rossum58132c61997-12-17 00:24:13 +0000679 const uschar **eptr, **ecode;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000680} match_data;
681
682
683
684/*************************************************
Guido van Rossum50700601997-12-08 17:15:20 +0000685* Global variables *
686*************************************************/
687
688/* PCRE is thread-clean and doesn't use any global variables in the normal
689sense. However, it calls memory allocation and free functions via the two
690indirections below, which are can be changed by the caller, but are shared
691between all threads. */
692
693void *(*pcre_malloc)(size_t) = malloc;
694void (*pcre_free)(void *) = free;
695
696
697
698
699/*************************************************
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000700* Return version string *
701*************************************************/
702
Guido van Rossum58132c61997-12-17 00:24:13 +0000703const char *
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000704pcre_version(void)
705{
706return PCRE_VERSION;
707}
708
709
710
711
712/*************************************************
713* Return info about a compiled pattern *
714*************************************************/
715
716/* This function picks potentially useful data out of the private
717structure.
718
719Arguments:
720 external_re points to compiled code
721 optptr where to pass back the options
722 first_char where to pass back the first character,
723 or -1 if multiline and all branches start ^,
724 or -2 otherwise
725
726Returns: number of identifying extraction brackets
727 or negative values on error
728*/
729
730int
Guido van Rossum50700601997-12-08 17:15:20 +0000731pcre_info(const pcre *external_re, int *optptr, int *first_char)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000732{
Guido van Rossum58132c61997-12-17 00:24:13 +0000733const real_pcre *re = (real_pcre *)external_re;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000734if (re == NULL) return PCRE_ERROR_NULL;
735if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
736if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);
737if (first_char != NULL)
738 *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
739 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
740return re->top_bracket;
741}
742
743
744
745
746#ifdef DEBUG
747/*************************************************
748* Debugging function to print chars *
749*************************************************/
750
751/* Print a sequence of chars in printable format, stopping at the end of the
752subject if the requested.
753
754Arguments:
755 p points to characters
756 length number to print
757 is_subject TRUE if printing from within md->start_subject
758 md pointer to matching data block, if is_subject is TRUE
759
760Returns: nothing
761*/
762
Guido van Rossum557dea11997-12-22 22:46:52 +0000763static void
764pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000765{
766int c;
767if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
768while (length-- > 0)
769 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
770}
771#endif
772
773
774
775
776/*************************************************
777* Check subpattern for empty operand *
778*************************************************/
779
780/* This function checks a bracketed subpattern to see if any of the paths
781through it could match an empty string. This is used to diagnose an error if
782such a subpattern is followed by a quantifier with an unlimited upper bound.
783
784Argument:
785 code points to the opening bracket
786
787Returns: TRUE or FALSE
788*/
789
790static BOOL
791could_be_empty(uschar *code)
792{
793do {
794 uschar *cc = code + 3;
795
796 /* Scan along the opcodes for this branch; as soon as we find something
797 that matches a non-empty string, break out and advance to test the next
798 branch. If we get to the end of the branch, return TRUE for the whole
799 sub-expression. */
800
801 for (;;)
802 {
803 /* Test an embedded subpattern; if it could not be empty, break the
804 loop. Otherwise carry on in the branch. */
805
Guido van Rossum50700601997-12-08 17:15:20 +0000806 if ((int)(*cc) >= OP_BRA || (int)(*cc) == OP_ONCE)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000807 {
808 if (!could_be_empty(cc)) break;
809 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
810 cc += 3;
811 }
812
813 else switch (*cc)
814 {
815 /* Reached end of a branch: the subpattern may match the empty string */
816
817 case OP_ALT:
818 case OP_KET:
819 case OP_KETRMAX:
820 case OP_KETRMIN:
821 return TRUE;
822
Guido van Rossumd0f432b1998-02-20 21:45:14 +0000823 /* Skip over entire bracket groups with zero lower bound */
824
825 case OP_BRAZERO:
826 case OP_BRAMINZERO:
827 cc++;
828 /* Fall through */
829
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000830 /* Skip over assertive subpatterns */
831
832 case OP_ASSERT:
833 case OP_ASSERT_NOT:
834 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
835 cc += 3;
836 break;
837
838 /* Skip over things that don't match chars */
839
840 case OP_SOD:
841 case OP_EOD:
842 case OP_CIRC:
843 case OP_DOLL:
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000844 case OP_NOT_WORD_BOUNDARY:
845 case OP_WORD_BOUNDARY:
Guido van Rossum50700601997-12-08 17:15:20 +0000846 case OP_NOT_WORD_BOUNDARY_L:
847 case OP_WORD_BOUNDARY_L:
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000848 cc++;
849 break;
850
851 /* Skip over simple repeats with zero lower bound */
852
853 case OP_STAR:
854 case OP_MINSTAR:
855 case OP_QUERY:
856 case OP_MINQUERY:
Guido van Rossum50700601997-12-08 17:15:20 +0000857 case OP_NOTSTAR:
858 case OP_NOTMINSTAR:
859 case OP_NOTQUERY:
860 case OP_NOTMINQUERY:
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000861 case OP_TYPESTAR:
862 case OP_TYPEMINSTAR:
863 case OP_TYPEQUERY:
864 case OP_TYPEMINQUERY:
865 cc += 2;
866 break;
867
868 /* Skip over UPTOs (lower bound is zero) */
869
870 case OP_UPTO:
871 case OP_MINUPTO:
872 case OP_TYPEUPTO:
873 case OP_TYPEMINUPTO:
874 cc += 4;
875 break;
876
877 /* Check a class or a back reference for a zero minimum */
878
879 case OP_CLASS:
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000880 case OP_NEGCLASS:
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000881 case OP_REF:
Guido van Rossum50700601997-12-08 17:15:20 +0000882 case OP_CLASS_L:
883 switch(*cc)
884 {
885 case (OP_REF): cc += 2; break;
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000886 case (OP_CLASS): case (OP_NEGCLASS): cc += 1+32; break;
Guido van Rossum50700601997-12-08 17:15:20 +0000887 case (OP_CLASS_L): cc += 1+1+32; break;
888 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000889
890 switch (*cc)
891 {
892 case OP_CRSTAR:
893 case OP_CRMINSTAR:
894 case OP_CRQUERY:
895 case OP_CRMINQUERY:
896 cc++;
897 break;
898
899 case OP_CRRANGE:
900 case OP_CRMINRANGE:
901 if ((cc[1] << 8) + cc[2] != 0) goto NEXT_BRANCH;
902 cc += 3;
903 break;
904
905 default:
906 goto NEXT_BRANCH;
907 }
908 break;
909
910 /* Anything else matches at least one character */
911
912 default:
913 goto NEXT_BRANCH;
914 }
915 }
916
917 NEXT_BRANCH:
918 code += (code[1] << 8) + code[2];
919 }
920while (*code == OP_ALT);
921
922/* No branches match the empty string */
923
924return FALSE;
925}
926
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000927/* Determine the length of a group ID in an expression like
928 (?P<foo_123>...)
929Arguments:
930 ptr pattern position pointer (say that 3 times fast)
931 finalchar the character that will mark the end of the ID
932 errorptr points to the pointer to the error message
933*/
934
935static int
Guido van Rossum58132c61997-12-17 00:24:13 +0000936get_group_id(const uschar *ptr, char finalchar, const char **errorptr)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000937{
Guido van Rossum58132c61997-12-17 00:24:13 +0000938 const uschar *start = ptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000939
940 /* If the first character is not in \w, or is in \w but is a digit,
941 report an error */
942 if (!(pcre_ctypes[*ptr] & ctype_word) ||
943 (pcre_ctypes[*ptr++] & ctype_digit))
944 {
945 *errorptr = "(?P identifier must start with a letter or underscore";
946 return 0;
947 }
948
949 /* Increment ptr until we either hit a null byte, the desired
950 final character, or a non-word character */
951 for(; (*ptr != 0) && (*ptr != finalchar) &&
952 (pcre_ctypes[*ptr] & ctype_word); ptr++)
953 {
Guido van Rossumc3861071997-10-08 02:07:40 +0000954 /* Empty loop body */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000955 }
956 if (*ptr==finalchar)
957 return ptr-start;
958 if (*ptr==0)
959 {
960 *errorptr = "unterminated (?P identifier";
961 return 0;
962 }
963 *errorptr = "illegal character in (?P identifier";
964 return 0;
965}
966
967/*************************************************
968* Handle escapes *
969*************************************************/
970
971/* This function is called when a \ has been encountered. It either returns a
972positive value for a simple escape such as \n, or a negative value which
973encodes one of the more complicated things such as \d. On entry, ptr is
974pointing at the \. On exit, it is on the final character of the escape
975sequence.
976
977Arguments:
978 ptrptr points to the pattern position pointer
979 errorptr points to the pointer to the error message
Guido van Rossum50700601997-12-08 17:15:20 +0000980 bracount number of previous extracting brackets
981 options the options bits
982 isclass TRUE if inside a character class
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000983
984Returns: zero or positive => a data character
985 negative => a special escape sequence
986 on error, errorptr is set
987*/
988
989static int
Guido van Rossum58132c61997-12-17 00:24:13 +0000990check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
991 int options, BOOL isclass)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000992{
Guido van Rossum58132c61997-12-17 00:24:13 +0000993const uschar *ptr = *ptrptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000994int c = *(++ptr) & 255; /* Ensure > 0 on signed-char systems */
995int i;
996
Guido van Rossum50700601997-12-08 17:15:20 +0000997if (c == 0) *errorptr = ERR1;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000998
999/* Digits or letters may have special meaning; all others are literals. */
1000
1001else if (c < '0' || c > 'z') {}
1002
1003/* Do an initial lookup in a table. A non-zero result is something that can be
1004returned immediately. Otherwise further processing may be required. */
1005
1006else if ((i = escapes[c - '0']) != 0) c = i;
1007
1008/* Escapes that need further processing, or are illegal. */
1009
Guido van Rossum50700601997-12-08 17:15:20 +00001010else
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001011 {
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001012
Guido van Rossum50700601997-12-08 17:15:20 +00001013 switch (c)
1014 {
1015 /* The handling of escape sequences consisting of a string of digits
1016 starting with one that is not zero is not straightforward. By experiment,
1017 the way Perl works seems to be as follows:
1018
1019 Outside a character class, the digits are read as a decimal number. If the
1020 number is less than 10, or if there are that many previous extracting
1021 left brackets, then it is a back reference. Otherwise, up to three octal
1022 digits are read to form an escaped byte. Thus \123 is likely to be octal
1023 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
1024 value is greater than 377, the least significant 8 bits are taken. Inside a
1025 character class, \ followed by a digit is always an octal number. */
1026
1027 case '1': case '2': case '3': case '4': case '5':
1028 case '6': case '7': case '8': case '9':
1029
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001030 {
1031 /* PYTHON: Try to compute an octal value for a character */
Guido van Rossum042ff9e1998-04-03 21:13:31 +00001032 for(c=0, i=0; ptr[i]!=0 && i<3; i++)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001033 {
1034 if (( pcre_ctypes[ ptr[i] ] & ctype_odigit) != 0)
Andrew M. Kuchling3b96d0b2000-08-02 13:41:18 +00001035 c = (c * 8 + ptr[i]-'0') & 255;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001036 else
Guido van Rossum042ff9e1998-04-03 21:13:31 +00001037 break; /* Non-octal character--break out of the loop */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001038 }
Guido van Rossum042ff9e1998-04-03 21:13:31 +00001039 /* It's a character if there were exactly 3 octal digits, or if
1040 we're inside a character class and there was at least one
1041 octal digit. */
1042 if ( (i == 3) || (isclass && i!=0) )
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001043 {
1044 ptr += i-1;
1045 break;
1046 }
1047 c = ptr[0]; /* Restore the first character after the \ */
1048 c -= '0'; i = 1;
1049 while (i<2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0)
1050 {
1051 c = c * 10 + ptr[1] - '0';
1052 ptr++; i++;
1053 }
1054 if (c > 255 - ESC_REF) *errorptr = "back reference too big";
1055 c = -(ESC_REF + c);
1056 }
1057 break;
1058
Guido van Rossum50700601997-12-08 17:15:20 +00001059 /* \0 always starts an octal number, but we may drop through to here with a
1060 larger first octal digit */
1061
1062 case '0':
1063 c -= '0';
1064 while(i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0 &&
1065 ptr[1] != '8' && ptr[1] != '9')
Andrew M. Kuchling94c34522000-06-01 03:02:48 +00001066 c = (c * 8 + *(++ptr) - '0') & 255;
Guido van Rossum50700601997-12-08 17:15:20 +00001067 break;
1068
1069 /* Special escapes not starting with a digit are straightforward */
1070
1071 case 'x':
1072 c = 0;
1073 while ( (pcre_ctypes[ptr[1]] & ctype_xdigit) != 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001074 {
Guido van Rossum50700601997-12-08 17:15:20 +00001075 ptr++;
1076 c = c * 16 + pcre_lcc[*ptr] -
1077 (((pcre_ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
1078 c &= 255;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001079 }
1080 break;
1081
1082
Guido van Rossum50700601997-12-08 17:15:20 +00001083 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1084 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
1085 for Perl compatibility, it is a literal. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001086
Guido van Rossum50700601997-12-08 17:15:20 +00001087 default:
1088 if ((options & PCRE_EXTRA) != 0) switch(c)
1089 {
1090 case 'X':
1091 c = -ESC_X; /* This could be a lookup if it ever got into Perl */
1092 break;
1093
1094 default:
1095 *errorptr = ERR3;
1096 break;
1097 }
1098 break;
1099 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001100 }
1101
1102*ptrptr = ptr;
1103return c;
1104}
1105
1106
1107
1108/*************************************************
Guido van Rossum50700601997-12-08 17:15:20 +00001109* Check for counted repeat *
1110*************************************************/
1111
1112/* This function is called when a '{' is encountered in a place where it might
1113start a quantifier. It looks ahead to see if it really is a quantifier or not.
1114It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
1115where the ddds are digits.
1116
1117Arguments:
1118 p pointer to the first char after '{'
1119
1120Returns: TRUE or FALSE
1121*/
1122
1123static BOOL
Guido van Rossum58132c61997-12-17 00:24:13 +00001124is_counted_repeat(const uschar *p)
Guido van Rossum50700601997-12-08 17:15:20 +00001125{
1126if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
1127while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
1128if (*p == '}') return TRUE;
1129
1130if (*p++ != ',') return FALSE;
1131if (*p == '}') return TRUE;
1132
1133if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
1134while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
1135return (*p == '}');
1136}
1137
1138
1139
1140/*************************************************
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001141* Read repeat counts *
1142*************************************************/
1143
Guido van Rossum50700601997-12-08 17:15:20 +00001144/* Read an item of the form {n,m} and return the values. This is called only
1145after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1146so the syntax is guaranteed to be correct, but we need to check the values.
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001147
1148Arguments:
1149 p pointer to first char after '{'
1150 minp pointer to int for min
1151 maxp pointer to int for max
1152 returned as -1 if no max
1153 errorptr points to pointer to error message
1154
1155Returns: pointer to '}' on success;
1156 current ptr on error, with errorptr set
1157*/
1158
Guido van Rossum58132c61997-12-17 00:24:13 +00001159static const uschar *
1160read_repeat_counts(const uschar *p, int *minp, int *maxp, const char **errorptr)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001161{
1162int min = 0;
1163int max = -1;
1164
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001165while ((pcre_ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
1166
1167if (*p == '}') max = min; else
1168 {
Guido van Rossum50700601997-12-08 17:15:20 +00001169 if (*(++p) != '}')
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001170 {
1171 max = 0;
1172 while((pcre_ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001173 if (max < min)
1174 {
Guido van Rossum50700601997-12-08 17:15:20 +00001175 *errorptr = ERR4;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001176 return p;
1177 }
1178 }
1179 }
1180
1181/* Do paranoid checks, then fill in the required variables, and pass back the
1182pointer to the terminating '}'. */
1183
Guido van Rossum50700601997-12-08 17:15:20 +00001184if (min > 65535 || max > 65535)
1185 *errorptr = ERR5;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001186else
1187 {
1188 *minp = min;
1189 *maxp = max;
1190 }
1191return p;
1192}
1193
1194
1195
1196/*************************************************
1197* Compile one branch *
1198*************************************************/
1199
1200/* Scan the pattern, compiling it into the code vector.
1201
1202Arguments:
Guido van Rossum50700601997-12-08 17:15:20 +00001203 options the option bits
1204 bracket points to number of brackets used
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001205 code points to the pointer to the current code point
1206 ptrptr points to the current pattern pointer
1207 errorptr points to pointer to error message
1208
1209Returns: TRUE on success
1210 FALSE, with *errorptr set on error
1211*/
1212
1213static BOOL
Guido van Rossum50700601997-12-08 17:15:20 +00001214compile_branch(int options, int *brackets, uschar **codeptr,
Guido van Rossum58132c61997-12-17 00:24:13 +00001215 const uschar **ptrptr, const char **errorptr, PyObject *dictionary)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001216{
1217int repeat_type, op_type;
1218int repeat_min, repeat_max;
1219int bravalue, length;
Guido van Rossumdda66961998-05-07 15:32:44 +00001220int greedy_default, greedy_non_default;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001221register int c;
1222register uschar *code = *codeptr;
Guido van Rossum58132c61997-12-17 00:24:13 +00001223const uschar *ptr = *ptrptr;
1224const uschar *oldptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001225uschar *previous = NULL;
Guido van Rossum50700601997-12-08 17:15:20 +00001226uschar class[32];
1227uschar *class_flag; /* Pointer to the single-byte flag for OP_CLASS_L */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001228
Guido van Rossumdda66961998-05-07 15:32:44 +00001229/* Set up the default and non-default settings for greediness */
1230
1231greedy_default = ((options & PCRE_UNGREEDY) != 0);
1232greedy_non_default = greedy_default ^ 1;
1233
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001234/* Switch on next character until the end of the branch */
1235
1236for (;; ptr++)
1237 {
Guido van Rossum50700601997-12-08 17:15:20 +00001238 BOOL negate_class;
1239 int class_charcount;
1240 int class_lastchar;
1241
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001242 c = *ptr;
Guido van Rossum50700601997-12-08 17:15:20 +00001243 if ((options & PCRE_EXTENDED) != 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001244 {
1245 if ((pcre_ctypes[c] & ctype_space) != 0) continue;
1246 if (c == '#')
1247 {
1248 while ((c = *(++ptr)) != 0 && c != '\n');
1249 continue;
1250 }
1251 }
1252
1253 switch(c)
1254 {
1255 /* The branch terminates at end of string, |, or ). */
1256
1257 case 0:
1258 case '|':
1259 case ')':
1260 *codeptr = code;
1261 *ptrptr = ptr;
1262 return TRUE;
1263
1264 /* Handle single-character metacharacters */
1265
1266 case '^':
1267 previous = NULL;
1268 *code++ = OP_CIRC;
1269 break;
1270
1271 case '$':
1272 previous = NULL;
1273 *code++ = OP_DOLL;
1274 break;
1275
1276 case '.':
1277 previous = code;
1278 *code++ = OP_ANY;
1279 break;
1280
Guido van Rossum50700601997-12-08 17:15:20 +00001281 /* Character classes. These always build a 32-byte bitmap of the permitted
1282 characters, except in the special case where there is only one character.
1283 For negated classes, we build the map as usual, then invert it at the end.
1284 */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001285
1286 case '[':
Guido van Rossum50700601997-12-08 17:15:20 +00001287 previous = code;
1288 if (options & PCRE_LOCALE)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001289 {
Guido van Rossum50700601997-12-08 17:15:20 +00001290 *code++ = OP_CLASS_L;
1291 /* Set the flag for localized classes (like \w) to 0 */
1292 class_flag = code;
1293 *class_flag = 0;
1294 }
1295 else
1296 {
1297 *code++ = OP_CLASS;
1298 class_flag = NULL;
1299 }
1300
Guido van Rossum042ff9e1998-04-03 21:13:31 +00001301 /* If the first character is '^', set the negation flag, and use a
1302 different opcode. This only matters if caseless matching is specified at
1303 runtime. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001304
Guido van Rossum50700601997-12-08 17:15:20 +00001305 if ((c = *(++ptr)) == '^')
1306 {
1307 negate_class = TRUE;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00001308 if (*(code-1)==OP_CLASS) *(code-1) = OP_NEGCLASS;
Guido van Rossum50700601997-12-08 17:15:20 +00001309 c = *(++ptr);
1310 }
1311 else negate_class = FALSE;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001312
Guido van Rossum50700601997-12-08 17:15:20 +00001313 /* Keep a count of chars so that we can optimize the case of just a single
1314 character. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001315
Guido van Rossum50700601997-12-08 17:15:20 +00001316 class_charcount = 0;
1317 class_lastchar = -1;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001318
Guido van Rossum50700601997-12-08 17:15:20 +00001319 /* Initialize the 32-char bit map to all zeros. We have to build the
1320 map in a temporary bit of store, in case the class contains only 1
1321 character, because in that case the compiled code doesn't use the
1322 bit map. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001323
Guido van Rossum50700601997-12-08 17:15:20 +00001324 memset(class, 0, 32 * sizeof(uschar));
1325
1326 /* Process characters until ] is reached. By writing this as a "do" it
1327 means that an initial ] is taken as a data character. */
1328
1329 do
1330 {
1331 if (c == 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001332 {
Guido van Rossum50700601997-12-08 17:15:20 +00001333 *errorptr = ERR6;
1334 goto FAILED;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001335 }
1336
Guido van Rossum50700601997-12-08 17:15:20 +00001337 /* Backslash may introduce a single character, or it may introduce one
1338 of the specials, which just set a flag. Escaped items are checked for
1339 validity in the pre-compiling pass. The sequence \b is a special case.
Guido van Rossum58132c61997-12-17 00:24:13 +00001340 Inside a class (and only there) it is treated as backspace. Elsewhere
Guido van Rossum50700601997-12-08 17:15:20 +00001341 it marks a word boundary. Other escapes have preset maps ready to
1342 or into the one we are building. We assume they have more than one
1343 character in them, so set class_count bigger than one. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001344
Guido van Rossum50700601997-12-08 17:15:20 +00001345 if (c == '\\')
1346 {
1347 c = check_escape(&ptr, errorptr, *brackets, options, TRUE);
1348 if (-c == ESC_b) c = '\b';
1349 else if (c < 0)
1350 {
1351 class_charcount = 10;
1352 switch (-c)
1353 {
1354 case ESC_d:
Guido van Rossum50700601997-12-08 17:15:20 +00001355 {
1356 for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_digit];
1357 }
1358 continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001359
Guido van Rossum50700601997-12-08 17:15:20 +00001360 case ESC_D:
Guido van Rossum50700601997-12-08 17:15:20 +00001361 {
1362 for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_digit];
1363 }
1364 continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001365
Guido van Rossum50700601997-12-08 17:15:20 +00001366 case ESC_w:
1367 if (options & PCRE_LOCALE)
1368 {
1369 *class_flag |= 1;
1370 }
1371 else
1372 {
1373 for (c = 0; c < 32; c++)
1374 class[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
1375 }
1376 continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001377
Guido van Rossum50700601997-12-08 17:15:20 +00001378 case ESC_W:
1379 if (options & PCRE_LOCALE)
1380 {
1381 *class_flag |= 2;
1382 }
1383 else
1384 {
1385 for (c = 0; c < 32; c++)
1386 class[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
1387 }
1388 continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001389
Guido van Rossum50700601997-12-08 17:15:20 +00001390 case ESC_s:
Guido van Rossum50700601997-12-08 17:15:20 +00001391 {
1392 for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_space];
1393 }
1394 continue;
1395
1396 case ESC_S:
Guido van Rossum50700601997-12-08 17:15:20 +00001397 {
1398 for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_space];
1399 }
1400 continue;
1401
1402 default:
1403 *errorptr = ERR7;
1404 goto FAILED;
1405 }
1406 }
1407 /* Fall through if single character */
1408 }
1409
1410 /* A single character may be followed by '-' to form a range. However,
1411 Perl does not permit ']' to be the end of the range. A '-' character
1412 here is treated as a literal. */
1413
1414 if (ptr[1] == '-' && ptr[2] != ']')
1415 {
1416 int d;
1417 ptr += 2;
1418 d = *ptr;
1419
1420 if (d == 0)
1421 {
1422 *errorptr = ERR6;
1423 goto FAILED;
1424 }
1425
1426 /* The second part of a range can be a single-character escape, but
1427 not any of the other escapes. */
1428
1429 if (d == '\\')
1430 {
1431 d = check_escape(&ptr, errorptr, *brackets, options, TRUE);
1432 if (d < 0)
1433 {
1434 if (d == -ESC_b) d = '\b'; else
1435 {
1436 *errorptr = ERR7;
1437 goto FAILED;
1438 }
1439 }
1440 }
1441
1442 if (d < c)
1443 {
1444 *errorptr = ERR8;
1445 goto FAILED;
1446 }
1447
1448 for (; c <= d; c++)
1449 {
1450 class[c/8] |= (1 << (c&7));
1451 if ((options & PCRE_CASELESS) != 0)
1452 {
1453 int uc = pcre_fcc[c]; /* flip case */
1454 class[uc/8] |= (1 << (uc&7));
1455 }
1456 class_charcount++; /* in case a one-char range */
1457 class_lastchar = c;
1458 }
1459 continue; /* Go get the next char in the class */
1460 }
1461
1462 /* Handle a lone single character - we can get here for a normal
1463 non-escape char, or after \ that introduces a single character. */
1464
1465 class [c/8] |= (1 << (c&7));
1466 if ((options & PCRE_CASELESS) != 0)
1467 {
1468 c = pcre_fcc[c]; /* flip case */
1469 class[c/8] |= (1 << (c&7));
1470 }
1471 class_charcount++;
1472 class_lastchar = c;
1473 }
1474
1475 /* Loop until ']' reached; the check for end of string happens inside the
1476 loop. This "while" is the end of the "do" above. */
1477
1478 while ((c = *(++ptr)) != ']');
1479
1480 /* If class_charcount is 1 and class_lastchar is not negative, we saw
1481 precisely one character. This doesn't need the whole 32-byte bit map.
1482 We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
1483 it's negative. */
1484
1485 if (class_charcount == 1 && class_lastchar >= 0)
1486 {
1487 if (negate_class)
1488 {
1489 code[-1] = OP_NOT;
1490 }
1491 else
1492 {
1493 code[-1] = OP_CHARS;
1494 *code++ = 1;
1495 }
1496 *code++ = class_lastchar;
1497 }
1498
1499 /* Otherwise, negate the 32-byte map if necessary, and copy it into
1500 the code vector. */
1501
1502 else
1503 {
1504 /* If this is a localized opcode, bump the code pointer up */
1505 if (class_flag) code++;
1506 if (negate_class)
1507 {
1508 if (class_flag) *class_flag = (*class_flag) ^ 63;
1509 for (c = 0; c < 32; c++) code[c] = ~class[c];
1510 }
1511 else
1512 memcpy(code, class, 32);
1513 code += 32;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001514 }
1515 break;
1516
1517 /* Various kinds of repeat */
1518
1519 case '{':
Guido van Rossum50700601997-12-08 17:15:20 +00001520 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001521 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
1522 if (*errorptr != NULL) goto FAILED;
1523 goto REPEAT;
1524
1525 case '*':
1526 repeat_min = 0;
1527 repeat_max = -1;
1528 goto REPEAT;
1529
1530 case '+':
1531 repeat_min = 1;
1532 repeat_max = -1;
1533 goto REPEAT;
1534
1535 case '?':
1536 repeat_min = 0;
1537 repeat_max = 1;
1538
1539 REPEAT:
1540 if (previous == NULL)
1541 {
Guido van Rossum50700601997-12-08 17:15:20 +00001542 *errorptr = ERR9;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001543 goto FAILED;
1544 }
1545
Guido van Rossumdda66961998-05-07 15:32:44 +00001546 /* If the next character is '?' this is a minimizing repeat, by default,
1547 but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001548 next character. */
1549
Guido van Rossumdda66961998-05-07 15:32:44 +00001550 if (ptr[1] == '?')
1551 { repeat_type = greedy_non_default; ptr++; }
1552 else repeat_type = greedy_default;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001553
Guido van Rossum50700601997-12-08 17:15:20 +00001554 /* If the maximum is zero then the minimum must also be zero; Perl allows
1555 this case, so we do too - by simply omitting the item altogether. */
1556
1557 if (repeat_max == 0) code = previous;
1558
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001559 /* If previous was a string of characters, chop off the last one and use it
1560 as the subject of the repeat. If there was only one character, we can
1561 abolish the previous item altogether. */
1562
Guido van Rossum50700601997-12-08 17:15:20 +00001563 else if (*previous == OP_CHARS)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001564 {
1565 int len = previous[1];
1566 if (len == 1)
1567 {
1568 c = previous[2];
1569 code = previous;
1570 }
1571 else
1572 {
1573 c = previous[len+1];
1574 previous[1]--;
1575 code--;
1576 }
1577 op_type = 0; /* Use single-char op codes */
1578 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
1579 }
1580
Guido van Rossum50700601997-12-08 17:15:20 +00001581 /* If previous was a single negated character ([^a] or similar), we use
1582 one of the special opcodes, replacing it. The code is shared with single-
1583 character repeats by adding a suitable offset into repeat_type. */
1584
1585 else if ((int)*previous == OP_NOT)
1586 {
1587 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
1588 c = previous[1];
1589 code = previous;
1590 goto OUTPUT_SINGLE_REPEAT;
1591 }
1592
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001593 /* If previous was a character type match (\d or similar), abolish it and
1594 create a suitable repeat item. The code is shared with single-character
1595 repeats by adding a suitable offset into repeat_type. */
1596
Guido van Rossum50700601997-12-08 17:15:20 +00001597 else if ((int)*previous < OP_CIRC || *previous == OP_ANY)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001598 {
1599 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
1600 c = *previous;
1601 code = previous;
1602
1603 OUTPUT_SINGLE_REPEAT:
1604 repeat_type += op_type; /* Combine both values for many cases */
1605
1606 /* A minimum of zero is handled either as the special case * or ?, or as
1607 an UPTO, with the maximum given. */
1608
1609 if (repeat_min == 0)
1610 {
1611 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
1612 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
1613 else
1614 {
1615 *code++ = OP_UPTO + repeat_type;
1616 *code++ = repeat_max >> 8;
1617 *code++ = (repeat_max & 255);
1618 }
1619 }
1620
1621 /* The case {1,} is handled as the special case + */
1622
1623 else if (repeat_min == 1 && repeat_max == -1)
1624 *code++ = OP_PLUS + repeat_type;
1625
1626 /* The case {n,n} is just an EXACT, while the general case {n,m} is
1627 handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
1628
1629 else
1630 {
1631 if (repeat_min != 1)
1632 {
1633 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
1634 *code++ = repeat_min >> 8;
1635 *code++ = (repeat_min & 255);
1636 }
1637
Thomas Wouters7e474022000-07-16 12:04:32 +00001638 /* If the minimum is 1 and the previous item was a character string,
1639 we either have to put back the item that got canceled if the string
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001640 length was 1, or add the character back onto the end of a longer
Guido van Rossumdda66961998-05-07 15:32:44 +00001641 string. For a character type nothing need be done; it will just get
1642 put back naturally. Note that the final character is always going to
1643 get added below. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001644
1645 else if (*previous == OP_CHARS)
1646 {
1647 if (code == previous) code += 2; else previous[1]++;
1648 }
1649
Guido van Rossumdda66961998-05-07 15:32:44 +00001650 /* For a single negated character we also have to put back the
Thomas Wouters7e474022000-07-16 12:04:32 +00001651 item that got canceled. */
Guido van Rossumdda66961998-05-07 15:32:44 +00001652
1653 else if (*previous == OP_NOT) code++;
1654
Guido van Rossum557dea11997-12-22 22:46:52 +00001655 /* If the maximum is unlimited, insert an OP_STAR. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001656
Guido van Rossum557dea11997-12-22 22:46:52 +00001657 if (repeat_max < 0)
1658 {
1659 *code++ = c;
1660 *code++ = OP_STAR + repeat_type;
1661 }
1662
1663 /* Else insert an UPTO if the max is greater than the min. */
1664
1665 else if (repeat_max != repeat_min)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001666 {
1667 *code++ = c;
1668 repeat_max -= repeat_min;
1669 *code++ = OP_UPTO + repeat_type;
1670 *code++ = repeat_max >> 8;
1671 *code++ = (repeat_max & 255);
1672 }
1673 }
1674
1675 /* The character or character type itself comes last in all cases. */
1676
1677 *code++ = c;
1678 }
1679
1680 /* If previous was a character class or a back reference, we put the repeat
1681 stuff after it. */
1682
Guido van Rossum042ff9e1998-04-03 21:13:31 +00001683 else if (*previous == OP_CLASS || *previous == OP_NEGCLASS ||
1684 *previous==OP_CLASS_L || *previous == OP_REF)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001685 {
1686 if (repeat_min == 0 && repeat_max == -1)
1687 *code++ = OP_CRSTAR + repeat_type;
1688 else if (repeat_min == 1 && repeat_max == -1)
1689 *code++ = OP_CRPLUS + repeat_type;
1690 else if (repeat_min == 0 && repeat_max == 1)
1691 *code++ = OP_CRQUERY + repeat_type;
1692 else
1693 {
1694 *code++ = OP_CRRANGE + repeat_type;
1695 *code++ = repeat_min >> 8;
1696 *code++ = repeat_min & 255;
1697 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
1698 *code++ = repeat_max >> 8;
1699 *code++ = repeat_max & 255;
1700 }
1701 }
1702
1703 /* If previous was a bracket group, we may have to replicate it in certain
1704 cases. If the maximum repeat count is unlimited, check that the bracket
1705 group cannot match the empty string, and diagnose an error if it can. */
1706
1707 else if ((int)*previous >= OP_BRA)
1708 {
1709 int i;
Guido van Rossum557dea11997-12-22 22:46:52 +00001710 int len = code - previous;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001711
1712 if (repeat_max == -1 && could_be_empty(previous))
1713 {
Guido van Rossum50700601997-12-08 17:15:20 +00001714 *errorptr = ERR10;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001715 goto FAILED;
1716 }
1717
1718 /* If the minimum is greater than zero, and the maximum is unlimited or
1719 equal to the minimum, the first copy remains where it is, and is
1720 replicated up to the minimum number of times. This case includes the +
1721 repeat, but of course no replication is needed in that case. */
1722
1723 if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))
1724 {
1725 for (i = 1; i < repeat_min; i++)
1726 {
Guido van Rossum557dea11997-12-22 22:46:52 +00001727 memcpy(code, previous, len);
1728 code += len;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001729 }
1730 }
1731
1732 /* If the minimum is zero, stick BRAZERO in front of the first copy.
1733 Then, if there is a fixed upper limit, replicated up to that many times,
1734 sticking BRAZERO in front of all the optional ones. */
1735
1736 else
1737 {
1738 if (repeat_min == 0)
1739 {
Guido van Rossum557dea11997-12-22 22:46:52 +00001740 memmove(previous+1, previous, len);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001741 code++;
1742 *previous++ = OP_BRAZERO + repeat_type;
1743 }
1744
1745 for (i = 1; i < repeat_min; i++)
1746 {
Guido van Rossum557dea11997-12-22 22:46:52 +00001747 memcpy(code, previous, len);
1748 code += len;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001749 }
1750
1751 for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)
1752 {
1753 *code++ = OP_BRAZERO + repeat_type;
Guido van Rossum557dea11997-12-22 22:46:52 +00001754 memcpy(code, previous, len);
1755 code += len;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001756 }
1757 }
1758
1759 /* If the maximum is unlimited, set a repeater in the final copy. */
1760
1761 if (repeat_max == -1) code[-3] = OP_KETRMAX + repeat_type;
1762 }
1763
1764 /* Else there's some kind of shambles */
1765
1766 else
1767 {
Guido van Rossum50700601997-12-08 17:15:20 +00001768 *errorptr = ERR11;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001769 goto FAILED;
1770 }
1771
1772 /* In all case we no longer have a previous item. */
1773
1774 previous = NULL;
1775 break;
1776
1777
1778 /* Start of nested bracket sub-expression, or comment or lookahead.
1779 First deal with special things that can come after a bracket; all are
1780 introduced by ?, and the appearance of any of them means that this is not a
1781 referencing group. They were checked for validity in the first pass over
1782 the string, so we don't have to check for syntax errors here. */
1783
1784 case '(':
1785 previous = code; /* Only real brackets can be repeated */
1786 if (*(++ptr) == '?')
1787 {
1788 bravalue = OP_BRA;
1789
1790 switch (*(++ptr))
1791 {
1792 case '#':
1793 case 'i':
Guido van Rossumbd49ac41997-12-10 23:05:53 +00001794 case 'L':
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001795 case 'm':
1796 case 's':
1797 case 'x':
1798 ptr++;
1799 while (*ptr != ')') ptr++;
1800 previous = NULL;
1801 continue;
1802
1803 case ':': /* Non-extracting bracket */
1804 ptr++;
1805 break;
1806
1807 case '=': /* Assertions can't be repeated */
1808 bravalue = OP_ASSERT;
1809 ptr++;
1810 previous = NULL;
1811 break;
1812
1813 case '!':
1814 bravalue = OP_ASSERT_NOT;
1815 ptr++;
1816 previous = NULL;
1817 break;
1818
1819 case ('P'):
1820 ptr++;
1821 if (*ptr=='<')
1822 {
1823 /* (?P<groupname>...) */
1824 int idlen;
1825 PyObject *string, *intobj;
1826
1827 ptr++;
1828 idlen = get_group_id(ptr, '>', errorptr);
1829 if (*errorptr) {
1830 goto FAILED;
1831 }
Guido van Rossum57ba4f31997-12-02 20:40:28 +00001832 string = PyString_FromStringAndSize((char*)ptr, idlen);
Guido van Rossum50700601997-12-08 17:15:20 +00001833 intobj = PyInt_FromLong( brackets[0] + 1 );
Guido van Rossum58132c61997-12-17 00:24:13 +00001834 if (intobj == NULL || string == NULL)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001835 {
1836 Py_XDECREF(string);
1837 Py_XDECREF(intobj);
1838 *errorptr = "exception raised";
1839 goto FAILED;
1840 }
1841 PyDict_SetItem(dictionary, string, intobj);
Guido van Rossum58132c61997-12-17 00:24:13 +00001842 Py_DECREF(string); Py_DECREF(intobj); /* XXX DECREF commented out! */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001843 ptr += idlen+1; /* Point to rest of expression */
1844 goto do_grouping_bracket;
1845 }
1846 if (*ptr=='=')
1847 {
1848 /* (?P=groupname) */
1849 int idlen, refnum;
1850 PyObject *string, *intobj;
1851
1852 ptr++;
1853 idlen = get_group_id(ptr, ')', errorptr);
1854 if (*errorptr) {
1855 goto FAILED;
1856 }
Guido van Rossum50700601997-12-08 17:15:20 +00001857 string = PyString_FromStringAndSize((char *)ptr, idlen);
Guido van Rossumc3861071997-10-08 02:07:40 +00001858 if (string==NULL) {
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001859 *errorptr = "exception raised";
1860 goto FAILED;
1861 }
1862 intobj = PyDict_GetItem(dictionary, string);
1863 if (intobj==NULL) {
Guido van Rossumc3861071997-10-08 02:07:40 +00001864 Py_DECREF(string);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001865 *errorptr = "?P= group identifier isn't defined";
1866 goto FAILED;
1867 }
1868
1869 refnum = PyInt_AsLong(intobj);
Guido van Rossum1eadb411997-12-15 17:33:24 +00001870 Py_DECREF(string);
Guido van Rossum58132c61997-12-17 00:24:13 +00001871 /* The caller doesn't own the reference to the value
1872 returned from PyDict_GetItem, so intobj is not
1873 DECREF'ed. */
1874
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001875 *code++ = OP_REF;
1876 *code++ = refnum;
1877 /* The continue will cause the top-level for() loop to
1878 be resumed, so ptr will be immediately incremented.
1879 Therefore, the following line adds just idlen, not
1880 idlen+1 */
1881 ptr += idlen;
1882 continue;
1883 }
1884 /* The character after ?P is neither < nor =, so
1885 report an error. Add more Python-extensions here. */
1886 *errorptr="unknown after (?P";
1887 goto FAILED;
Guido van Rossum50700601997-12-08 17:15:20 +00001888
1889 case '>': /* "Match once" brackets */
1890 if ((options & PCRE_EXTRA) != 0) /* Not yet standard */
1891 {
1892 bravalue = OP_ONCE;
1893 ptr++;
1894 previous = NULL;
1895 break;
1896 }
1897 /* Else fall through */
1898
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001899 default:
Guido van Rossum50700601997-12-08 17:15:20 +00001900 *errorptr = ERR12;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001901 goto FAILED;
1902 }
1903 }
1904
1905 /* Else we have a referencing group */
1906
1907 else
1908 {
1909 do_grouping_bracket:
Guido van Rossum50700601997-12-08 17:15:20 +00001910 if (++(*brackets) > EXTRACT_MAX)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001911 {
Guido van Rossum50700601997-12-08 17:15:20 +00001912 *errorptr = ERR13;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001913 goto FAILED;
1914 }
Guido van Rossum50700601997-12-08 17:15:20 +00001915 bravalue = OP_BRA + *brackets;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001916 }
1917
1918 /* Process nested bracketed re; at end pointer is on the bracket. We copy
1919 code into a non-register variable in order to be able to pass its address
1920 because some compilers complain otherwise. */
1921
1922 *code = bravalue;
1923 {
1924 uschar *mcode = code;
Guido van Rossum50700601997-12-08 17:15:20 +00001925 if (!compile_regex(options, brackets, &mcode, &ptr, errorptr, dictionary))
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001926 goto FAILED;
1927 code = mcode;
1928 }
1929
1930 if (*ptr != ')')
1931 {
Guido van Rossum50700601997-12-08 17:15:20 +00001932 *errorptr = ERR14;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001933 goto FAILED;
1934 }
1935 break;
1936
1937 /* Check \ for being a real metacharacter; if not, fall through and handle
1938 it as a data character at the start of a string. Escape items are checked
1939 for validity in the pre-compiling pass. */
1940
1941 case '\\':
1942 oldptr = ptr;
Guido van Rossum50700601997-12-08 17:15:20 +00001943 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001944
1945 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1946 are arranged to be the negation of the corresponding OP_values. For the
1947 back references, the values are ESC_REF plus the reference number. Only
1948 back references and those types that consume a character may be repeated.
1949 We can test for values between ESC_b and ESC_Z for the latter; this may
1950 have to change if any new ones are ever created. */
1951
1952 if (c < 0)
1953 {
1954 if (-c >= ESC_REF)
1955 {
Guido van Rossum50700601997-12-08 17:15:20 +00001956 int refnum = -c - ESC_REF;
1957 if (*brackets < refnum)
1958 {
1959 *errorptr = ERR15;
1960 goto FAILED;
1961 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001962 previous = code;
1963 *code++ = OP_REF;
1964 *code++ = refnum;
1965 }
1966 else
1967 {
Guido van Rossum50700601997-12-08 17:15:20 +00001968 previous = (-c > ESC_b && -c < ESC_X)? code : NULL;
1969 if ( (options & PCRE_LOCALE) != 0)
1970 {
1971 switch (c)
1972 {
1973 case (-ESC_b): c = -OP_WORD_BOUNDARY_L; break;
1974 case (-ESC_B): c = -OP_NOT_WORD_BOUNDARY_L; break;
1975 case (-ESC_w): c = -OP_WORDCHAR_L; break;
1976 case (-ESC_W): c = -OP_NOT_WORDCHAR_L; break;
1977 }
1978 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001979 *code++ = -c;
1980 }
1981 continue;
1982 }
1983
Guido van Rossum58132c61997-12-17 00:24:13 +00001984 /* Data character: Reset and fall through */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001985
1986 ptr = oldptr;
1987 c = '\\';
1988
1989 /* Handle a run of data characters until a metacharacter is encountered.
1990 The first character is guaranteed not to be whitespace or # when the
1991 extended flag is set. */
1992
Guido van Rossum50700601997-12-08 17:15:20 +00001993 NORMAL_CHAR:
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001994 default:
1995 previous = code;
1996 *code = OP_CHARS;
1997 code += 2;
1998 length = 0;
1999
2000 do
2001 {
Guido van Rossum50700601997-12-08 17:15:20 +00002002 if ((options & PCRE_EXTENDED) != 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002003 {
2004 if ((pcre_ctypes[c] & ctype_space) != 0) continue;
2005 if (c == '#')
2006 {
2007 while ((c = *(++ptr)) != 0 && c != '\n');
2008 if (c == 0) break;
2009 continue;
2010 }
2011 }
2012
2013 /* Backslash may introduce a data char or a metacharacter. Escaped items
2014 are checked for validity in the pre-compiling pass. Stop the string
2015 before a metaitem. */
2016
2017 if (c == '\\')
2018 {
2019 oldptr = ptr;
Guido van Rossum50700601997-12-08 17:15:20 +00002020 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002021 if (c < 0) { ptr = oldptr; break; }
2022 }
2023
2024 /* Ordinary character or single-char escape */
2025
2026 *code++ = c;
2027 length++;
2028 }
2029
2030 /* This "while" is the end of the "do" above. */
2031
2032 while (length < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
2033
2034 /* Compute the length and set it in the data vector, and advance to
2035 the next state. */
2036
2037 previous[1] = length;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00002038 if (length < 255) ptr--;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002039 break;
2040 }
2041 } /* end of big loop */
2042
2043/* Control never reaches here by falling through, only by a goto for all the
2044error states. Pass back the position in the pattern so that it can be displayed
2045to the user for diagnosing the error. */
2046
2047FAILED:
2048*ptrptr = ptr;
2049return FALSE;
2050}
2051
2052
2053
2054
2055/*************************************************
2056* Compile sequence of alternatives *
2057*************************************************/
2058
2059/* On entry, ptr is pointing past the bracket character, but on return
2060it points to the closing bracket, or vertical bar, or end of string.
2061The code variable is pointing at the byte into which the BRA operator has been
2062stored.
2063
2064Argument:
Guido van Rossum50700601997-12-08 17:15:20 +00002065 options the option bits
2066 brackets -> int containing the number of extracting brackets used
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002067 codeptr -> the address of the current code pointer
2068 ptrptr -> the address of the current pattern pointer
2069 errorptr -> pointer to error message
2070
2071Returns: TRUE on success
2072*/
2073
2074static BOOL
Guido van Rossum50700601997-12-08 17:15:20 +00002075compile_regex(int options, int *brackets, uschar **codeptr,
Guido van Rossum58132c61997-12-17 00:24:13 +00002076 const uschar **ptrptr, const char **errorptr, PyObject *dictionary)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002077{
Guido van Rossum58132c61997-12-17 00:24:13 +00002078const uschar *ptr = *ptrptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002079uschar *code = *codeptr;
2080uschar *start_bracket = code;
2081
2082for (;;)
2083 {
2084 int length;
2085 uschar *last_branch = code;
2086
2087 code += 3;
Guido van Rossum50700601997-12-08 17:15:20 +00002088 if (!compile_branch(options, brackets, &code, &ptr, errorptr, dictionary))
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002089 {
2090 *ptrptr = ptr;
2091 return FALSE;
2092 }
2093
2094 /* Fill in the length of the last branch */
2095
2096 length = code - last_branch;
2097 last_branch[1] = length >> 8;
2098 last_branch[2] = length & 255;
2099
2100 /* Reached end of expression, either ')' or end of pattern. Insert a
2101 terminating ket and the length of the whole bracketed item, and return,
2102 leaving the pointer at the terminating char. */
2103
2104 if (*ptr != '|')
2105 {
2106 length = code - start_bracket;
2107 *code++ = OP_KET;
2108 *code++ = length >> 8;
2109 *code++ = length & 255;
2110 *codeptr = code;
2111 *ptrptr = ptr;
2112 return TRUE;
2113 }
2114
2115 /* Another branch follows; insert an "or" node and advance the pointer. */
2116
2117 *code = OP_ALT;
2118 ptr++;
2119 }
2120/* Control never reaches here */
2121}
2122
2123
2124
2125/*************************************************
2126* Check for anchored expression *
2127*************************************************/
2128
2129/* Try to find out if this is an anchored regular expression. Consider each
2130alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
2131all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
2132it's anchored. However, if this is a multiline pattern, then only OP_SOD
2133counts, since OP_CIRC can match in the middle.
2134
2135A branch is also implicitly anchored if it starts with .* because that will try
2136the rest of the pattern at all possible matching points, so there is no point
2137trying them again.
2138
2139Argument: points to start of expression (the bracket)
2140Returns: TRUE or FALSE
2141*/
2142
2143static BOOL
Guido van Rossum58132c61997-12-17 00:24:13 +00002144is_anchored(register const uschar *code, BOOL multiline)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002145{
2146do {
2147 int op = (int)code[3];
Guido van Rossum50700601997-12-08 17:15:20 +00002148 if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002149 { if (!is_anchored(code+3, multiline)) return FALSE; }
2150 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
2151 { if (code[4] != OP_ANY) return FALSE; }
2152 else if (op != OP_SOD && (multiline || op != OP_CIRC)) return FALSE;
2153 code += (code[1] << 8) + code[2];
2154 }
2155while (*code == OP_ALT);
2156return TRUE;
2157}
2158
2159
2160
2161/*************************************************
2162* Check for start with \n line expression *
2163*************************************************/
2164
2165/* This is called for multiline expressions to try to find out if every branch
2166starts with ^ so that "first char" processing can be done to speed things up.
2167
2168Argument: points to start of expression (the bracket)
2169Returns: TRUE or FALSE
2170*/
2171
2172static BOOL
Guido van Rossum58132c61997-12-17 00:24:13 +00002173is_startline(const uschar *code)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002174{
2175do {
2176 if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
2177 { if (!is_startline(code+3)) return FALSE; }
2178 else if (code[3] != OP_CIRC) return FALSE;
2179 code += (code[1] << 8) + code[2];
2180 }
2181while (*code == OP_ALT);
2182return TRUE;
2183}
2184
2185
2186
2187/*************************************************
2188* Check for fixed first char *
2189*************************************************/
2190
2191/* Try to find out if there is a fixed first character. This is called for
2192unanchored expressions, as it speeds up their processing quite considerably.
2193Consider each alternative branch. If they all start with the same char, or with
2194a bracket all of whose alternatives start with the same char (recurse ad lib),
2195then we return that char, otherwise -1.
2196
2197Argument: points to start of expression (the bracket)
2198Returns: -1 or the fixed first char
2199*/
2200
2201static int
2202find_firstchar(uschar *code)
2203{
2204register int c = -1;
2205do
2206 {
2207 register int charoffset = 4;
2208
2209 if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
2210 {
2211 register int d;
2212 if ((d = find_firstchar(code+3)) < 0) return -1;
2213 if (c < 0) c = d; else if (c != d) return -1;
2214 }
2215
2216 else switch(code[3])
2217 {
2218 default:
2219 return -1;
2220
2221 case OP_EXACT: /* Fall through */
2222 charoffset++;
2223
2224 case OP_CHARS: /* Fall through */
2225 charoffset++;
2226
2227 case OP_PLUS:
2228 case OP_MINPLUS:
2229 if (c < 0) c = code[charoffset]; else if (c != code[charoffset]) return -1;
2230 break;
2231 }
2232 code += (code[1] << 8) + code[2];
2233 }
2234while (*code == OP_ALT);
2235return c;
2236}
2237
2238
2239
2240/*************************************************
2241* Compile a Regular Expression *
2242*************************************************/
2243
2244/* This function takes a string and returns a pointer to a block of store
2245holding a compiled version of the expression.
2246
2247Arguments:
2248 pattern the regular expression
2249 options various option bits
2250 errorptr pointer to pointer to error text
2251 erroroffset ptr offset in pattern where error was detected
2252
2253Returns: pointer to compiled data block, or NULL on error,
2254 with errorptr and erroroffset set
2255*/
2256
2257pcre *
Guido van Rossum58132c61997-12-17 00:24:13 +00002258pcre_compile(const char *pattern, int options, const char **errorptr,
Guido van Rossum50700601997-12-08 17:15:20 +00002259 int *erroroffset, PyObject *dictionary)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002260{
2261real_pcre *re;
2262int spaces = 0;
2263int length = 3; /* For initial BRA plus length */
2264int runlength;
2265int c, size;
Guido van Rossum50700601997-12-08 17:15:20 +00002266int bracount = 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002267int brastack[200];
Guido van Rossum50700601997-12-08 17:15:20 +00002268int top_backref = 0;
Guido van Rossum58132c61997-12-17 00:24:13 +00002269unsigned int brastackptr = 0;
2270uschar *code;
2271const uschar *ptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002272
2273#ifdef DEBUG
2274uschar *code_base, *code_end;
2275#endif
2276
Guido van Rossum50700601997-12-08 17:15:20 +00002277/* We can't pass back an error message if errorptr is NULL; I guess the best we
2278can do is just return NULL. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002279
Guido van Rossum50700601997-12-08 17:15:20 +00002280if (errorptr == NULL) return NULL;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002281*errorptr = NULL;
Guido van Rossum50700601997-12-08 17:15:20 +00002282
2283/* However, we can give a message for this error */
2284
2285if (erroroffset == NULL)
2286 {
2287 *errorptr = ERR16;
2288 return NULL;
2289 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002290*erroroffset = 0;
2291
2292if ((options & ~PUBLIC_OPTIONS) != 0)
2293 {
Guido van Rossum50700601997-12-08 17:15:20 +00002294 *errorptr = ERR17;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002295 return NULL;
2296 }
2297
Guido van Rossum557dea11997-12-22 22:46:52 +00002298DPRINTF(("------------------------------------------------------------------\n"));
2299DPRINTF(("%s\n", pattern));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002300
2301/* The first thing to do is to make a pass over the pattern to compute the
2302amount of store required to hold the compiled code. This does not have to be
2303perfect as long as errors are overestimates. At the same time we can detect any
2304internal flag settings. Make an attempt to correct for any counted white space
2305if an "extended" flag setting appears late in the pattern. We can't be so
2306clever for #-comments. */
2307
Guido van Rossum58132c61997-12-17 00:24:13 +00002308ptr = (const uschar *)(pattern - 1);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002309while ((c = *(++ptr)) != 0)
2310 {
Guido van Rossum50700601997-12-08 17:15:20 +00002311 int min, max;
2312 int class_charcount;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002313
2314 if ((pcre_ctypes[c] & ctype_space) != 0)
2315 {
Guido van Rossum50700601997-12-08 17:15:20 +00002316 if ((options & PCRE_EXTENDED) != 0) continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002317 spaces++;
2318 }
2319
Guido van Rossum50700601997-12-08 17:15:20 +00002320 if (c == '#' && (options & PCRE_EXTENDED) != 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002321 {
2322 while ((c = *(++ptr)) != 0 && c != '\n');
2323 continue;
2324 }
2325
2326 switch(c)
2327 {
2328 /* A backslashed item may be an escaped "normal" character or a
2329 character type. For a "normal" character, put the pointers and
2330 character back so that tests for whitespace etc. in the input
2331 are done correctly. */
2332
2333 case '\\':
2334 {
Guido van Rossum58132c61997-12-17 00:24:13 +00002335 const uschar *save_ptr = ptr;
Guido van Rossum50700601997-12-08 17:15:20 +00002336 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002337 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2338 if (c >= 0)
2339 {
2340 ptr = save_ptr;
2341 c = '\\';
2342 goto NORMAL_CHAR;
2343 }
2344 }
2345 length++;
2346
2347 /* A back reference needs an additional char, plus either one or 5
Guido van Rossum50700601997-12-08 17:15:20 +00002348 bytes for a repeat. We also need to keep the value of the highest
2349 back reference. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002350
2351 if (c <= -ESC_REF)
2352 {
Guido van Rossum50700601997-12-08 17:15:20 +00002353 int refnum = -c - ESC_REF;
2354 if (refnum > top_backref) top_backref = refnum;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002355 length++; /* For single back reference */
Guido van Rossum50700601997-12-08 17:15:20 +00002356 if (ptr[1] == '{' && is_counted_repeat(ptr+2))
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002357 {
2358 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
2359 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2360 if ((min == 0 && (max == 1 || max == -1)) ||
2361 (min == 1 && max == -1))
2362 length++;
2363 else length += 5;
2364 if (ptr[1] == '?') ptr++;
2365 }
2366 }
2367 continue;
2368
2369 case '^':
2370 case '.':
2371 case '$':
2372 case '*': /* These repeats won't be after brackets; */
2373 case '+': /* those are handled separately */
2374 case '?':
2375 length++;
2376 continue;
2377
2378 /* This covers the cases of repeats after a single char, metachar, class,
2379 or back reference. */
2380
2381 case '{':
Guido van Rossum50700601997-12-08 17:15:20 +00002382 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002383 ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);
2384 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2385 if ((min == 0 && (max == 1 || max == -1)) ||
2386 (min == 1 && max == -1))
2387 length++;
2388 else
2389 {
2390 length--; /* Uncount the original char or metachar */
2391 if (min == 1) length++; else if (min > 0) length += 4;
2392 if (max > 0) length += 4; else length += 2;
2393 }
2394 if (ptr[1] == '?') ptr++;
2395 continue;
2396
2397 /* An alternation contains an offset to the next branch or ket. */
2398 case '|':
2399 length += 3;
2400 continue;
2401
Guido van Rossum50700601997-12-08 17:15:20 +00002402 /* A character class uses 33 characters. Don't worry about character types
2403 that aren't allowed in classes - they'll get picked up during the compile.
2404 A character class that contains only one character uses 2 or 3 bytes,
2405 depending on whether it is negated or not. Notice this where we can. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002406
2407 case '[':
Guido van Rossum50700601997-12-08 17:15:20 +00002408 class_charcount = 0;
2409 if (*(++ptr) == '^') ptr++;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002410 do
2411 {
Guido van Rossum50700601997-12-08 17:15:20 +00002412 if (*ptr == '\\')
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002413 {
Guido van Rossum557dea11997-12-22 22:46:52 +00002414 int ch = check_escape(&ptr, errorptr, bracount, options, TRUE);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002415 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
Guido van Rossum557dea11997-12-22 22:46:52 +00002416 if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002417 }
Guido van Rossum50700601997-12-08 17:15:20 +00002418 else class_charcount++;
2419 ptr++;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002420 }
2421 while (*ptr != 0 && *ptr != ']');
2422
Guido van Rossum50700601997-12-08 17:15:20 +00002423 /* Repeats for negated single chars are handled by the general code */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002424
Guido van Rossum50700601997-12-08 17:15:20 +00002425 if (class_charcount == 1) length += 3; else
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002426 {
Guido van Rossum50700601997-12-08 17:15:20 +00002427 length += 33;
2428 if (options & PCRE_LOCALE) length++; /* Add a byte for the localization flag */
2429
2430 /* A repeat needs either 1 or 5 bytes. */
2431
Guido van Rossum557dea11997-12-22 22:46:52 +00002432 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))
Guido van Rossum50700601997-12-08 17:15:20 +00002433 {
2434 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
2435 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2436 if ((min == 0 && (max == 1 || max == -1)) ||
2437 (min == 1 && max == -1))
2438 length++;
2439 else length += 5;
2440 if (ptr[1] == '?') ptr++;
2441 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002442 }
2443 continue;
2444
2445 /* Brackets may be genuine groups or special things */
2446
2447 case '(':
2448
2449 /* Handle special forms of bracket, which all start (? */
2450
2451 if (ptr[1] == '?') switch (c = ptr[2])
2452 {
2453 /* Skip over comments entirely */
2454 case '#':
2455 ptr += 3;
2456 while (*ptr != 0 && *ptr != ')') ptr++;
2457 if (*ptr == 0)
2458 {
Guido van Rossum50700601997-12-08 17:15:20 +00002459 *errorptr = ERR18;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002460 goto PCRE_ERROR_RETURN;
2461 }
2462 continue;
2463
2464 /* Non-referencing groups and lookaheads just move the pointer on, and
Guido van Rossum50700601997-12-08 17:15:20 +00002465 then behave like a non-special bracket, except that they don't increment
2466 the count of extracting brackets. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002467
2468 case ':':
2469 case '=':
2470 case '!':
2471 ptr += 2;
2472 break;
2473
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002474 case ('P'):
2475 {
2476 int idlen;
2477 switch (*ptr++) {
2478 case ('<'):
2479 idlen = get_group_id(ptr++, '>', errorptr);
2480 if (*errorptr) goto PCRE_ERROR_RETURN;
2481 ptr += idlen+1;
2482 break;
2483 case ('='):
2484 idlen = get_group_id(ptr++, ')', errorptr);
2485 if (*errorptr) goto PCRE_ERROR_RETURN;
2486 ptr += idlen+1;
2487 length++;
2488 break;
2489 }
2490 }
2491 break;
2492
Guido van Rossum50700601997-12-08 17:15:20 +00002493 /* Ditto for the "once only" bracket, allowed only if the extra bit
2494 is set. */
2495
2496 case '>':
2497 if ((options & PCRE_EXTRA) != 0)
2498 {
2499 ptr += 2;
2500 break;
2501 }
Guido van Rossumdda66961998-05-07 15:32:44 +00002502 /* Else fall through */
Guido van Rossum50700601997-12-08 17:15:20 +00002503
2504 /* Else loop setting valid options until ) is met. Anything else is an
2505 error. */
2506
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002507 default:
2508 ptr += 2;
2509 for (;; ptr++)
2510 {
2511 if ((c = *ptr) == 'i')
2512 {
2513 options |= PCRE_CASELESS;
2514 continue;
2515 }
Guido van Rossumbd49ac41997-12-10 23:05:53 +00002516 else if ((c = *ptr) == 'L')
Guido van Rossum50700601997-12-08 17:15:20 +00002517 {
2518 options |= PCRE_LOCALE;
2519 continue;
2520 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002521 else if ((c = *ptr) == 'm')
2522 {
2523 options |= PCRE_MULTILINE;
2524 continue;
2525 }
Guido van Rossum50700601997-12-08 17:15:20 +00002526 else if (c == 's')
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002527 {
2528 options |= PCRE_DOTALL;
2529 continue;
2530 }
2531 else if (c == 'x')
2532 {
2533 options |= PCRE_EXTENDED;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002534 length -= spaces; /* Already counted spaces */
2535 continue;
2536 }
2537 else if (c == ')') break;
2538
Guido van Rossum50700601997-12-08 17:15:20 +00002539 *errorptr = ERR12;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002540 goto PCRE_ERROR_RETURN;
2541 }
2542 continue; /* End of this bracket handling */
2543 }
2544
Guido van Rossum50700601997-12-08 17:15:20 +00002545 /* Extracting brackets must be counted so we can process escapes in a
2546 Perlish way. */
2547
2548 else bracount++;
2549
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002550 /* Non-special forms of bracket. Save length for computing whole length
2551 at end if there's a repeat that requires duplication of the group. */
2552
2553 if (brastackptr >= sizeof(brastack)/sizeof(int))
2554 {
Guido van Rossum50700601997-12-08 17:15:20 +00002555 *errorptr = ERR19;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002556 goto PCRE_ERROR_RETURN;
2557 }
2558
2559 brastack[brastackptr++] = length;
2560 length += 3;
2561 continue;
2562
2563 /* Handle ket. Look for subsequent max/min; for certain sets of values we
Guido van Rossum557dea11997-12-22 22:46:52 +00002564 have to replicate this bracket up to that many times. If brastackptr is
2565 0 this is an unmatched bracket which will generate an error, but take care
2566 not to try to access brastack[-1]. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002567
2568 case ')':
2569 length += 3;
2570 {
Guido van Rossum557dea11997-12-22 22:46:52 +00002571 int minval = 1;
2572 int maxval = 1;
2573 int duplength = (brastackptr > 0)? length - brastack[--brastackptr] : 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002574
2575 /* Leave ptr at the final char; for read_repeat_counts this happens
2576 automatically; for the others we need an increment. */
2577
Guido van Rossum50700601997-12-08 17:15:20 +00002578 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002579 {
Guido van Rossum557dea11997-12-22 22:46:52 +00002580 ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002581 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2582 }
Guido van Rossum557dea11997-12-22 22:46:52 +00002583 else if (c == '*') { minval = 0; maxval = -1; ptr++; }
2584 else if (c == '+') { maxval = -1; ptr++; }
2585 else if (c == '?') { minval = 0; ptr++; }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002586
Guido van Rossum557dea11997-12-22 22:46:52 +00002587 /* If there is a minimum > 1 we have to replicate up to minval-1 times;
2588 if there is a limited maximum we have to replicate up to maxval-1 times
2589 and allow for a BRAZERO item before each optional copy, as we also have
2590 to do before the first copy if the minimum is zero. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002591
Guido van Rossum557dea11997-12-22 22:46:52 +00002592 if (minval == 0) length++;
2593 else if (minval > 1) length += (minval - 1) * duplength;
2594 if (maxval > minval) length += (maxval - minval) * (duplength + 1);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002595 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002596 continue;
2597
2598 /* Non-special character. For a run of such characters the length required
2599 is the number of characters + 2, except that the maximum run length is 255.
2600 We won't get a skipped space or a non-data escape or the start of a #
2601 comment as the first character, so the length can't be zero. */
2602
2603 NORMAL_CHAR:
2604 default:
2605 length += 2;
2606 runlength = 0;
2607 do
2608 {
2609 if ((pcre_ctypes[c] & ctype_space) != 0)
2610 {
Guido van Rossum50700601997-12-08 17:15:20 +00002611 if ((options & PCRE_EXTENDED) != 0) continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002612 spaces++;
2613 }
2614
Guido van Rossum50700601997-12-08 17:15:20 +00002615 if (c == '#' && (options & PCRE_EXTENDED) != 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002616 {
2617 while ((c = *(++ptr)) != 0 && c != '\n');
2618 continue;
2619 }
2620
2621 /* Backslash may introduce a data char or a metacharacter; stop the
2622 string before the latter. */
2623
2624 if (c == '\\')
2625 {
Guido van Rossum58132c61997-12-17 00:24:13 +00002626 const uschar *saveptr = ptr;
Guido van Rossum50700601997-12-08 17:15:20 +00002627 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002628 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2629 if (c < 0) { ptr = saveptr; break; }
2630 }
2631
2632 /* Ordinary character or single-char escape */
2633
2634 runlength++;
2635 }
2636
2637 /* This "while" is the end of the "do" above. */
2638
2639 while (runlength < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
2640
2641 ptr--;
2642 length += runlength;
2643 continue;
2644 }
2645 }
2646
2647length += 4; /* For final KET and END */
2648
2649if (length > 65539)
2650 {
Guido van Rossum50700601997-12-08 17:15:20 +00002651 *errorptr = ERR20;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002652 return NULL;
2653 }
2654
2655/* Compute the size of data block needed and get it, either from malloc or
Guido van Rossum557dea11997-12-22 22:46:52 +00002656externally provided function. We specify "code[0]" in the offsetof() expression
2657rather than just "code", because it has been reported that one broken compiler
2658fails on "code" because it is also an independent variable. It should make no
2659difference to the value of the offsetof(). */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002660
Guido van Rossum557dea11997-12-22 22:46:52 +00002661size = length + offsetof(real_pcre, code[0]);
Guido van Rossum50700601997-12-08 17:15:20 +00002662re = (real_pcre *)(pcre_malloc)(size+50);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002663
2664if (re == NULL)
2665 {
Guido van Rossum50700601997-12-08 17:15:20 +00002666 *errorptr = ERR21;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002667 return NULL;
2668 }
2669
Guido van Rossum557dea11997-12-22 22:46:52 +00002670/* Put in the magic number and the options. */
2671
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002672re->magic_number = MAGIC_NUMBER;
2673re->options = options;
2674
2675/* Set up a starting, non-extracting bracket, then compile the expression. On
2676error, *errorptr will be set non-NULL, so we don't need to look at the result
2677of the function here. */
2678
Guido van Rossum58132c61997-12-17 00:24:13 +00002679ptr = (const uschar *)pattern;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002680code = re->code;
2681*code = OP_BRA;
Guido van Rossum50700601997-12-08 17:15:20 +00002682bracount = 0;
2683(void)compile_regex(options, &bracount, &code, &ptr, errorptr, dictionary);
2684re->top_bracket = bracount;
2685re->top_backref = top_backref;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002686
2687/* If not reached end of pattern on success, there's an excess bracket. */
2688
Guido van Rossum50700601997-12-08 17:15:20 +00002689if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
2690
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002691/* Fill in the terminating state and check for disastrous overflow, but
2692if debugging, leave the test till after things are printed out. */
2693
2694*code++ = OP_END;
2695
Guido van Rossum50700601997-12-08 17:15:20 +00002696
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002697#ifndef DEBUG
Guido van Rossum50700601997-12-08 17:15:20 +00002698if (code - re->code > length) *errorptr = ERR23;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002699#endif
2700
2701/* Failed to compile */
2702
2703if (*errorptr != NULL)
2704 {
2705 (pcre_free)(re);
2706 PCRE_ERROR_RETURN:
Guido van Rossum58132c61997-12-17 00:24:13 +00002707 *erroroffset = ptr - (const uschar *)pattern;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002708 return NULL;
2709 }
2710
2711/* If the anchored option was not passed, set flag if we can determine that it
2712is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if
2713we can determine what the first character has to be, because that speeds up
2714unanchored matches no end. In the case of multiline matches, an alternative is
2715to set the PCRE_STARTLINE flag if all branches start with ^. */
2716
2717if ((options & PCRE_ANCHORED) == 0)
2718 {
2719 if (is_anchored(re->code, (options & PCRE_MULTILINE) != 0))
2720 re->options |= PCRE_ANCHORED;
2721 else
2722 {
Guido van Rossum557dea11997-12-22 22:46:52 +00002723 int ch = find_firstchar(re->code);
2724 if (ch >= 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002725 {
Guido van Rossum557dea11997-12-22 22:46:52 +00002726 re->first_char = ch;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002727 re->options |= PCRE_FIRSTSET;
2728 }
2729 else if (is_startline(re->code))
2730 re->options |= PCRE_STARTLINE;
2731 }
2732 }
2733
2734/* Print out the compiled data for debugging */
2735
2736#ifdef DEBUG
2737
Guido van Rossum50700601997-12-08 17:15:20 +00002738printf("Length = %d top_bracket = %d top_backref=%d\n",
2739 length, re->top_bracket, re->top_backref);
2740
2741if (re->options != 0)
2742 {
Guido van Rossumdda66961998-05-07 15:32:44 +00002743 printf("%s%s%s%s%s%s%s%s\n",
Guido van Rossum50700601997-12-08 17:15:20 +00002744 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
2745 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
2746 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
2747 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
2748 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
2749 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
Guido van Rossumdda66961998-05-07 15:32:44 +00002750 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
2751 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
Guido van Rossum50700601997-12-08 17:15:20 +00002752 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002753
2754if ((re->options & PCRE_FIRSTSET) != 0)
2755 {
2756 if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
2757 else printf("First char = \\x%02x\n", re->first_char);
2758 }
2759
2760code_end = code;
2761code_base = code = re->code;
2762
2763while (code < code_end)
2764 {
2765 int charlength;
2766
2767 printf("%3d ", code - code_base);
2768
2769 if (*code >= OP_BRA)
2770 {
2771 printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
2772 code += 2;
2773 }
2774
2775 else switch(*code)
2776 {
2777 case OP_CHARS:
2778 charlength = *(++code);
2779 printf("%3d ", charlength);
2780 while (charlength-- > 0)
2781 if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
2782 break;
2783
2784 case OP_KETRMAX:
2785 case OP_KETRMIN:
2786 case OP_ALT:
2787 case OP_KET:
2788 case OP_ASSERT:
2789 case OP_ASSERT_NOT:
Guido van Rossum50700601997-12-08 17:15:20 +00002790 case OP_ONCE:
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002791 printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2792 code += 2;
2793 break;
2794
2795 case OP_STAR:
2796 case OP_MINSTAR:
2797 case OP_PLUS:
2798 case OP_MINPLUS:
2799 case OP_QUERY:
2800 case OP_MINQUERY:
2801 case OP_TYPESTAR:
2802 case OP_TYPEMINSTAR:
2803 case OP_TYPEPLUS:
2804 case OP_TYPEMINPLUS:
2805 case OP_TYPEQUERY:
2806 case OP_TYPEMINQUERY:
2807 if (*code >= OP_TYPESTAR)
2808 printf(" %s", OP_names[code[1]]);
2809 else if (isprint(c = code[1])) printf(" %c", c);
2810 else printf(" \\x%02x", c);
2811 printf("%s", OP_names[*code++]);
2812 break;
2813
2814 case OP_EXACT:
2815 case OP_UPTO:
2816 case OP_MINUPTO:
2817 if (isprint(c = code[3])) printf(" %c{", c);
2818 else printf(" \\x%02x{", c);
Guido van Rossum557dea11997-12-22 22:46:52 +00002819 if (*code != OP_EXACT) printf("0,");
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002820 printf("%d}", (code[1] << 8) + code[2]);
2821 if (*code == OP_MINUPTO) printf("?");
2822 code += 3;
2823 break;
2824
2825 case OP_TYPEEXACT:
2826 case OP_TYPEUPTO:
2827 case OP_TYPEMINUPTO:
2828 printf(" %s{", OP_names[code[3]]);
2829 if (*code != OP_TYPEEXACT) printf(",");
2830 printf("%d}", (code[1] << 8) + code[2]);
2831 if (*code == OP_TYPEMINUPTO) printf("?");
2832 code += 3;
2833 break;
2834
Guido van Rossum50700601997-12-08 17:15:20 +00002835 case OP_NOT:
2836 if (isprint(c = *(++code))) printf(" [^%c]", c);
2837 else printf(" [^\\x%02x]", c);
2838 break;
2839
2840 case OP_NOTSTAR:
2841 case OP_NOTMINSTAR:
2842 case OP_NOTPLUS:
2843 case OP_NOTMINPLUS:
2844 case OP_NOTQUERY:
2845 case OP_NOTMINQUERY:
2846 if (isprint(c = code[1])) printf(" [^%c]", c);
2847 else printf(" [^\\x%02x]", c);
2848 printf("%s", OP_names[*code++]);
2849 break;
2850
2851 case OP_NOTEXACT:
2852 case OP_NOTUPTO:
2853 case OP_NOTMINUPTO:
2854 if (isprint(c = code[3])) printf(" [^%c]{", c);
2855 else printf(" [^\\x%02x]{", c);
2856 if (*code != OP_NOTEXACT) printf(",");
2857 printf("%d}", (code[1] << 8) + code[2]);
2858 if (*code == OP_NOTMINUPTO) printf("?");
2859 code += 3;
2860 break;
2861
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002862 case OP_REF:
2863 printf(" \\%d", *(++code));
Guido van Rossum557dea11997-12-22 22:46:52 +00002864 code ++;
2865 goto CLASS_REF_REPEAT;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002866
2867 case OP_CLASS:
Guido van Rossum042ff9e1998-04-03 21:13:31 +00002868 case OP_NEGCLASS:
Guido van Rossum50700601997-12-08 17:15:20 +00002869 case OP_CLASS_L:
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002870 {
2871 int i, min, max;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002872
Guido van Rossum50700601997-12-08 17:15:20 +00002873 if (*code==OP_CLASS_L)
2874 {
2875 code++;
2876 printf("Locflag = %i ", *code++);
Guido van Rossum042ff9e1998-04-03 21:13:31 +00002877 printf(" [");
Guido van Rossum50700601997-12-08 17:15:20 +00002878 }
2879 else
Guido van Rossum042ff9e1998-04-03 21:13:31 +00002880 {
2881 if (*code++ == OP_CLASS) printf(" [");
2882 else printf(" ^[");
2883 }
2884
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002885
Guido van Rossum50700601997-12-08 17:15:20 +00002886 for (i = 0; i < 256; i++)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002887 {
Guido van Rossum50700601997-12-08 17:15:20 +00002888 if ((code[i/8] & (1 << (i&7))) != 0)
2889 {
2890 int j;
2891 for (j = i+1; j < 256; j++)
2892 if ((code[j/8] & (1 << (j&7))) == 0) break;
2893 if (i == '-' || i == ']') printf("\\");
2894 if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
2895 if (--j > i)
2896 {
2897 printf("-");
2898 if (j == '-' || j == ']') printf("\\");
2899 if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
2900 }
2901 i = j;
2902 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002903 }
2904 printf("]");
Guido van Rossum50700601997-12-08 17:15:20 +00002905 code += 32;
2906 /* code ++;*/
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002907
Guido van Rossum557dea11997-12-22 22:46:52 +00002908 CLASS_REF_REPEAT:
2909
Guido van Rossum50700601997-12-08 17:15:20 +00002910 switch(*code)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002911 {
2912 case OP_CRSTAR:
2913 case OP_CRMINSTAR:
2914 case OP_CRPLUS:
2915 case OP_CRMINPLUS:
2916 case OP_CRQUERY:
2917 case OP_CRMINQUERY:
2918 printf("%s", OP_names[*code]);
2919 break;
2920
2921 case OP_CRRANGE:
2922 case OP_CRMINRANGE:
2923 min = (code[1] << 8) + code[2];
2924 max = (code[3] << 8) + code[4];
2925 if (max == 0) printf("{%d,}", min);
2926 else printf("{%d,%d}", min, max);
2927 if (*code == OP_CRMINRANGE) printf("?");
2928 code += 4;
2929 break;
2930
2931 default:
2932 code--;
2933 }
2934 }
2935 break;
2936
2937 /* Anything else is just a one-node item */
2938
2939 default:
2940 printf(" %s", OP_names[*code]);
2941 break;
2942 }
2943
2944 code++;
2945 printf("\n");
2946 }
2947printf("------------------------------------------------------------------\n");
2948
2949/* This check is done here in the debugging case so that the code that
2950was compiled can be seen. */
2951
2952if (code - re->code > length)
2953 {
Guido van Rossum50700601997-12-08 17:15:20 +00002954 printf("length=%i, code length=%i\n", length, code-re->code);
2955 *errorptr = ERR23;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002956 (pcre_free)(re);
2957 *erroroffset = ptr - (uschar *)pattern;
2958 return NULL;
2959 }
2960#endif
2961
2962return (pcre *)re;
2963}
2964
2965
2966
2967/*************************************************
2968* Match a character type *
2969*************************************************/
2970
2971/* Not used in all the places it might be as it's sometimes faster
2972to put the code inline.
2973
2974Arguments:
2975 type the character type
2976 c the character
Guido van Rossum50700601997-12-08 17:15:20 +00002977 dotall the dotall flag
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002978
2979Returns: TRUE if character is of the type
2980*/
2981
2982static BOOL
2983match_type(int type, int c, BOOL dotall)
2984{
2985
2986#ifdef DEBUG
2987if (isprint(c)) printf("matching subject %c against ", c);
2988 else printf("matching subject \\x%02x against ", c);
2989printf("%s\n", OP_names[type]);
2990#endif
2991
2992switch(type)
2993 {
2994 case OP_ANY: return dotall || c != '\n';
2995 case OP_NOT_DIGIT: return (pcre_ctypes[c] & ctype_digit) == 0;
2996 case OP_DIGIT: return (pcre_ctypes[c] & ctype_digit) != 0;
2997 case OP_NOT_WHITESPACE: return (pcre_ctypes[c] & ctype_space) == 0;
2998 case OP_WHITESPACE: return (pcre_ctypes[c] & ctype_space) != 0;
2999 case OP_NOT_WORDCHAR: return (pcre_ctypes[c] & ctype_word) == 0;
3000 case OP_WORDCHAR: return (pcre_ctypes[c] & ctype_word) != 0;
Guido van Rossum58132c61997-12-17 00:24:13 +00003001 case OP_NOT_WORDCHAR_L: return (c!='_' && !isalnum(c));
3002 case OP_WORDCHAR_L: return (c=='_' || isalnum(c));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003003 }
3004return FALSE;
3005}
3006
3007
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003008
3009/*************************************************
3010* Match a back-reference *
3011*************************************************/
3012
3013/* If a back reference hasn't been set, the match fails.
3014
3015Arguments:
3016 number reference number
3017 eptr points into the subject
3018 length length to be matched
3019 md points to match data block
3020
3021Returns: TRUE if matched
3022*/
3023
3024static BOOL
Guido van Rossum58132c61997-12-17 00:24:13 +00003025match_ref(int number, register const uschar *eptr, int length, match_data *md)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003026{
Guido van Rossum58132c61997-12-17 00:24:13 +00003027const uschar *p = md->start_subject + md->offset_vector[number];
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003028
3029#ifdef DEBUG
3030if (eptr >= md->end_subject)
3031 printf("matching subject <null>");
3032else
3033 {
3034 printf("matching subject ");
3035 pchars(eptr, length, TRUE, md);
3036 }
3037printf(" against backref ");
3038pchars(p, length, FALSE, md);
3039printf("\n");
3040#endif
3041
3042/* Always fail if not enough characters left */
3043
3044if (length > md->end_subject - p) return FALSE;
3045
Guido van Rossum58132c61997-12-17 00:24:13 +00003046/* Separate the caseless case for speed */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003047
3048if (md->caseless)
3049 { while (length-- > 0) if (pcre_lcc[*p++] != pcre_lcc[*eptr++]) return FALSE; }
3050else
3051 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
3052
3053return TRUE;
3054}
3055
3056static int free_stack(match_data *md)
3057{
3058/* Free any stack space that was allocated by the call to match(). */
Andrew M. Kuchlingfc9d2252000-02-18 19:16:45 +00003059if (md->off_num) PyMem_DEL(md->off_num);
3060if (md->offset_top) PyMem_DEL(md->offset_top);
3061if (md->r1) PyMem_DEL(md->r1);
3062if (md->r2) PyMem_DEL(md->r2);
3063if (md->eptr) PyMem_DEL((char *)md->eptr);
3064if (md->ecode) PyMem_DEL((char *)md->ecode);
Guido van Rossumc3861071997-10-08 02:07:40 +00003065return 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003066}
3067
3068static int grow_stack(match_data *md)
3069{
Guido van Rossum50700601997-12-08 17:15:20 +00003070 if (md->length != 0)
3071 {
3072 md->length = md->length + md->length/2;
3073 }
3074 else
3075 {
3076 int string_len = md->end_subject - md->start_subject + 1;
3077 if (string_len < 80) {md->length = string_len; }
3078 else {md->length = 80;}
3079 }
3080 PyMem_RESIZE(md->offset_top, int, md->length);
Tim Petersaf3e8de2002-04-12 07:22:56 +00003081 /* Can't realloc a pointer-to-const; cast const away. */
3082 md->eptr = (const uschar **)PyMem_Realloc((void *)md->eptr,
3083 sizeof(uschar *) * md->length);
3084 md->ecode = (const uschar **)PyMem_Realloc((void *)md->ecode,
3085 sizeof(uschar *) * md->length);
Guido van Rossum50700601997-12-08 17:15:20 +00003086 PyMem_RESIZE(md->off_num, int, md->length);
3087 PyMem_RESIZE(md->r1, int, md->length);
3088 PyMem_RESIZE(md->r2, int, md->length);
3089 if (md->offset_top == NULL || md->eptr == NULL || md->ecode == NULL ||
3090 md->off_num == NULL || md->r1 == NULL || md->r2 == NULL)
3091 {
Guido van Rossumdda66961998-05-07 15:32:44 +00003092 PyErr_NoMemory();
Guido van Rossum50700601997-12-08 17:15:20 +00003093 longjmp(md->error_env, 1);
3094 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003095 return 0;
3096}
3097
Guido van Rossum50700601997-12-08 17:15:20 +00003098
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003099/*************************************************
3100* Match from current position *
3101*************************************************/
3102
3103/* On entry ecode points to the first opcode, and eptr to the first character.
3104
3105Arguments:
3106 eptr pointer in subject
3107 ecode position in code
3108 offset_top current top pointer
3109 md pointer to "static" info for the match
3110
3111Returns: TRUE if matched
3112*/
3113
3114static BOOL
Guido van Rossum58132c61997-12-17 00:24:13 +00003115match(register const uschar *eptr, register const uschar *ecode, int offset_top,
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003116 match_data *md)
3117{
3118 int save_stack_position = md->point;
3119match_loop:
3120
3121#define SUCCEED goto succeed
3122#define FAIL goto fail
3123
3124for (;;)
3125 {
3126 int min, max, ctype;
3127 register int i;
3128 register int c;
Guido van Rossum58132c61997-12-17 00:24:13 +00003129 BOOL minimize = FALSE;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003130
3131 /* Opening bracket. Check the alternative branches in turn, failing if none
3132 match. We have to set the start offset if required and there is space
3133 in the offset vector so that it is available for subsequent back references
3134 if the bracket matches. However, if the bracket fails, we must put back the
3135 previous value of both offsets in case they were set by a previous copy of
3136 the same bracket. Don't worry about setting the flag for the error case here;
3137 that is handled in the code for KET. */
3138
3139 if ((int)*ecode >= OP_BRA)
3140 {
3141 int number = (*ecode - OP_BRA) << 1;
Guido van Rossum58132c61997-12-17 00:24:13 +00003142 int save_offset1 = 0, save_offset2 = 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003143
Guido van Rossum557dea11997-12-22 22:46:52 +00003144 DPRINTF(("start bracket %d\n", number/2));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003145
3146 if (number > 0 && number < md->offset_end)
3147 {
3148 save_offset1 = md->offset_vector[number];
3149 save_offset2 = md->offset_vector[number+1];
3150 md->offset_vector[number] = eptr - md->start_subject;
3151
Guido van Rossum557dea11997-12-22 22:46:52 +00003152 DPRINTF(("saving %d %d\n", save_offset1, save_offset2));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003153 }
3154
3155 /* Recurse for all the alternatives. */
3156
3157 do
3158 {
3159 if (match(eptr, ecode+3, offset_top, md)) SUCCEED;
3160 ecode += (ecode[1] << 8) + ecode[2];
3161 }
3162 while (*ecode == OP_ALT);
3163
Guido van Rossum557dea11997-12-22 22:46:52 +00003164 DPRINTF(("bracket %d failed\n", number/2));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003165
3166 if (number > 0 && number < md->offset_end)
3167 {
3168 md->offset_vector[number] = save_offset1;
3169 md->offset_vector[number+1] = save_offset2;
3170 }
3171
3172 FAIL;
3173 }
3174
3175 /* Other types of node can be handled by a switch */
3176
3177 switch(*ecode)
3178 {
3179 case OP_END:
3180 md->end_match_ptr = eptr; /* Record where we ended */
3181 md->end_offset_top = offset_top; /* and how many extracts were taken */
3182 SUCCEED;
3183
Guido van Rossum50700601997-12-08 17:15:20 +00003184 /* The equivalent of Prolog's "cut" - if the rest doesn't match, the
3185 whole thing doesn't match, so we have to get out via a longjmp(). */
3186
3187 case OP_CUT:
3188 if (match(eptr, ecode+1, offset_top, md)) SUCCEED;
3189 longjmp(md->fail_env, 1);
3190
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003191 /* Assertion brackets. Check the alternative branches in turn - the
3192 matching won't pass the KET for an assertion. If any one branch matches,
3193 the assertion is true. */
3194
3195 case OP_ASSERT:
3196 do
3197 {
3198 if (match(eptr, ecode+3, offset_top, md)) break;
3199 ecode += (ecode[1] << 8) + ecode[2];
3200 }
3201 while (*ecode == OP_ALT);
3202 if (*ecode == OP_KET) FAIL;
3203
3204 /* Continue from after the assertion, updating the offsets high water
3205 mark, since extracts may have been taken during the assertion. */
3206
3207 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3208 ecode += 3;
3209 offset_top = md->end_offset_top;
3210 continue;
3211
3212 /* Negative assertion: all branches must fail to match */
3213
3214 case OP_ASSERT_NOT:
3215 do
3216 {
3217 if (match(eptr, ecode+3, offset_top, md)) FAIL;
3218 ecode += (ecode[1] << 8) + ecode[2];
3219 }
3220 while (*ecode == OP_ALT);
3221 ecode += 3;
3222 continue;
3223
Guido van Rossum50700601997-12-08 17:15:20 +00003224 /* "Once" brackets are like assertion brackets except that after a match,
3225 the point in the subject string is not moved back. Thus there can never be
3226 a move back into the brackets. Check the alternative branches in turn - the
3227 matching won't pass the KET for this kind of subpattern. If any one branch
3228 matches, we carry on, leaving the subject pointer. */
3229
3230 case OP_ONCE:
3231 do
3232 {
3233 if (match(eptr, ecode+3, offset_top, md)) break;
3234 ecode += (ecode[1] << 8) + ecode[2];
3235 }
3236 while (*ecode == OP_ALT);
Guido van Rossum557dea11997-12-22 22:46:52 +00003237 if (*ecode == OP_KET) FAIL;
Guido van Rossum50700601997-12-08 17:15:20 +00003238
3239 /* Continue as from after the assertion, updating the offsets high water
3240 mark, since extracts may have been taken. */
3241
3242 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3243 ecode += 3;
3244 offset_top = md->end_offset_top;
3245 eptr = md->end_match_ptr;
3246 continue;
3247
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003248 /* An alternation is the end of a branch; scan along to find the end of the
3249 bracketed group and go to there. */
3250
3251 case OP_ALT:
3252 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3253 break;
3254
3255 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
3256 that it may occur zero times. It may repeat infinitely, or not at all -
3257 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
3258 repeat limits are compiled as a number of copies, with the optional ones
3259 preceded by BRAZERO or BRAMINZERO. */
3260
3261 case OP_BRAZERO:
3262 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003263 const uschar *next = ecode+1;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003264 if (match(eptr, next, offset_top, md)) SUCCEED;
3265 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3266 ecode = next + 3;
3267 }
3268 break;
3269
3270 case OP_BRAMINZERO:
3271 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003272 const uschar *next = ecode+1;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003273 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3274 if (match(eptr, next+3, offset_top, md)) SUCCEED;
3275 ecode++;
3276 }
3277 break;;
3278
3279 /* End of a group, repeated or non-repeating. If we are at the end of
3280 an assertion "group", stop matching and SUCCEED, but record the
3281 current high water mark for use by positive assertions. */
3282
3283 case OP_KET:
3284 case OP_KETRMIN:
3285 case OP_KETRMAX:
3286 {
Guido van Rossum50700601997-12-08 17:15:20 +00003287 int number;
Guido van Rossum58132c61997-12-17 00:24:13 +00003288 const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003289
Guido van Rossum50700601997-12-08 17:15:20 +00003290 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT || *prev == OP_ONCE)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003291 {
Guido van Rossum50700601997-12-08 17:15:20 +00003292 md->end_match_ptr = eptr; /* For ONCE */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003293 md->end_offset_top = offset_top;
3294 SUCCEED;
3295 }
3296
3297 /* In all other cases we have to check the group number back at the
3298 start and if necessary complete handling an extraction by setting the
3299 final offset and bumping the high water mark. */
3300
3301 number = (*prev - OP_BRA) << 1;
3302
Guido van Rossum557dea11997-12-22 22:46:52 +00003303 DPRINTF(("end bracket %d\n", number/2));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003304
3305 if (number > 0)
3306 {
3307 if (number >= md->offset_end) md->offset_overflow = TRUE; else
3308 {
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003309 md->offset_vector[number+1] = eptr - md->start_subject;
3310 if (offset_top <= number) offset_top = number + 2;
3311 }
3312 }
3313
3314 /* For a non-repeating ket, just advance to the next node and continue at
3315 this level. */
3316
3317 if (*ecode == OP_KET)
3318 {
3319 ecode += 3;
3320 break;
3321 }
3322
3323 /* The repeating kets try the rest of the pattern or restart from the
3324 preceding bracket, in the appropriate order. */
3325
3326 if (*ecode == OP_KETRMIN)
3327 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003328 const uschar *ptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003329 if (match(eptr, ecode+3, offset_top, md)) goto succeed;
3330 /* Handle alternation inside the BRA...KET; push the additional
Guido van Rossum58132c61997-12-17 00:24:13 +00003331 alternatives onto the stack */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003332 ptr=prev;
3333 do {
3334 ptr += (ptr[1]<<8)+ ptr[2];
3335 if (*ptr==OP_ALT)
3336 {
Guido van Rossum50700601997-12-08 17:15:20 +00003337 if (md->length == md->point)
3338 {
3339 grow_stack(md);
3340 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003341 md->offset_top[md->point] = offset_top;
3342 md->eptr[md->point] = eptr;
3343 md->ecode[md->point] = ptr+3;
3344 md->r1[md->point] = 0;
3345 md->r2[md->point] = 0;
3346 md->off_num[md->point] = 0;
3347 md->point++;
3348 }
3349 } while (*ptr==OP_ALT);
3350 ecode=prev+3; goto match_loop;
3351 }
3352 else /* OP_KETRMAX */
3353 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003354 const uschar *ptr;
3355 /*int points_pushed=0;*/
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003356
3357 /* Push one failure point, that will resume matching at the code after
3358 the KETRMAX opcode. */
Guido van Rossum50700601997-12-08 17:15:20 +00003359 if (md->length == md->point)
3360 {
3361 grow_stack(md);
3362 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003363 md->offset_top[md->point] = offset_top;
3364 md->eptr[md->point] = eptr;
3365 md->ecode[md->point] = ecode+3;
3366 md->r1[md->point] = md->offset_vector[number];
3367 md->r2[md->point] = md->offset_vector[number+1];
3368 md->off_num[md->point] = number;
3369 md->point++;
3370
3371 md->offset_vector[number] = eptr - md->start_subject;
3372 /* Handle alternation inside the BRA...KET; push each of the
Guido van Rossum58132c61997-12-17 00:24:13 +00003373 additional alternatives onto the stack */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003374 ptr=prev;
3375 do {
3376 ptr += (ptr[1]<<8)+ ptr[2];
3377 if (*ptr==OP_ALT)
3378 {
Guido van Rossum50700601997-12-08 17:15:20 +00003379 if (md->length == md->point)
3380 if (md->length == md->point)
3381 {
3382 grow_stack(md);
3383 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003384 md->offset_top[md->point] = offset_top;
3385 md->eptr[md->point] = eptr;
3386 md->ecode[md->point] = ptr+3;
3387 md->r1[md->point] = 0;
3388 md->r2[md->point] = 0;
3389 md->off_num[md->point] = 0;
3390 md->point++;
Guido van Rossum58132c61997-12-17 00:24:13 +00003391 /*points_pushed++;*/
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003392 }
3393 } while (*ptr==OP_ALT);
3394 /* Jump to the first (or only) alternative and resume trying to match */
3395 ecode=prev+3; goto match_loop;
3396 }
3397 }
Guido van Rossum58132c61997-12-17 00:24:13 +00003398
Guido van Rossum50700601997-12-08 17:15:20 +00003399 /* Start of subject unless notbol, or after internal newline if multiline */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003400
3401 case OP_CIRC:
Guido van Rossum50700601997-12-08 17:15:20 +00003402 if (md->notbol && eptr == md->start_subject) FAIL;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003403 if (md->multiline)
3404 {
3405 if (eptr != md->start_subject && eptr[-1] != '\n') FAIL;
3406 ecode++;
3407 break;
3408 }
3409 /* ... else fall through */
3410
3411 /* Start of subject assertion */
3412
3413 case OP_SOD:
3414 if (eptr != md->start_subject) FAIL;
3415 ecode++;
3416 break;
3417
Guido van Rossum50700601997-12-08 17:15:20 +00003418 /* Assert before internal newline if multiline, or before
3419 a terminating newline unless endonly is set, else end of subject unless
3420 noteol is set. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003421
3422 case OP_DOLL:
Guido van Rossum50700601997-12-08 17:15:20 +00003423 if (md->noteol && eptr >= md->end_subject) FAIL;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003424 if (md->multiline)
3425 {
3426 if (eptr < md->end_subject && *eptr != '\n') FAIL;
3427 ecode++;
3428 break;
3429 }
Guido van Rossum50700601997-12-08 17:15:20 +00003430 else if (!md->endonly)
3431 {
3432 if (eptr < md->end_subject - 1 ||
3433 (eptr == md->end_subject - 1 && *eptr != '\n')) FAIL;
3434 ecode++;
3435 break;
3436 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003437 /* ... else fall through */
3438
3439 /* End of subject assertion */
3440
3441 case OP_EOD:
3442 if (eptr < md->end_subject) FAIL;
3443 ecode++;
3444 break;
3445
3446 /* Word boundary assertions */
3447
3448 case OP_NOT_WORD_BOUNDARY:
3449 case OP_WORD_BOUNDARY:
3450 {
3451 BOOL prev_is_word = (eptr != md->start_subject) &&
3452 ((pcre_ctypes[eptr[-1]] & ctype_word) != 0);
3453 BOOL cur_is_word = (eptr < md->end_subject) &&
3454 ((pcre_ctypes[*eptr] & ctype_word) != 0);
3455 if ((*ecode++ == OP_WORD_BOUNDARY)?
3456 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3457 FAIL;
3458 }
3459 break;
3460
Guido van Rossum50700601997-12-08 17:15:20 +00003461 case OP_NOT_WORD_BOUNDARY_L:
3462 case OP_WORD_BOUNDARY_L:
3463 {
3464 BOOL prev_is_word = (eptr != md->start_subject) &&
Guido van Rossum58132c61997-12-17 00:24:13 +00003465 (isalnum(eptr[-1]) || eptr[-1]=='_');
Guido van Rossum50700601997-12-08 17:15:20 +00003466 BOOL cur_is_word = (eptr < md->end_subject) &&
Guido van Rossum58132c61997-12-17 00:24:13 +00003467 (isalnum(*eptr) || *eptr=='_');
Guido van Rossum50700601997-12-08 17:15:20 +00003468 if ((*ecode++ == OP_WORD_BOUNDARY_L)?
3469 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3470 FAIL;
3471 }
3472 break;
3473
3474
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003475 /* Match a single character type; inline for speed */
3476
3477 case OP_ANY:
3478 if (!md->dotall && eptr < md->end_subject && *eptr == '\n') FAIL;
3479 if (eptr++ >= md->end_subject) FAIL;
3480 ecode++;
3481 break;
3482
3483 case OP_NOT_DIGIT:
3484 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) != 0)
3485 FAIL;
3486 ecode++;
3487 break;
3488
3489 case OP_DIGIT:
3490 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) == 0)
3491 FAIL;
3492 ecode++;
3493 break;
3494
3495 case OP_NOT_WHITESPACE:
3496 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) != 0)
3497 FAIL;
3498 ecode++;
3499 break;
3500
3501 case OP_WHITESPACE:
3502 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) == 0)
3503 FAIL;
3504 ecode++;
3505 break;
3506
3507 case OP_NOT_WORDCHAR:
3508 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) != 0)
3509 FAIL;
3510 ecode++;
3511 break;
3512
3513 case OP_WORDCHAR:
3514 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) == 0)
3515 FAIL;
3516 ecode++;
3517 break;
3518
Guido van Rossum50700601997-12-08 17:15:20 +00003519 case OP_NOT_WORDCHAR_L:
Guido van Rossum58132c61997-12-17 00:24:13 +00003520 if (eptr >= md->end_subject || (*eptr=='_' || isalnum(*eptr) ))
Guido van Rossum557dea11997-12-22 22:46:52 +00003521 FAIL;
Guido van Rossum50700601997-12-08 17:15:20 +00003522 eptr++;
3523 ecode++;
3524 break;
3525
3526 case OP_WORDCHAR_L:
Guido van Rossum58132c61997-12-17 00:24:13 +00003527 if (eptr >= md->end_subject || (*eptr!='_' && !isalnum(*eptr) ))
Guido van Rossum557dea11997-12-22 22:46:52 +00003528 FAIL;
Guido van Rossum50700601997-12-08 17:15:20 +00003529 eptr++;
3530 ecode++;
3531 break;
3532
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003533 /* Match a back reference, possibly repeatedly. Look past the end of the
3534 item to see if there is repeat information following. The code is similar
3535 to that for character classes, but repeated for efficiency. Then obey
3536 similar code to character type repeats - written out again for speed.
3537 However, if the referenced string is the empty string, always treat
3538 it as matched, any number of times (otherwise there could be infinite
3539 loops). */
3540
3541 case OP_REF:
3542 {
3543 int length;
3544 int number = ecode[1] << 1; /* Doubled reference number */
3545 ecode += 2; /* Advance past the item */
3546
3547 if (number >= offset_top || md->offset_vector[number] < 0)
3548 {
3549 md->errorcode = PCRE_ERROR_BADREF;
3550 FAIL;
3551 }
3552
3553 length = md->offset_vector[number+1] - md->offset_vector[number];
3554
3555 switch (*ecode)
3556 {
3557 case OP_CRSTAR:
3558 case OP_CRMINSTAR:
3559 case OP_CRPLUS:
3560 case OP_CRMINPLUS:
3561 case OP_CRQUERY:
3562 case OP_CRMINQUERY:
3563 c = *ecode++ - OP_CRSTAR;
3564 minimize = (c & 1) != 0;
3565 min = rep_min[c]; /* Pick up values from tables; */
3566 max = rep_max[c]; /* zero for max => infinity */
3567 if (max == 0) max = INT_MAX;
3568 break;
3569
3570 case OP_CRRANGE:
3571 case OP_CRMINRANGE:
3572 minimize = (*ecode == OP_CRMINRANGE);
3573 min = (ecode[1] << 8) + ecode[2];
3574 max = (ecode[3] << 8) + ecode[4];
3575 if (max == 0) max = INT_MAX;
3576 ecode += 5;
3577 break;
3578
3579 default: /* No repeat follows */
3580 if (!match_ref(number, eptr, length, md)) FAIL;
3581 eptr += length;
3582 continue; /* With the main loop */
3583 }
3584
3585 /* If the length of the reference is zero, just continue with the
3586 main loop. */
3587
3588 if (length == 0) continue;
3589
3590 /* First, ensure the minimum number of matches are present. We get back
3591 the length of the reference string explicitly rather than passing the
3592 address of eptr, so that eptr can be a register variable. */
3593
3594 for (i = 1; i <= min; i++)
3595 {
3596 if (!match_ref(number, eptr, length, md)) FAIL;
3597 eptr += length;
3598 }
3599
3600 /* If min = max, continue at the same level without recursion.
3601 They are not both allowed to be zero. */
3602
3603 if (min == max) continue;
3604
3605 /* If minimizing, keep trying and advancing the pointer */
3606
3607 if (minimize)
3608 {
3609 for (i = min;; i++)
3610 {
3611 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3612 if (i >= max || !match_ref(number, eptr, length, md))
3613 FAIL;
3614 eptr += length;
3615 }
3616 /* Control never gets here */
3617 }
3618
3619 /* If maximizing, find the longest string and work backwards */
3620
3621 else
3622 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003623 const uschar *pp = eptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003624 for (i = min; i < max; i++)
3625 {
3626 if (!match_ref(number, eptr, length, md)) break;
3627 eptr += length;
3628 }
3629 while (eptr >= pp)
3630 {
3631 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3632 eptr -= length;
3633 }
3634 FAIL;
3635 }
3636 }
3637 /* Control never gets here */
3638
3639 /* Match a character class, possibly repeatedly. Look past the end of the
3640 item to see if there is repeat information following. Then obey similar
Guido van Rossum50700601997-12-08 17:15:20 +00003641 code to character type repeats - written out again for speed. If caseless
3642 matching was set at runtime but not at compile time, we have to check both
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003643 versions of a character, and we have to behave differently for positive and
3644 negative classes. This is the only time where OP_CLASS and OP_NEGCLASS are
3645 treated differently. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003646
3647 case OP_CLASS:
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003648 case OP_NEGCLASS:
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003649 {
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003650 BOOL nasty_case = *ecode == OP_NEGCLASS && md->runtime_caseless;
Guido van Rossum58132c61997-12-17 00:24:13 +00003651 const uschar *data = ecode + 1; /* Save for matching */
3652 ecode += 33; /* Advance past the item */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003653
3654 switch (*ecode)
3655 {
3656 case OP_CRSTAR:
3657 case OP_CRMINSTAR:
3658 case OP_CRPLUS:
3659 case OP_CRMINPLUS:
3660 case OP_CRQUERY:
3661 case OP_CRMINQUERY:
3662 c = *ecode++ - OP_CRSTAR;
3663 minimize = (c & 1) != 0;
3664 min = rep_min[c]; /* Pick up values from tables; */
3665 max = rep_max[c]; /* zero for max => infinity */
3666 if (max == 0) max = INT_MAX;
3667 break;
3668
3669 case OP_CRRANGE:
3670 case OP_CRMINRANGE:
3671 minimize = (*ecode == OP_CRMINRANGE);
3672 min = (ecode[1] << 8) + ecode[2];
3673 max = (ecode[3] << 8) + ecode[4];
3674 if (max == 0) max = INT_MAX;
3675 ecode += 5;
3676 break;
3677
3678 default: /* No repeat follows */
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003679 min = max = 1;
3680 break;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003681 }
3682
3683 /* First, ensure the minimum number of matches are present. */
3684
3685 for (i = 1; i <= min; i++)
Guido van Rossum50700601997-12-08 17:15:20 +00003686 {
3687 if (eptr >= md->end_subject) FAIL;
3688 c = *eptr++;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003689
3690 /* Either not runtime caseless, or it was a positive class. For
3691 runtime caseless, continue if either case is in the map. */
3692
3693 if (!nasty_case)
Guido van Rossum50700601997-12-08 17:15:20 +00003694 {
Guido van Rossum50700601997-12-08 17:15:20 +00003695 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003696 if (md->runtime_caseless)
3697 {
3698 c = pcre_fcc[c];
3699 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3700 }
Guido van Rossum50700601997-12-08 17:15:20 +00003701 }
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003702
3703 /* Runtime caseless and it was a negative class. Continue only if
3704 both cases are in the map. */
3705
3706 else
3707 {
3708 if ((data[c/8] & (1 << (c&7))) == 0) FAIL;
3709 c = pcre_fcc[c];
3710 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3711 }
3712
3713 FAIL;
Guido van Rossum50700601997-12-08 17:15:20 +00003714 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003715
3716 /* If max == min we can continue with the main loop without the
3717 need to recurse. */
3718
3719 if (min == max) continue;
3720
3721 /* If minimizing, keep testing the rest of the expression and advancing
3722 the pointer while it matches the class. */
3723
3724 if (minimize)
3725 {
3726 for (i = min;; i++)
3727 {
3728 if (match(eptr, ecode, offset_top, md)) SUCCEED;
Guido van Rossum50700601997-12-08 17:15:20 +00003729 if (i >= max || eptr >= md->end_subject) FAIL;
3730 c = *eptr++;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003731
3732 /* Either not runtime caseless, or it was a positive class. For
3733 runtime caseless, continue if either case is in the map. */
3734
3735 if (!nasty_case)
Guido van Rossum50700601997-12-08 17:15:20 +00003736 {
Guido van Rossum50700601997-12-08 17:15:20 +00003737 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003738 if (md->runtime_caseless)
3739 {
3740 c = pcre_fcc[c];
3741 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3742 }
Guido van Rossum50700601997-12-08 17:15:20 +00003743 }
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003744
3745 /* Runtime caseless and it was a negative class. Continue only if
3746 both cases are in the map. */
3747
3748 else
3749 {
3750 if ((data[c/8] & (1 << (c&7))) == 0) return FALSE;
3751 c = pcre_fcc[c];
3752 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3753 }
3754
Guido van Rossum50700601997-12-08 17:15:20 +00003755 FAIL;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003756 }
3757 /* Control never gets here */
3758 }
3759
3760 /* If maximizing, find the longest possible run, then work backwards. */
3761
3762 else
3763 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003764 const uschar *pp = eptr;
Guido van Rossum50700601997-12-08 17:15:20 +00003765 for (i = min; i < max; eptr++, i++)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003766 {
Guido van Rossum50700601997-12-08 17:15:20 +00003767 if (eptr >= md->end_subject) break;
3768 c = *eptr;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003769
3770 /* Either not runtime caseless, or it was a positive class. For
3771 runtime caseless, continue if either case is in the map. */
3772
3773 if (!nasty_case)
Guido van Rossum50700601997-12-08 17:15:20 +00003774 {
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003775 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3776 if (md->runtime_caseless)
3777 {
3778 c = pcre_fcc[c];
3779 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3780 }
3781 }
3782
3783 /* Runtime caseless and it was a negative class. Continue only if
3784 both cases are in the map. */
3785
3786 else
3787 {
3788 if ((data[c/8] & (1 << (c&7))) == 0) break;
Guido van Rossum50700601997-12-08 17:15:20 +00003789 c = pcre_fcc[c];
3790 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3791 }
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003792
Guido van Rossum50700601997-12-08 17:15:20 +00003793 break;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003794 }
Guido van Rossum50700601997-12-08 17:15:20 +00003795
3796 while (eptr >= pp)
3797 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
3798 FAIL;
3799 }
3800 }
3801 /* Control never gets here */
3802
3803 /* OP_CLASS_L opcode: handles localized character classes */
3804
3805 case OP_CLASS_L:
3806 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003807 const uschar *data = ecode + 1; /* Save for matching */
3808 const uschar locale_flag = *data;
Guido van Rossum50700601997-12-08 17:15:20 +00003809 ecode++; data++; /* The localization support adds an extra byte */
3810
3811 ecode += 33; /* Advance past the item */
3812
3813 switch (*ecode)
3814 {
3815 case OP_CRSTAR:
3816 case OP_CRMINSTAR:
3817 case OP_CRPLUS:
3818 case OP_CRMINPLUS:
3819 case OP_CRQUERY:
3820 case OP_CRMINQUERY:
3821 c = *ecode++ - OP_CRSTAR;
3822 minimize = (c & 1) != 0;
3823 min = rep_min[c]; /* Pick up values from tables; */
3824 max = rep_max[c]; /* zero for max => infinity */
3825 if (max == 0) max = INT_MAX;
3826 break;
3827
3828 case OP_CRRANGE:
3829 case OP_CRMINRANGE:
3830 minimize = (*ecode == OP_CRMINRANGE);
3831 min = (ecode[1] << 8) + ecode[2];
3832 max = (ecode[3] << 8) + ecode[4];
3833 if (max == 0) max = INT_MAX;
3834 ecode += 5;
3835 break;
3836
3837 default: /* No repeat follows */
3838 if (eptr >= md->end_subject) FAIL;
3839 c = *eptr++;
3840 if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */
Guido van Rossum58132c61997-12-17 00:24:13 +00003841 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3842 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003843#if 0
3844 if ( (locale_flag & 4) && isdigit(c) ) continue; /* Locale \d */
3845 if ( (locale_flag & 8) && !isdigit(c) ) continue; /* Locale \D */
3846 if ( (locale_flag & 16) && isspace(c) ) continue; /* Locale \s */
3847 if ( (locale_flag & 32) && !isspace(c) ) continue; /* Locale \S */
3848#endif
3849
3850 if (md->runtime_caseless)
3851 {
3852 c = pcre_fcc[c];
3853 if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */
3854
Guido van Rossum58132c61997-12-17 00:24:13 +00003855 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3856 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003857 }
3858 FAIL;
3859 }
3860
3861 /* First, ensure the minimum number of matches are present. */
3862
3863 for (i = 1; i <= min; i++)
3864 {
3865 if (eptr >= md->end_subject) FAIL;
3866 c = *eptr++;
3867 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum58132c61997-12-17 00:24:13 +00003868 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3869 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003870
3871 if (md->runtime_caseless)
3872 {
3873 c = pcre_fcc[c];
3874 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum58132c61997-12-17 00:24:13 +00003875 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3876 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003877 }
3878 FAIL;
3879 }
3880
3881 /* If max == min we can continue with the main loop without the
3882 need to recurse. */
3883
3884 if (min == max) continue;
3885
3886 /* If minimizing, keep testing the rest of the expression and advancing
3887 the pointer while it matches the class. */
3888
3889 if (minimize)
3890 {
3891 for (i = min;; i++)
3892 {
3893 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3894 if (i >= max || eptr >= md->end_subject) FAIL;
3895 c = *eptr++;
3896 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum58132c61997-12-17 00:24:13 +00003897 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3898 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003899
3900 if (md->runtime_caseless)
3901 {
3902 c = pcre_fcc[c];
3903 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum58132c61997-12-17 00:24:13 +00003904 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3905 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003906 }
3907 FAIL;
3908 }
3909 /* Control never gets here */
3910 }
3911
3912 /* If maximizing, find the longest possible run, then work backwards. */
3913
3914 else
3915 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003916 const uschar *pp = eptr;
Guido van Rossum50700601997-12-08 17:15:20 +00003917 for (i = min; i < max; eptr++, i++)
3918 {
3919 if (eptr >= md->end_subject) break;
3920 c = *eptr;
3921 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum58132c61997-12-17 00:24:13 +00003922 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3923 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003924 if (md->runtime_caseless)
3925 {
3926 c = pcre_fcc[c];
3927 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum58132c61997-12-17 00:24:13 +00003928 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3929 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003930 }
3931 break;
3932 }
3933
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003934 while (eptr >= pp)
3935 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
3936 FAIL;
3937 }
3938 }
3939 /* Control never gets here */
3940
3941 /* Match a run of characters */
3942
3943 case OP_CHARS:
3944 {
3945 register int length = ecode[1];
3946 ecode += 2;
3947
Guido van Rossum557dea11997-12-22 22:46:52 +00003948#ifdef DEBUG /* Sigh. Some compilers never learn. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003949 if (eptr >= md->end_subject)
3950 printf("matching subject <null> against pattern ");
3951 else
3952 {
3953 printf("matching subject ");
3954 pchars(eptr, length, TRUE, md);
3955 printf(" against pattern ");
3956 }
3957 pchars(ecode, length, FALSE, md);
3958 printf("\n");
Guido van Rossum57ba4f31997-12-02 20:40:28 +00003959#endif
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003960
3961 if (length > md->end_subject - eptr) FAIL;
3962 if (md->caseless)
3963 {
3964 while (length-- > 0) if (pcre_lcc[*ecode++] != pcre_lcc[*eptr++]) FAIL;
3965 }
3966 else
3967 {
3968 while (length-- > 0) if (*ecode++ != *eptr++) FAIL;
3969 }
3970 }
3971 break;
3972
3973 /* Match a single character repeatedly; different opcodes share code. */
3974
3975 case OP_EXACT:
3976 min = max = (ecode[1] << 8) + ecode[2];
3977 ecode += 3;
3978 goto REPEATCHAR;
3979
3980 case OP_UPTO:
3981 case OP_MINUPTO:
3982 min = 0;
3983 max = (ecode[1] << 8) + ecode[2];
3984 minimize = *ecode == OP_MINUPTO;
3985 ecode += 3;
3986 goto REPEATCHAR;
3987
3988 case OP_STAR:
3989 case OP_MINSTAR:
3990 case OP_PLUS:
3991 case OP_MINPLUS:
3992 case OP_QUERY:
3993 case OP_MINQUERY:
3994 c = *ecode++ - OP_STAR;
3995 minimize = (c & 1) != 0;
3996 min = rep_min[c]; /* Pick up values from tables; */
3997 max = rep_max[c]; /* zero for max => infinity */
3998 if (max == 0) max = INT_MAX;
3999
4000 /* Common code for all repeated single-character matches. We can give
4001 up quickly if there are fewer than the minimum number of characters left in
4002 the subject. */
4003
4004 REPEATCHAR:
4005 if (min > md->end_subject - eptr) FAIL;
4006 c = *ecode++;
4007
4008 /* The code is duplicated for the caseless and caseful cases, for speed,
4009 since matching characters is likely to be quite common. First, ensure the
4010 minimum number of matches are present. If min = max, continue at the same
4011 level without recursing. Otherwise, if minimizing, keep trying the rest of
4012 the expression and advancing one matching character if failing, up to the
4013 maximum. Alternatively, if maximizing, find the maximum number of
4014 characters and work backwards. */
4015
Guido van Rossum557dea11997-12-22 22:46:52 +00004016 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
4017 max, eptr));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004018
4019 if (md->caseless)
4020 {
4021 c = pcre_lcc[c];
4022 for (i = 1; i <= min; i++) if (c != pcre_lcc[*eptr++]) FAIL;
4023 if (min == max) continue;
4024 if (minimize)
4025 {
4026 for (i = min;; i++)
4027 {
4028 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4029 if (i >= max || eptr >= md->end_subject || c != pcre_lcc[*eptr++])
4030 FAIL;
4031 }
4032 /* Control never gets here */
4033 }
4034 else
4035 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004036 const uschar *pp = eptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004037 for (i = min; i < max; i++)
4038 {
4039 if (eptr >= md->end_subject || c != pcre_lcc[*eptr]) break;
4040 eptr++;
4041 }
4042 while (eptr >= pp)
4043 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4044 FAIL;
4045 }
Guido van Rossum50700601997-12-08 17:15:20 +00004046 /* Control never gets here */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004047 }
4048
4049 /* Caseful comparisons */
4050
4051 else
4052 {
4053 for (i = 1; i <= min; i++) if (c != *eptr++) FAIL;
4054 if (min == max) continue;
4055 if (minimize)
4056 {
4057 for (i = min;; i++)
4058 {
4059 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4060 if (i >= max || eptr >= md->end_subject || c != *eptr++) FAIL;
4061 }
4062 /* Control never gets here */
4063 }
4064 else
4065 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004066 const uschar *pp = eptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004067 for (i = min; i < max; i++)
4068 {
4069 if (eptr >= md->end_subject || c != *eptr) break;
4070 eptr++;
4071 }
4072 while (eptr >= pp)
4073 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4074 FAIL;
4075 }
4076 }
4077 /* Control never gets here */
4078
Guido van Rossum50700601997-12-08 17:15:20 +00004079 /* Match a negated single character */
4080
4081 case OP_NOT:
Guido van Rossum557dea11997-12-22 22:46:52 +00004082 if (eptr >= md->end_subject) FAIL;
Guido van Rossum50700601997-12-08 17:15:20 +00004083 ecode++;
4084 if (md->caseless)
4085 {
4086 if (pcre_lcc[*ecode++] == pcre_lcc[*eptr++]) FAIL;
4087 }
4088 else
4089 {
4090 if (*ecode++ == *eptr++) FAIL;
4091 }
4092 break;
4093
4094 /* Match a negated single character repeatedly. This is almost a repeat of
4095 the code for a repeated single character, but I haven't found a nice way of
4096 commoning these up that doesn't require a test of the positive/negative
4097 option for each character match. Maybe that wouldn't add very much to the
4098 time taken, but character matching *is* what this is all about... */
4099
4100 case OP_NOTEXACT:
4101 min = max = (ecode[1] << 8) + ecode[2];
4102 ecode += 3;
4103 goto REPEATNOTCHAR;
4104
4105 case OP_NOTUPTO:
4106 case OP_NOTMINUPTO:
4107 min = 0;
4108 max = (ecode[1] << 8) + ecode[2];
4109 minimize = *ecode == OP_NOTMINUPTO;
4110 ecode += 3;
4111 goto REPEATNOTCHAR;
4112
4113 case OP_NOTSTAR:
4114 case OP_NOTMINSTAR:
4115 case OP_NOTPLUS:
4116 case OP_NOTMINPLUS:
4117 case OP_NOTQUERY:
4118 case OP_NOTMINQUERY:
4119 c = *ecode++ - OP_NOTSTAR;
4120 minimize = (c & 1) != 0;
4121 min = rep_min[c]; /* Pick up values from tables; */
4122 max = rep_max[c]; /* zero for max => infinity */
4123 if (max == 0) max = INT_MAX;
4124
4125 /* Common code for all repeated single-character matches. We can give
4126 up quickly if there are fewer than the minimum number of characters left in
4127 the subject. */
4128
4129 REPEATNOTCHAR:
4130 if (min > md->end_subject - eptr) FAIL;
4131 c = *ecode++;
4132
4133 /* The code is duplicated for the caseless and caseful cases, for speed,
4134 since matching characters is likely to be quite common. First, ensure the
4135 minimum number of matches are present. If min = max, continue at the same
4136 level without recursing. Otherwise, if minimizing, keep trying the rest of
4137 the expression and advancing one matching character if failing, up to the
4138 maximum. Alternatively, if maximizing, find the maximum number of
4139 characters and work backwards. */
4140
Guido van Rossum557dea11997-12-22 22:46:52 +00004141 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
4142 max, eptr));
Guido van Rossum50700601997-12-08 17:15:20 +00004143
4144 if (md->caseless)
4145 {
4146 c = pcre_lcc[c];
4147 for (i = 1; i <= min; i++) if (c == pcre_lcc[*eptr++]) FAIL;
4148 if (min == max) continue;
4149 if (minimize)
4150 {
4151 for (i = min;; i++)
4152 {
4153 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4154 if (i >= max || eptr >= md->end_subject || c == pcre_lcc[*eptr++])
4155 FAIL;
4156 }
4157 /* Control never gets here */
4158 }
4159 else
4160 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004161 const uschar *pp = eptr;
Guido van Rossum50700601997-12-08 17:15:20 +00004162 for (i = min; i < max; i++)
4163 {
4164 if (eptr >= md->end_subject || c == pcre_lcc[*eptr]) break;
4165 eptr++;
4166 }
4167 while (eptr >= pp)
4168 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4169 FAIL;
4170 }
4171 /* Control never gets here */
4172 }
4173
4174 /* Caseful comparisons */
4175
4176 else
4177 {
4178 for (i = 1; i <= min; i++) if (c == *eptr++) FAIL;
4179 if (min == max) continue;
4180 if (minimize)
4181 {
4182 for (i = min;; i++)
4183 {
4184 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4185 if (i >= max || eptr >= md->end_subject || c == *eptr++) FAIL;
4186 }
4187 /* Control never gets here */
4188 }
4189 else
4190 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004191 const uschar *pp = eptr;
Guido van Rossum50700601997-12-08 17:15:20 +00004192 for (i = min; i < max; i++)
4193 {
4194 if (eptr >= md->end_subject || c == *eptr) break;
4195 eptr++;
4196 }
4197 while (eptr >= pp)
4198 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4199 FAIL;
4200 }
4201 }
4202 /* Control never gets here */
4203
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004204 /* Match a single character type repeatedly; several different opcodes
4205 share code. This is very similar to the code for single characters, but we
4206 repeat it in the interests of efficiency. */
4207
4208 case OP_TYPEEXACT:
4209 min = max = (ecode[1] << 8) + ecode[2];
4210 minimize = TRUE;
4211 ecode += 3;
4212 goto REPEATTYPE;
4213
4214 case OP_TYPEUPTO:
4215 case OP_TYPEMINUPTO:
4216 min = 0;
4217 max = (ecode[1] << 8) + ecode[2];
4218 minimize = *ecode == OP_TYPEMINUPTO;
4219 ecode += 3;
4220 goto REPEATTYPE;
4221
4222 case OP_TYPESTAR:
4223 case OP_TYPEMINSTAR:
4224 case OP_TYPEPLUS:
4225 case OP_TYPEMINPLUS:
4226 case OP_TYPEQUERY:
4227 case OP_TYPEMINQUERY:
4228 c = *ecode++ - OP_TYPESTAR;
4229 minimize = (c & 1) != 0;
4230 min = rep_min[c]; /* Pick up values from tables; */
4231 max = rep_max[c]; /* zero for max => infinity */
4232 if (max == 0) max = INT_MAX;
4233
4234 /* Common code for all repeated single character type matches */
4235
4236 REPEATTYPE:
4237 ctype = *ecode++; /* Code for the character type */
4238
4239 /* First, ensure the minimum number of matches are present. Use inline
4240 code for maximizing the speed, and do the type test once at the start
4241 (i.e. keep it out of the loop). Also test that there are at least the
4242 minimum number of characters before we start. */
4243
4244 if (min > md->end_subject - eptr) FAIL;
4245 if (min > 0) switch(ctype)
4246 {
4247 case OP_ANY:
4248 if (!md->dotall)
4249 { for (i = 1; i <= min; i++) if (*eptr++ == '\n') FAIL; }
4250 else eptr += min;
4251 break;
4252
4253 case OP_NOT_DIGIT:
4254 for (i = 1; i <= min; i++)
4255 if ((pcre_ctypes[*eptr++] & ctype_digit) != 0) FAIL;
4256 break;
4257
4258 case OP_DIGIT:
4259 for (i = 1; i <= min; i++)
4260 if ((pcre_ctypes[*eptr++] & ctype_digit) == 0) FAIL;
4261 break;
4262
4263 case OP_NOT_WHITESPACE:
4264 for (i = 1; i <= min; i++)
4265 if ((pcre_ctypes[*eptr++] & ctype_space) != 0) FAIL;
4266 break;
4267
4268 case OP_WHITESPACE:
4269 for (i = 1; i <= min; i++)
4270 if ((pcre_ctypes[*eptr++] & ctype_space) == 0) FAIL;
4271 break;
4272
4273 case OP_NOT_WORDCHAR:
4274 for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) != 0)
4275 FAIL;
4276 break;
4277
4278 case OP_WORDCHAR:
4279 for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) == 0)
4280 FAIL;
4281 break;
Guido van Rossum50700601997-12-08 17:15:20 +00004282
4283 case OP_NOT_WORDCHAR_L:
Guido van Rossum58132c61997-12-17 00:24:13 +00004284 for (i = 1; i <= min; i++, eptr++) if (*eptr=='_' || isalnum(*eptr))
Guido van Rossum557dea11997-12-22 22:46:52 +00004285 FAIL;
Guido van Rossum50700601997-12-08 17:15:20 +00004286 break;
4287
4288 case OP_WORDCHAR_L:
Guido van Rossum58132c61997-12-17 00:24:13 +00004289 for (i = 1; i <= min; i++, eptr++) if (*eptr!='_' && !isalnum(*eptr))
Guido van Rossum557dea11997-12-22 22:46:52 +00004290 FAIL;
Guido van Rossum50700601997-12-08 17:15:20 +00004291 break;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004292 }
4293
4294 /* If min = max, continue at the same level without recursing */
4295
4296 if (min == max) continue;
4297
4298 /* If minimizing, we have to test the rest of the pattern before each
4299 subsequent match, so inlining isn't much help; just use the function. */
4300
4301 if (minimize)
4302 {
4303 for (i = min;; i++)
4304 {
4305 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4306 if (i >= max || eptr >= md->end_subject ||
4307 !match_type(ctype, *eptr++, md->dotall))
4308 FAIL;
4309 }
4310 /* Control never gets here */
4311 }
4312
4313 /* If maximizing it is worth using inline code for speed, doing the type
4314 test once at the start (i.e. keep it out of the loop). */
4315
4316 else
4317 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004318 const uschar *pp = eptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004319 switch(ctype)
4320 {
4321 case OP_ANY:
4322 if (!md->dotall)
4323 {
4324 for (i = min; i < max; i++)
4325 {
4326 if (eptr >= md->end_subject || *eptr == '\n') break;
4327 eptr++;
4328 }
4329 }
4330 else
4331 {
4332 c = max - min;
4333 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
4334 eptr += c;
4335 }
4336 break;
4337
4338 case OP_NOT_DIGIT:
4339 for (i = min; i < max; i++)
4340 {
4341 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) != 0)
4342 break;
4343 eptr++;
4344 }
4345 break;
4346
4347 case OP_DIGIT:
4348 for (i = min; i < max; i++)
4349 {
4350 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) == 0)
4351 break;
4352 eptr++;
4353 }
4354 break;
4355
4356 case OP_NOT_WHITESPACE:
4357 for (i = min; i < max; i++)
4358 {
4359 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) != 0)
4360 break;
4361 eptr++;
4362 }
4363 break;
4364
4365 case OP_WHITESPACE:
4366 for (i = min; i < max; i++)
4367 {
4368 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) == 0)
4369 break;
4370 eptr++;
4371 }
4372 break;
4373
4374 case OP_NOT_WORDCHAR:
4375 for (i = min; i < max; i++)
4376 {
4377 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) != 0)
4378 break;
4379 eptr++;
4380 }
4381 break;
4382
4383 case OP_WORDCHAR:
4384 for (i = min; i < max; i++)
4385 {
Guido van Rossum50700601997-12-08 17:15:20 +00004386 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) == 0)
4387 break;
4388 eptr++;
4389 }
4390 break;
4391 case OP_NOT_WORDCHAR_L:
4392 for (i = min; i < max; i++)
4393 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004394 if (eptr >= md->end_subject || (*eptr=='_' || isalnum(*eptr) ) )
Guido van Rossum50700601997-12-08 17:15:20 +00004395 break;
4396 eptr++;
4397 }
4398 break;
4399
4400 case OP_WORDCHAR_L:
4401 for (i = min; i < max; i++)
4402 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004403 if (eptr >= md->end_subject || (*eptr!='_' && !isalnum(*eptr) ) )
Guido van Rossum50700601997-12-08 17:15:20 +00004404 break;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004405 eptr++;
4406 }
4407 break;
4408 }
4409
4410 while (eptr >= pp)
4411 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4412 FAIL;
4413 }
4414 /* Control never gets here */
4415
4416 /* There's been some horrible disaster. */
4417
4418 default:
Guido van Rossum557dea11997-12-22 22:46:52 +00004419 DPRINTF(("Unknown opcode %d\n", *ecode));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004420 md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
4421 FAIL;
4422 }
4423
4424 /* Do not stick any code in here without much thought; it is assumed
4425 that "continue" in the code above comes out to here to repeat the main
4426 loop. */
4427
4428 } /* End of main loop */
4429/* Control never reaches here */
4430
4431fail:
4432 if (md->point > save_stack_position)
4433 {
4434 /* If there are still points remaining on the stack, pop the next one off */
Guido van Rossumc3861071997-10-08 02:07:40 +00004435 int off_num;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004436
4437 md->point--;
4438 offset_top = md->offset_top[md->point];
4439 eptr = md->eptr[md->point];
4440 ecode = md->ecode[md->point];
4441 off_num = md->off_num[md->point];
4442 md->offset_vector[off_num] = md->r1[md->point];
4443 md->offset_vector[off_num+1] = md->r2[md->point];
4444 goto match_loop;
4445 }
4446 /* Failure, and nothing left on the stack, so end this function call */
4447
4448 /* Restore the top of the stack to where it was before this function
4449 call. This lets us use one stack for everything; recursive calls
4450 can push and pop information, and may increase the stack. When
4451 the call returns, the parent function can resume pushing and
4452 popping wherever it was. */
4453
4454 md->point = save_stack_position;
4455 return FALSE;
4456
4457succeed:
4458 return TRUE;
4459}
4460
4461
Guido van Rossum50700601997-12-08 17:15:20 +00004462
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004463/*************************************************
Guido van Rossum557dea11997-12-22 22:46:52 +00004464* Segregate setjmp() *
4465*************************************************/
4466
4467/* The -Wall option of gcc gives warnings for all local variables when setjmp()
4468is used, even if the coding conforms to the rules of ANSI C. To avoid this, we
4469hide it in a separate function. This is called only when PCRE_EXTRA is set,
4470since it's needed only for the extension \X option, and with any luck, a good
4471compiler will spot the tail recursion and compile it efficiently.
4472
4473Arguments:
4474 eptr pointer in subject
4475 ecode position in code
4476 offset_top current top pointer
4477 md pointer to "static" info for the match
4478
4479Returns: TRUE if matched
4480*/
4481
4482static BOOL
4483match_with_setjmp(const uschar *eptr, const uschar *ecode, int offset_top,
4484 match_data *match_block)
4485{
4486return setjmp(match_block->fail_env) == 0 &&
4487 match(eptr, ecode, offset_top, match_block);
4488}
4489
4490
4491
4492/*************************************************
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004493* Execute a Regular Expression *
4494*************************************************/
4495
4496/* This function applies a compiled re to a subject string and picks out
4497portions of the string if it matches. Two elements in the vector are set for
4498each substring: the offsets to the start and end of the substring.
4499
4500Arguments:
Guido van Rossum50700601997-12-08 17:15:20 +00004501 external_re points to the compiled expression
4502 external_extra points to "hints" from pcre_study() or is NULL
4503 subject points to the subject string
4504 length length of subject string (may contain binary zeros)
4505 options option bits
4506 offsets points to a vector of ints to be filled in with offsets
4507 offsetcount the number of elements in the vector
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004508
Guido van Rossum50700601997-12-08 17:15:20 +00004509Returns: > 0 => success; value is the number of elements filled in
4510 = 0 => success, but offsets is not big enough
4511 -1 => failed to match
4512 < -1 => some kind of unexpected problem
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004513*/
4514
4515int
Guido van Rossum50700601997-12-08 17:15:20 +00004516pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
Guido van Rossum816671c1998-03-10 04:55:29 +00004517 const char *subject, int length, int start_pos, int options,
4518 int *offsets, int offsetcount)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004519{
Guido van Rossum58132c61997-12-17 00:24:13 +00004520 /* The "volatile" directives are to make gcc -Wall stop complaining
4521 that these variables can be clobbered by the longjmp. Hopefully
4522 they won't cost too much performance. */
Guido van Rossum042ff9e1998-04-03 21:13:31 +00004523volatile int resetcount, ocount;
4524volatile int first_char = -1;
Tim Peters54925f92000-07-05 22:56:52 +00004525const uschar * volatile start_bits = NULL;
4526const uschar * volatile start_match = (const uschar *)subject + start_pos;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004527match_data match_block;
Guido van Rossum58132c61997-12-17 00:24:13 +00004528const uschar *end_subject;
4529const real_pcre *re = (const real_pcre *)external_re;
4530const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00004531volatile BOOL using_temporary_offsets = FALSE;
4532volatile BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4533volatile BOOL startline = (re->options & PCRE_STARTLINE) != 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004534
4535if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4536
4537if (re == NULL || subject == NULL ||
4538 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4539if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
4540
Guido van Rossum58132c61997-12-17 00:24:13 +00004541match_block.start_subject = (const uschar *)subject;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004542match_block.end_subject = match_block.start_subject + length;
4543end_subject = match_block.end_subject;
4544
Guido van Rossum50700601997-12-08 17:15:20 +00004545match_block.caseless = ((re->options | options) & PCRE_CASELESS) != 0;
4546match_block.runtime_caseless = match_block.caseless &&
4547 (re->options & PCRE_CASELESS) == 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004548
Guido van Rossum50700601997-12-08 17:15:20 +00004549match_block.multiline = ((re->options | options) & PCRE_MULTILINE) != 0;
4550match_block.dotall = ((re->options | options) & PCRE_DOTALL) != 0;
4551match_block.endonly = ((re->options | options) & PCRE_DOLLAR_ENDONLY) != 0;
4552
4553match_block.notbol = (options & PCRE_NOTBOL) != 0;
4554match_block.noteol = (options & PCRE_NOTEOL) != 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004555
4556match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
4557
4558/* Set the stack state to empty */
4559 match_block.off_num = match_block.offset_top = NULL;
4560 match_block.r1 = match_block.r2 = NULL;
4561 match_block.eptr = match_block.ecode = NULL;
4562 match_block.point = match_block.length = 0;
4563
Guido van Rossum50700601997-12-08 17:15:20 +00004564/* If the expression has got more back references than the offsets supplied can
4565hold, we get a temporary bit of working store to use during the matching.
Guido van Rossum557dea11997-12-22 22:46:52 +00004566Otherwise, we can use the vector supplied, rounding down its size to a multiple
4567of 2. */
Guido van Rossum50700601997-12-08 17:15:20 +00004568
Guido van Rossum557dea11997-12-22 22:46:52 +00004569ocount = offsetcount & (-2);
4570if (re->top_backref > 0 && re->top_backref >= ocount/2)
Guido van Rossum50700601997-12-08 17:15:20 +00004571 {
4572 ocount = re->top_backref * 2 + 2;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00004573 match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
Guido van Rossum50700601997-12-08 17:15:20 +00004574 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
Guido van Rossum557dea11997-12-22 22:46:52 +00004575 using_temporary_offsets = TRUE;
4576 DPRINTF(("Got memory to hold back references\n"));
Guido van Rossum50700601997-12-08 17:15:20 +00004577 }
4578else match_block.offset_vector = offsets;
4579
4580match_block.offset_end = ocount;
4581match_block.offset_overflow = FALSE;
4582
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004583/* Compute the minimum number of offsets that we need to reset each time. Doing
4584this makes a huge difference to execution time when there aren't many brackets
4585in the pattern. */
4586
4587resetcount = 2 + re->top_bracket * 2;
Guido van Rossum50700601997-12-08 17:15:20 +00004588if (resetcount > offsetcount) resetcount = ocount;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004589
4590/* If MULTILINE is set at exec time but was not set at compile time, and the
4591anchored flag is set, we must re-check because a setting provoked by ^ in the
4592pattern is not right in multi-line mode. Calling is_anchored() again here does
4593the right check, because multiline is now set. If it now yields FALSE, the
4594expression must have had ^ starting some of its branches. Check to see if
4595that is true for *all* branches, and if so, set the startline flag. */
4596
Guido van Rossum557dea11997-12-22 22:46:52 +00004597if (match_block.multiline && anchored && (re->options & PCRE_MULTILINE) == 0 &&
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004598 !is_anchored(re->code, match_block.multiline))
4599 {
4600 anchored = FALSE;
4601 if (is_startline(re->code)) startline = TRUE;
4602 }
4603
4604/* Set up the first character to match, if available. The first_char value is
4605never set for an anchored regular expression, but the anchoring may be forced
4606at run time, so we have to test for anchoring. The first char may be unset for
4607an unanchored pattern, of course. If there's no first char and the pattern was
4608studied, the may be a bitmap of possible first characters. However, we can
4609use this only if the caseless state of the studying was correct. */
4610
4611if (!anchored)
4612 {
4613 if ((re->options & PCRE_FIRSTSET) != 0)
4614 {
4615 first_char = re->first_char;
4616 if (match_block.caseless) first_char = pcre_lcc[first_char];
4617 }
4618 else
4619 if (!startline && extra != NULL &&
4620 (extra->options & PCRE_STUDY_MAPPED) != 0 &&
4621 ((extra->options & PCRE_STUDY_CASELESS) != 0) == match_block.caseless)
4622 start_bits = extra->start_bits;
4623 }
4624
4625/* Loop for unanchored matches; for anchored regexps the loop runs just once. */
4626
4627do
4628 {
Guido van Rossum557dea11997-12-22 22:46:52 +00004629 int rc;
Guido van Rossum50700601997-12-08 17:15:20 +00004630 register int *iptr = match_block.offset_vector;
4631 register int *iend = iptr + resetcount;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004632
4633 /* Reset the maximum number of extractions we might see. */
4634
4635 while (iptr < iend) *iptr++ = -1;
4636
4637 /* Advance to a unique first char if possible */
4638
4639 if (first_char >= 0)
4640 {
4641 if (match_block.caseless)
4642 while (start_match < end_subject && pcre_lcc[*start_match] != first_char)
4643 start_match++;
4644 else
4645 while (start_match < end_subject && *start_match != first_char)
4646 start_match++;
4647 }
4648
4649 /* Or to just after \n for a multiline match if possible */
4650
4651 else if (startline)
4652 {
4653 if (start_match > match_block.start_subject)
4654 {
4655 while (start_match < end_subject && start_match[-1] != '\n')
4656 start_match++;
4657 }
4658 }
4659
4660 /* Or to a non-unique first char */
4661
4662 else if (start_bits != NULL)
4663 {
4664 while (start_match < end_subject)
4665 {
4666 register int c = *start_match;
Guido van Rossum50700601997-12-08 17:15:20 +00004667 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004668 }
4669 }
4670
Guido van Rossum557dea11997-12-22 22:46:52 +00004671#ifdef DEBUG /* Sigh. Some compilers never learn. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004672 printf(">>>> Match against: ");
4673 pchars(start_match, end_subject - start_match, TRUE, &match_block);
4674 printf("\n");
Guido van Rossum57ba4f31997-12-02 20:40:28 +00004675#endif
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004676
4677 /* When a match occurs, substrings will be set for all internal extractions;
4678 we just need to set up the whole thing as substring 0 before returning. If
Guido van Rossum50700601997-12-08 17:15:20 +00004679 there were too many extractions, set the return code to zero. In the case
4680 where we had to get some local store to hold offsets for backreferences, copy
4681 those back references that we can. In this case there need not be overflow
4682 if certain parts of the pattern were not used.
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004683
Guido van Rossum50700601997-12-08 17:15:20 +00004684 Before starting the match, we have to set up a longjmp() target to enable
Guido van Rossum557dea11997-12-22 22:46:52 +00004685 the "cut" operation to fail a match completely without backtracking. This
4686 is done in a separate function to avoid compiler warnings. We need not do
4687 it unless PCRE_EXTRA is set, since only in that case is the "cut" operation
4688 enabled. */
Guido van Rossum50700601997-12-08 17:15:20 +00004689
4690 /* To handle errors such as running out of memory for the failure
4691 stack, we need to save this location via setjmp(), so
4692 error-handling code can call longjmp() to jump out of deeply-nested code. */
4693 if (setjmp(match_block.error_env)==0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004694 {
Guido van Rossum50700601997-12-08 17:15:20 +00004695
Guido van Rossum557dea11997-12-22 22:46:52 +00004696 if ((re->options & PCRE_EXTRA) != 0)
Guido van Rossum50700601997-12-08 17:15:20 +00004697 {
Guido van Rossum557dea11997-12-22 22:46:52 +00004698 if (!match_with_setjmp(start_match, re->code, 2, &match_block))
4699 continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004700 }
Guido van Rossum557dea11997-12-22 22:46:52 +00004701 else if (!match(start_match, re->code, 2, &match_block)) continue;
4702
4703 /* Copy the offset information from temporary store if necessary */
4704
4705 if (using_temporary_offsets)
4706 {
4707 if (offsetcount >= 4)
4708 {
4709 memcpy(offsets + 2, match_block.offset_vector + 2,
4710 (offsetcount - 2) * sizeof(int));
4711 DPRINTF(("Copied offsets from temporary memory\n"));
4712 }
4713 if (match_block.end_offset_top > offsetcount)
4714 match_block.offset_overflow = TRUE;
4715
4716 DPRINTF(("Freeing temporary memory\n"));
4717 (pcre_free)(match_block.offset_vector);
4718 }
4719
4720 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
4721
4722 if (match_block.offset_end < 2) rc = 0; else
4723 {
4724 offsets[0] = start_match - match_block.start_subject;
4725 offsets[1] = match_block.end_match_ptr - match_block.start_subject;
4726 }
4727
4728 DPRINTF((">>>> returning %d\n", rc));
4729 free_stack(&match_block);
4730 return rc;
Guido van Rossum50700601997-12-08 17:15:20 +00004731 } /* End of (if setjmp(match_block.error_env)...) */
Guido van Rossum042ff9e1998-04-03 21:13:31 +00004732 free_stack(&match_block);
4733
Guido van Rossum50700601997-12-08 17:15:20 +00004734 /* Return an error code; pcremodule.c will preserve the exception */
4735 if (PyErr_Occurred()) return PCRE_ERROR_NOMEMORY;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004736 }
4737while (!anchored &&
4738 match_block.errorcode == PCRE_ERROR_NOMATCH &&
4739 start_match++ < end_subject);
4740
Guido van Rossum557dea11997-12-22 22:46:52 +00004741if (using_temporary_offsets)
4742 {
4743 DPRINTF(("Freeing temporary memory\n"));
4744 (pcre_free)(match_block.offset_vector);
4745 }
4746
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004747#ifdef DEBUG
4748printf(">>>> returning %d\n", match_block.errorcode);
4749#endif
Guido van Rossum50700601997-12-08 17:15:20 +00004750
Barry Warsawb80667d1999-01-27 21:41:08 +00004751 free_stack(&match_block);
Guido van Rossum557dea11997-12-22 22:46:52 +00004752 return match_block.errorcode;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004753}
4754
4755/* End of pcre.c */