blob: c6a14ec52813a27e04b4c3b006343c277bcfff68 [file] [log] [blame]
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001
2/*************************************************
3* Perl-Compatible Regular Expressions *
4*************************************************/
5
6/* DO NOT EDIT THIS FILE! */
7
8/* This file is automatically written by the merge-files.py script
9included with the PCRE distribution for Python; it's produced from
10several C files, and code is removed in the process. If you want to
11modify the code or track down bugs, it will be much easier to work
12with the code in its original, multiple-file form. Don't edit this
13file by hand, or submit patches to it.
14
15The Python-specific PCRE distribution can be retrieved from
16 http://starship.skyport.net/crew/amk/regex/
17
Guido van Rossum58132c61997-12-17 00:24:13 +000018The unmodified original PCRE distribution is available at
19ftp://ftp.cus.cam.ac.uk/pub/software/programs/pcre/, and is originally
20written by: Philip Hazel <ph10@cam.ac.uk>
Guido van Rossum51b3aa31997-10-06 14:43:11 +000021
22Extensively modified by the Python String-SIG: <string-sig@python.org>
23Send bug reports to: <string-sig@python.org>
24(They'll figure out if it's a bug in PCRE or in the Python-specific
25changes.)
26
27 Copyright (c) 1997 University of Cambridge
28
29-----------------------------------------------------------------------------
30Permission is granted to anyone to use this software for any purpose on any
31computer system, and to redistribute it freely, subject to the following
32restrictions:
33
341. This software is distributed in the hope that it will be useful,
35 but WITHOUT ANY WARRANTY; without even the implied warranty of
36 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
37
382. The origin of this software must not be misrepresented, either by
39 explicit claim or by omission.
40
413. Altered versions must be plainly marked as such, and must not be
42 misrepresented as being the original software.
43-----------------------------------------------------------------------------
44*/
45
46
47#define FOR_PYTHON
Guido van Rossum58132c61997-12-17 00:24:13 +000048#include "pcre-int.h"
Guido van Rossum51b3aa31997-10-06 14:43:11 +000049#include "Python.h"
Guido van Rossum50700601997-12-08 17:15:20 +000050#include <ctype.h>
Guido van Rossum51b3aa31997-10-06 14:43:11 +000051#include "graminit.h"
52
53/*************************************************
54* Perl-Compatible Regular Expressions *
55*************************************************/
56
57/* This file is automatically written by the makechartables auxiliary
58program. If you edit it by hand, you might like to edit the Makefile to
59prevent its ever being regenerated. */
60
61/* This table is a lower casing table. */
62
63unsigned char pcre_lcc[] = {
64 0, 1, 2, 3, 4, 5, 6, 7,
65 8, 9, 10, 11, 12, 13, 14, 15,
66 16, 17, 18, 19, 20, 21, 22, 23,
67 24, 25, 26, 27, 28, 29, 30, 31,
68 32, 33, 34, 35, 36, 37, 38, 39,
69 40, 41, 42, 43, 44, 45, 46, 47,
70 48, 49, 50, 51, 52, 53, 54, 55,
71 56, 57, 58, 59, 60, 61, 62, 63,
72 64, 97, 98, 99,100,101,102,103,
73 104,105,106,107,108,109,110,111,
74 112,113,114,115,116,117,118,119,
75 120,121,122, 91, 92, 93, 94, 95,
76 96, 97, 98, 99,100,101,102,103,
77 104,105,106,107,108,109,110,111,
78 112,113,114,115,116,117,118,119,
79 120,121,122,123,124,125,126,127,
80 128,129,130,131,132,133,134,135,
81 136,137,138,139,140,141,142,143,
82 144,145,146,147,148,149,150,151,
83 152,153,154,155,156,157,158,159,
84 160,161,162,163,164,165,166,167,
85 168,169,170,171,172,173,174,175,
86 176,177,178,179,180,181,182,183,
87 184,185,186,187,188,189,190,191,
88 192,193,194,195,196,197,198,199,
89 200,201,202,203,204,205,206,207,
90 208,209,210,211,212,213,214,215,
91 216,217,218,219,220,221,222,223,
92 224,225,226,227,228,229,230,231,
93 232,233,234,235,236,237,238,239,
94 240,241,242,243,244,245,246,247,
95 248,249,250,251,252,253,254,255 };
96
Guido van Rossum50700601997-12-08 17:15:20 +000097/* This table is a case flipping table. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +000098
Guido van Rossum50700601997-12-08 17:15:20 +000099unsigned char pcre_fcc[] = {
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000100 0, 1, 2, 3, 4, 5, 6, 7,
101 8, 9, 10, 11, 12, 13, 14, 15,
102 16, 17, 18, 19, 20, 21, 22, 23,
103 24, 25, 26, 27, 28, 29, 30, 31,
104 32, 33, 34, 35, 36, 37, 38, 39,
105 40, 41, 42, 43, 44, 45, 46, 47,
106 48, 49, 50, 51, 52, 53, 54, 55,
107 56, 57, 58, 59, 60, 61, 62, 63,
Guido van Rossum50700601997-12-08 17:15:20 +0000108 64, 97, 98, 99,100,101,102,103,
109 104,105,106,107,108,109,110,111,
110 112,113,114,115,116,117,118,119,
111 120,121,122, 91, 92, 93, 94, 95,
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000112 96, 65, 66, 67, 68, 69, 70, 71,
113 72, 73, 74, 75, 76, 77, 78, 79,
114 80, 81, 82, 83, 84, 85, 86, 87,
115 88, 89, 90,123,124,125,126,127,
116 128,129,130,131,132,133,134,135,
117 136,137,138,139,140,141,142,143,
118 144,145,146,147,148,149,150,151,
119 152,153,154,155,156,157,158,159,
120 160,161,162,163,164,165,166,167,
121 168,169,170,171,172,173,174,175,
122 176,177,178,179,180,181,182,183,
123 184,185,186,187,188,189,190,191,
124 192,193,194,195,196,197,198,199,
125 200,201,202,203,204,205,206,207,
126 208,209,210,211,212,213,214,215,
127 216,217,218,219,220,221,222,223,
128 224,225,226,227,228,229,230,231,
129 232,233,234,235,236,237,238,239,
130 240,241,242,243,244,245,246,247,
131 248,249,250,251,252,253,254,255 };
132
Guido van Rossum50700601997-12-08 17:15:20 +0000133/* This table contains bit maps for digits, letters, 'word' chars, and
134white space. Each map is 32 bytes long and the bits run from the least
135significant end of each byte. */
136
137unsigned char pcre_cbits[] = {
138 0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
139 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
140 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
141 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
142
143 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
144 0xfe,0xff,0xff,0x07,0xfe,0xff,0xff,0x07,
145 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
146 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
147
148 0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
149 0xfe,0xff,0xff,0x87,0xfe,0xff,0xff,0x07,
150 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
151 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
152
153 0x00,0x3e,0x00,0x00,0x01,0x00,0x00,0x00,
154 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
155 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
156 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 };
157
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000158/* This table identifies various classes of character by individual bits:
Guido van Rossum50700601997-12-08 17:15:20 +0000159 0x01 white space character
160 0x02 letter
161 0x04 decimal digit
162 0x08 hexadecimal digit
163 0x10 alphanumeric or '_'
164 0x80 regular expression metacharacter or binary zero
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000165*/
166
167unsigned char pcre_ctypes[] = {
168 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
169 0x00,0x01,0x01,0x01,0x01,0x01,0x00,0x00, /* 8- 15 */
170 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
171 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
172 0x01,0x00,0x00,0x00,0x80,0x00,0x00,0x00, /* - ' */
173 0x80,0x80,0x80,0x80,0x00,0x00,0x80,0x00, /* ( - / */
Guido van Rossum50700601997-12-08 17:15:20 +0000174 0x3c,0x3c,0x3c,0x3c,0x3c,0x3c,0x3c,0x3c, /* 0 - 7 */
175 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x80, /* 8 - ? */
176 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* @ - G */
177 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* H - O */
178 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* P - W */
179 0x12,0x12,0x12,0x80,0x00,0x00,0x80,0x10, /* X - _ */
180 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* ` - g */
181 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* h - o */
182 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* p - w */
183 0x12,0x12,0x12,0x80,0x80,0x00,0x00,0x00, /* x -127 */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000184 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
185 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
186 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
187 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
188 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
189 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
190 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
191 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
192 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
193 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
194 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
195 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
196 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
197 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
198 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
199 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
200
Guido van Rossum50700601997-12-08 17:15:20 +0000201/* End of chartables.c */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000202/*************************************************
203* Perl-Compatible Regular Expressions *
204*************************************************/
205
206/*
207This is a library of functions to support regular expressions whose syntax
208and semantics are as close as possible to those of the Perl 5 language. See
209the file Tech.Notes for some information on the internals.
210
211Written by: Philip Hazel <ph10@cam.ac.uk>
212
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000213 Copyright (c) 1998 University of Cambridge
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000214
215-----------------------------------------------------------------------------
216Permission is granted to anyone to use this software for any purpose on any
217computer system, and to redistribute it freely, subject to the following
218restrictions:
219
2201. This software is distributed in the hope that it will be useful,
221 but WITHOUT ANY WARRANTY; without even the implied warranty of
222 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
223
2242. The origin of this software must not be misrepresented, either by
225 explicit claim or by omission.
226
2273. Altered versions must be plainly marked as such, and must not be
228 misrepresented as being the original software.
229-----------------------------------------------------------------------------
230*/
231
232
233/* Include the internals header, which itself includes Standard C headers plus
234the external pcre header. */
235
236
237
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000238
239/*************************************************
240* Create bitmap of starting chars *
241*************************************************/
242
243/* This function scans a compiled unanchored expression and attempts to build a
244bitmap of the set of initial characters. If it can't, it returns FALSE. As time
245goes by, we may be able to get more clever at doing this.
246
247Arguments:
248 code points to an expression
249 start_bits points to a 32-byte table, initialized to 0
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000250
251Returns: TRUE if table built, FALSE otherwise
252*/
253
254static BOOL
Guido van Rossum58132c61997-12-17 00:24:13 +0000255set_start_bits(const uschar *code, uschar *start_bits)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000256{
Guido van Rossum50700601997-12-08 17:15:20 +0000257register int c;
Guido van Rossum95864d31998-12-21 18:35:49 +0000258volatile int dummy;
Guido van Rossum50700601997-12-08 17:15:20 +0000259
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000260do
261 {
Guido van Rossum58132c61997-12-17 00:24:13 +0000262 const uschar *tcode = code + 3;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000263 BOOL try_next = TRUE;
264
265 while (try_next)
266 {
267 try_next = FALSE;
268
269 if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
270 {
Guido van Rossum50700601997-12-08 17:15:20 +0000271 if (!set_start_bits(tcode, start_bits)) return FALSE;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000272 }
273
274 else switch(*tcode)
275 {
276 default:
277 return FALSE;
278
279 /* BRAZERO does the bracket, but carries on. */
280
281 case OP_BRAZERO:
282 case OP_BRAMINZERO:
Guido van Rossum50700601997-12-08 17:15:20 +0000283 if (!set_start_bits(++tcode, start_bits)) return FALSE;
Guido van Rossum95864d31998-12-21 18:35:49 +0000284 dummy = 1;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000285 do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
286 tcode += 3;
287 try_next = TRUE;
288 break;
289
290 /* Single-char * or ? sets the bit and tries the next item */
291
292 case OP_STAR:
293 case OP_MINSTAR:
294 case OP_QUERY:
295 case OP_MINQUERY:
Guido van Rossum50700601997-12-08 17:15:20 +0000296 start_bits[tcode[1]/8] |= (1 << (tcode[1]&7));
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000297 tcode += 2;
298 try_next = TRUE;
299 break;
300
301 /* Single-char upto sets the bit and tries the next */
302
303 case OP_UPTO:
304 case OP_MINUPTO:
Guido van Rossum50700601997-12-08 17:15:20 +0000305 start_bits[tcode[3]/8] |= (1 << (tcode[3]&7));
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000306 tcode += 4;
307 try_next = TRUE;
308 break;
309
310 /* At least one single char sets the bit and stops */
311
312 case OP_EXACT: /* Fall through */
313 tcode++;
314
315 case OP_CHARS: /* Fall through */
316 tcode++;
317
318 case OP_PLUS:
319 case OP_MINPLUS:
Guido van Rossum50700601997-12-08 17:15:20 +0000320 start_bits[tcode[1]/8] |= (1 << (tcode[1]&7));
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000321 break;
322
323 /* Single character type sets the bits and stops */
324
325 case OP_NOT_DIGIT:
Guido van Rossum50700601997-12-08 17:15:20 +0000326 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_digit];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000327 break;
328
329 case OP_DIGIT:
Guido van Rossum50700601997-12-08 17:15:20 +0000330 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_digit];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000331 break;
332
333 case OP_NOT_WHITESPACE:
Guido van Rossum50700601997-12-08 17:15:20 +0000334 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_space];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000335 break;
336
337 case OP_WHITESPACE:
Guido van Rossum50700601997-12-08 17:15:20 +0000338 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_space];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000339 break;
340
341 case OP_NOT_WORDCHAR:
Guido van Rossum50700601997-12-08 17:15:20 +0000342 for (c = 0; c < 32; c++)
343 start_bits[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000344 break;
345
346 case OP_WORDCHAR:
Guido van Rossum50700601997-12-08 17:15:20 +0000347 for (c = 0; c < 32; c++)
348 start_bits[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000349 break;
350
351 /* One or more character type fudges the pointer and restarts, knowing
352 it will hit a single character type and stop there. */
353
354 case OP_TYPEPLUS:
355 case OP_TYPEMINPLUS:
356 tcode++;
357 try_next = TRUE;
358 break;
359
360 case OP_TYPEEXACT:
361 tcode += 3;
362 try_next = TRUE;
363 break;
364
365 /* Zero or more repeats of character types set the bits and then
366 try again. */
367
368 case OP_TYPEUPTO:
369 case OP_TYPEMINUPTO:
370 tcode += 2; /* Fall through */
371
372 case OP_TYPESTAR:
373 case OP_TYPEMINSTAR:
374 case OP_TYPEQUERY:
375 case OP_TYPEMINQUERY:
376 switch(tcode[1])
377 {
378 case OP_NOT_DIGIT:
Guido van Rossum50700601997-12-08 17:15:20 +0000379 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_digit];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000380 break;
381
382 case OP_DIGIT:
Guido van Rossum50700601997-12-08 17:15:20 +0000383 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_digit];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000384 break;
385
386 case OP_NOT_WHITESPACE:
Guido van Rossum50700601997-12-08 17:15:20 +0000387 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_space];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000388 break;
389
390 case OP_WHITESPACE:
Guido van Rossum50700601997-12-08 17:15:20 +0000391 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_space];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000392 break;
393
394 case OP_NOT_WORDCHAR:
Guido van Rossum50700601997-12-08 17:15:20 +0000395 for (c = 0; c < 32; c++)
396 start_bits[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000397 break;
398
399 case OP_WORDCHAR:
Guido van Rossum50700601997-12-08 17:15:20 +0000400 for (c = 0; c < 32; c++)
401 start_bits[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000402 break;
403 }
404
405 tcode += 2;
406 try_next = TRUE;
407 break;
408
409 /* Character class: set the bits and either carry on or not,
410 according to the repeat count. */
411
412 case OP_CLASS:
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000413 case OP_NEGCLASS:
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000414 {
Guido van Rossum50700601997-12-08 17:15:20 +0000415 tcode++;
416 for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
417 tcode += 32;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000418 switch (*tcode)
419 {
420 case OP_CRSTAR:
421 case OP_CRMINSTAR:
422 case OP_CRQUERY:
423 case OP_CRMINQUERY:
424 tcode++;
425 try_next = TRUE;
426 break;
427
428 case OP_CRRANGE:
429 case OP_CRMINRANGE:
430 if (((tcode[1] << 8) + tcode[2]) == 0)
431 {
432 tcode += 5;
433 try_next = TRUE;
434 }
435 break;
436 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000437 }
438 break; /* End of class handling */
439
440 } /* End of switch */
441 } /* End of try_next loop */
442
443 code += (code[1] << 8) + code[2]; /* Advance to next branch */
444 }
445while (*code == OP_ALT);
446return TRUE;
447}
448
449
450
451/*************************************************
452* Study a compiled expression *
453*************************************************/
454
455/* This function is handed a compiled expression that it must study to produce
456information that will speed up the matching. It returns a pcre_extra block
457which then gets handed back to pcre_exec().
458
459Arguments:
460 re points to the compiled expression
461 options contains option bits
462 errorptr points to where to place error messages;
463 set NULL unless error
464
465Returns: pointer to a pcre_extra block,
466 NULL on error or if no optimization possible
467*/
468
469pcre_extra *
Guido van Rossum58132c61997-12-17 00:24:13 +0000470pcre_study(const pcre *external_re, int options, const char **errorptr)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000471{
472BOOL caseless;
473uschar start_bits[32];
474real_pcre_extra *extra;
Guido van Rossum58132c61997-12-17 00:24:13 +0000475const real_pcre *re = (const real_pcre *)external_re;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000476
477*errorptr = NULL;
478
479if (re == NULL || re->magic_number != MAGIC_NUMBER)
480 {
481 *errorptr = "argument is not a compiled regular expression";
482 return NULL;
483 }
484
485if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
486 {
487 *errorptr = "unknown or incorrect option bit(s) set";
488 return NULL;
489 }
490
Guido van Rossum50700601997-12-08 17:15:20 +0000491/* Caseless can either be from the compiled regex or from options. */
492
493caseless = ((re->options | options) & PCRE_CASELESS) != 0;
494
Thomas Wouters7e474022000-07-16 12:04:32 +0000495/* For an anchored pattern, or an unanchored pattern that has a first char, or a
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000496multiline pattern that matches only at "line starts", no further processing at
497present. */
498
499if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
500 return NULL;
501
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000502/* See if we can find a fixed set of initial characters for the pattern. */
503
Guido van Rossum50700601997-12-08 17:15:20 +0000504memset(start_bits, 0, 32 * sizeof(uschar));
505if (!set_start_bits(re->code, start_bits)) return NULL;
506
507/* If this studying is caseless, scan the created bit map and duplicate the
508bits for any letters. */
509
510if (caseless)
511 {
512 register int c;
513 for (c = 0; c < 256; c++)
514 {
515 if ((start_bits[c/8] & (1 << (c&7))) != 0 &&
516 (pcre_ctypes[c] & ctype_letter) != 0)
517 {
518 int d = pcre_fcc[c];
519 start_bits[d/8] |= (1 << (d&7));
520 }
521 }
522 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000523
524/* Get an "extra" block and put the information therein. */
525
526extra = (real_pcre_extra *)(pcre_malloc)(sizeof(real_pcre_extra));
527
528if (extra == NULL)
529 {
530 *errorptr = "failed to get memory";
531 return NULL;
532 }
Guido van Rossum50700601997-12-08 17:15:20 +0000533
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000534extra->options = PCRE_STUDY_MAPPED | (caseless? PCRE_STUDY_CASELESS : 0);
Guido van Rossum50700601997-12-08 17:15:20 +0000535memcpy(extra->start_bits, start_bits, sizeof(start_bits));
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000536
537return (pcre_extra *)extra;
538}
539
Guido van Rossum50700601997-12-08 17:15:20 +0000540/* End of study.c */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000541/*************************************************
542* Perl-Compatible Regular Expressions *
543*************************************************/
544
545/*
546This is a library of functions to support regular expressions whose syntax
547and semantics are as close as possible to those of the Perl 5 language. See
548the file Tech.Notes for some information on the internals.
549
550Written by: Philip Hazel <ph10@cam.ac.uk>
551
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000552 Copyright (c) 1998 University of Cambridge
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000553
554-----------------------------------------------------------------------------
555Permission is granted to anyone to use this software for any purpose on any
556computer system, and to redistribute it freely, subject to the following
557restrictions:
558
5591. This software is distributed in the hope that it will be useful,
560 but WITHOUT ANY WARRANTY; without even the implied warranty of
561 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
562
5632. The origin of this software must not be misrepresented, either by
564 explicit claim or by omission.
565
5663. Altered versions must be plainly marked as such, and must not be
567 misrepresented as being the original software.
568-----------------------------------------------------------------------------
569*/
570
571
572/* Define DEBUG to get debugging output on stdout. */
573
574/* #define DEBUG */
575
Guido van Rossum557dea11997-12-22 22:46:52 +0000576/* Use a macro for debugging printing, 'cause that eliminates the the use
577of #ifdef inline, and there are *still* stupid compilers about that don't like
578indented pre-processor statements. I suppose it's only been 10 years... */
579
580#ifdef DEBUG
581#define DPRINTF(p) printf p
582#else
583#define DPRINTF(p) /*nothing*/
584#endif
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000585
586/* Include the internals header, which itself includes Standard C headers plus
587the external pcre header. */
588
589
Guido van Rossum50700601997-12-08 17:15:20 +0000590
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000591
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000592#ifndef Py_eval_input
593/* For Python 1.4, graminit.h has to be explicitly included */
594#define Py_eval_input eval_input
Guido van Rossum50700601997-12-08 17:15:20 +0000595
596#endif /* FOR_PYTHON */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000597
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000598/* Allow compilation as C++ source code, should anybody want to do that. */
599
600#ifdef __cplusplus
601#define class pcre_class
602#endif
603
604
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000605/* Min and max values for the common repeats; for the maxima, 0 => infinity */
606
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000607static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
608static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000609
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000610/* Text forms of OP_ values and things, for debugging (not all used) */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000611
612#ifdef DEBUG
Guido van Rossum58132c61997-12-17 00:24:13 +0000613static const char *OP_names[] = {
614 "End", "\\A", "\\B", "\\b", "\\D", "\\d",
Guido van Rossum50700601997-12-08 17:15:20 +0000615 "\\S", "\\s", "\\W", "\\w", "Cut", "\\Z",
616 "localized \\B", "localized \\b", "localized \\W", "localized \\w",
617 "^", "$", "Any", "chars",
618 "not",
619 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000620 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
621 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
622 "*", "*?", "+", "+?", "?", "??", "{", "{",
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000623 "class", "negclass", "classL", "Ref",
Guido van Rossum50700601997-12-08 17:15:20 +0000624 "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", "Once",
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000625 "Brazero", "Braminzero", "Bra"
626};
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000627#endif
628
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000629/* Table for handling escaped characters in the range '0'-'z'. Positive returns
630are simple data values; negative values are for special things like \d and so
631on. Zero means further processing is needed (for things like \x), or the escape
632is invalid. */
633
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000634static const short int escapes[] = {
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000635 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
636 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
637 '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
638 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
639 0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
640 0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
641 '`', 7, -ESC_b, 0, -ESC_d, 0, '\f', 0, /* ` - g */
642 0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
643 0, 0, '\r', -ESC_s, '\t', 0, '\v', -ESC_w, /* p - w */
644 0, 0, 0 /* x - z */
645};
646
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000647/* Definition to allow mutual recursion */
648
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000649static BOOL
650compile_regex(int, int *, uschar **, const uschar **, const char **,
651 PyObject *);
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000652
653/* Structure for passing "static" information around between the functions
654doing the matching, so that they are thread-safe. */
655
656typedef struct match_data {
657 int errorcode; /* As it says */
658 int *offset_vector; /* Offset vector */
659 int offset_end; /* One past the end */
660 BOOL offset_overflow; /* Set if too many extractions */
661 BOOL caseless; /* Case-independent flag */
Guido van Rossum50700601997-12-08 17:15:20 +0000662 BOOL runtime_caseless; /* Caseless forced at run time */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000663 BOOL multiline; /* Multiline flag */
Guido van Rossum50700601997-12-08 17:15:20 +0000664 BOOL notbol; /* NOTBOL flag */
665 BOOL noteol; /* NOTEOL flag */
666 BOOL dotall; /* Dot matches any char */
667 BOOL endonly; /* Dollar not before final \n */
Guido van Rossum58132c61997-12-17 00:24:13 +0000668 const uschar *start_subject; /* Start of the subject string */
669 const uschar *end_subject; /* End of the subject string */
Guido van Rossum50700601997-12-08 17:15:20 +0000670 jmp_buf fail_env; /* Environment for longjump() break out */
Guido van Rossum58132c61997-12-17 00:24:13 +0000671 const uschar *end_match_ptr; /* Subject position at end match */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000672 int end_offset_top; /* Highwater mark at end of match */
Guido van Rossum50700601997-12-08 17:15:20 +0000673 jmp_buf error_env; /* For longjmp() if an error occurs deep inside a
674 matching operation */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000675 int length; /* Length of the allocated stacks */
676 int point; /* Point to add next item pushed onto stacks */
677 /* Pointers to the 6 stacks */
678 int *off_num, *offset_top, *r1, *r2;
Guido van Rossum58132c61997-12-17 00:24:13 +0000679 const uschar **eptr, **ecode;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000680} match_data;
681
682
683
684/*************************************************
Guido van Rossum50700601997-12-08 17:15:20 +0000685* Global variables *
686*************************************************/
687
688/* PCRE is thread-clean and doesn't use any global variables in the normal
689sense. However, it calls memory allocation and free functions via the two
690indirections below, which are can be changed by the caller, but are shared
691between all threads. */
692
693void *(*pcre_malloc)(size_t) = malloc;
694void (*pcre_free)(void *) = free;
695
696
697
698
699/*************************************************
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000700* Return version string *
701*************************************************/
702
Guido van Rossum58132c61997-12-17 00:24:13 +0000703const char *
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000704pcre_version(void)
705{
706return PCRE_VERSION;
707}
708
709
710
711
712/*************************************************
713* Return info about a compiled pattern *
714*************************************************/
715
716/* This function picks potentially useful data out of the private
717structure.
718
719Arguments:
720 external_re points to compiled code
721 optptr where to pass back the options
722 first_char where to pass back the first character,
723 or -1 if multiline and all branches start ^,
724 or -2 otherwise
725
726Returns: number of identifying extraction brackets
727 or negative values on error
728*/
729
730int
Guido van Rossum50700601997-12-08 17:15:20 +0000731pcre_info(const pcre *external_re, int *optptr, int *first_char)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000732{
Guido van Rossum58132c61997-12-17 00:24:13 +0000733const real_pcre *re = (real_pcre *)external_re;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000734if (re == NULL) return PCRE_ERROR_NULL;
735if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
736if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);
737if (first_char != NULL)
738 *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
739 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
740return re->top_bracket;
741}
742
743
744
745
746#ifdef DEBUG
747/*************************************************
748* Debugging function to print chars *
749*************************************************/
750
751/* Print a sequence of chars in printable format, stopping at the end of the
752subject if the requested.
753
754Arguments:
755 p points to characters
756 length number to print
757 is_subject TRUE if printing from within md->start_subject
758 md pointer to matching data block, if is_subject is TRUE
759
760Returns: nothing
761*/
762
Guido van Rossum557dea11997-12-22 22:46:52 +0000763static void
764pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000765{
766int c;
767if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
768while (length-- > 0)
769 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
770}
771#endif
772
773
774
775
776/*************************************************
777* Check subpattern for empty operand *
778*************************************************/
779
780/* This function checks a bracketed subpattern to see if any of the paths
781through it could match an empty string. This is used to diagnose an error if
782such a subpattern is followed by a quantifier with an unlimited upper bound.
783
784Argument:
785 code points to the opening bracket
786
787Returns: TRUE or FALSE
788*/
789
790static BOOL
791could_be_empty(uschar *code)
792{
793do {
794 uschar *cc = code + 3;
795
796 /* Scan along the opcodes for this branch; as soon as we find something
797 that matches a non-empty string, break out and advance to test the next
798 branch. If we get to the end of the branch, return TRUE for the whole
799 sub-expression. */
800
801 for (;;)
802 {
803 /* Test an embedded subpattern; if it could not be empty, break the
804 loop. Otherwise carry on in the branch. */
805
Guido van Rossum50700601997-12-08 17:15:20 +0000806 if ((int)(*cc) >= OP_BRA || (int)(*cc) == OP_ONCE)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000807 {
808 if (!could_be_empty(cc)) break;
809 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
810 cc += 3;
811 }
812
813 else switch (*cc)
814 {
815 /* Reached end of a branch: the subpattern may match the empty string */
816
817 case OP_ALT:
818 case OP_KET:
819 case OP_KETRMAX:
820 case OP_KETRMIN:
821 return TRUE;
822
Guido van Rossumd0f432b1998-02-20 21:45:14 +0000823 /* Skip over entire bracket groups with zero lower bound */
824
825 case OP_BRAZERO:
826 case OP_BRAMINZERO:
827 cc++;
828 /* Fall through */
829
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000830 /* Skip over assertive subpatterns */
831
832 case OP_ASSERT:
833 case OP_ASSERT_NOT:
834 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
835 cc += 3;
836 break;
837
838 /* Skip over things that don't match chars */
839
840 case OP_SOD:
841 case OP_EOD:
842 case OP_CIRC:
843 case OP_DOLL:
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000844 case OP_NOT_WORD_BOUNDARY:
845 case OP_WORD_BOUNDARY:
Guido van Rossum50700601997-12-08 17:15:20 +0000846 case OP_NOT_WORD_BOUNDARY_L:
847 case OP_WORD_BOUNDARY_L:
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000848 cc++;
849 break;
850
851 /* Skip over simple repeats with zero lower bound */
852
853 case OP_STAR:
854 case OP_MINSTAR:
855 case OP_QUERY:
856 case OP_MINQUERY:
Guido van Rossum50700601997-12-08 17:15:20 +0000857 case OP_NOTSTAR:
858 case OP_NOTMINSTAR:
859 case OP_NOTQUERY:
860 case OP_NOTMINQUERY:
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000861 case OP_TYPESTAR:
862 case OP_TYPEMINSTAR:
863 case OP_TYPEQUERY:
864 case OP_TYPEMINQUERY:
865 cc += 2;
866 break;
867
868 /* Skip over UPTOs (lower bound is zero) */
869
870 case OP_UPTO:
871 case OP_MINUPTO:
872 case OP_TYPEUPTO:
873 case OP_TYPEMINUPTO:
874 cc += 4;
875 break;
876
877 /* Check a class or a back reference for a zero minimum */
878
879 case OP_CLASS:
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000880 case OP_NEGCLASS:
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000881 case OP_REF:
Guido van Rossum50700601997-12-08 17:15:20 +0000882 case OP_CLASS_L:
883 switch(*cc)
884 {
885 case (OP_REF): cc += 2; break;
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000886 case (OP_CLASS): case (OP_NEGCLASS): cc += 1+32; break;
Guido van Rossum50700601997-12-08 17:15:20 +0000887 case (OP_CLASS_L): cc += 1+1+32; break;
888 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000889
890 switch (*cc)
891 {
892 case OP_CRSTAR:
893 case OP_CRMINSTAR:
894 case OP_CRQUERY:
895 case OP_CRMINQUERY:
896 cc++;
897 break;
898
899 case OP_CRRANGE:
900 case OP_CRMINRANGE:
901 if ((cc[1] << 8) + cc[2] != 0) goto NEXT_BRANCH;
902 cc += 3;
903 break;
904
905 default:
906 goto NEXT_BRANCH;
907 }
908 break;
909
910 /* Anything else matches at least one character */
911
912 default:
913 goto NEXT_BRANCH;
914 }
915 }
916
917 NEXT_BRANCH:
918 code += (code[1] << 8) + code[2];
919 }
920while (*code == OP_ALT);
921
922/* No branches match the empty string */
923
924return FALSE;
925}
926
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000927/* Determine the length of a group ID in an expression like
928 (?P<foo_123>...)
929Arguments:
930 ptr pattern position pointer (say that 3 times fast)
931 finalchar the character that will mark the end of the ID
932 errorptr points to the pointer to the error message
933*/
934
935static int
Guido van Rossum58132c61997-12-17 00:24:13 +0000936get_group_id(const uschar *ptr, char finalchar, const char **errorptr)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000937{
Guido van Rossum58132c61997-12-17 00:24:13 +0000938 const uschar *start = ptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000939
940 /* If the first character is not in \w, or is in \w but is a digit,
941 report an error */
942 if (!(pcre_ctypes[*ptr] & ctype_word) ||
943 (pcre_ctypes[*ptr++] & ctype_digit))
944 {
945 *errorptr = "(?P identifier must start with a letter or underscore";
946 return 0;
947 }
948
949 /* Increment ptr until we either hit a null byte, the desired
950 final character, or a non-word character */
951 for(; (*ptr != 0) && (*ptr != finalchar) &&
952 (pcre_ctypes[*ptr] & ctype_word); ptr++)
953 {
Guido van Rossumc3861071997-10-08 02:07:40 +0000954 /* Empty loop body */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000955 }
956 if (*ptr==finalchar)
957 return ptr-start;
958 if (*ptr==0)
959 {
960 *errorptr = "unterminated (?P identifier";
961 return 0;
962 }
963 *errorptr = "illegal character in (?P identifier";
964 return 0;
965}
966
967/*************************************************
968* Handle escapes *
969*************************************************/
970
971/* This function is called when a \ has been encountered. It either returns a
972positive value for a simple escape such as \n, or a negative value which
973encodes one of the more complicated things such as \d. On entry, ptr is
974pointing at the \. On exit, it is on the final character of the escape
975sequence.
976
977Arguments:
978 ptrptr points to the pattern position pointer
979 errorptr points to the pointer to the error message
Guido van Rossum50700601997-12-08 17:15:20 +0000980 bracount number of previous extracting brackets
981 options the options bits
982 isclass TRUE if inside a character class
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000983
984Returns: zero or positive => a data character
985 negative => a special escape sequence
986 on error, errorptr is set
987*/
988
989static int
Guido van Rossum58132c61997-12-17 00:24:13 +0000990check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
991 int options, BOOL isclass)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000992{
Guido van Rossum58132c61997-12-17 00:24:13 +0000993const uschar *ptr = *ptrptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000994int c = *(++ptr) & 255; /* Ensure > 0 on signed-char systems */
995int i;
996
Guido van Rossum50700601997-12-08 17:15:20 +0000997if (c == 0) *errorptr = ERR1;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000998
999/* Digits or letters may have special meaning; all others are literals. */
1000
1001else if (c < '0' || c > 'z') {}
1002
1003/* Do an initial lookup in a table. A non-zero result is something that can be
1004returned immediately. Otherwise further processing may be required. */
1005
1006else if ((i = escapes[c - '0']) != 0) c = i;
1007
1008/* Escapes that need further processing, or are illegal. */
1009
Guido van Rossum50700601997-12-08 17:15:20 +00001010else
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001011 {
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001012
Guido van Rossum50700601997-12-08 17:15:20 +00001013 switch (c)
1014 {
1015 /* The handling of escape sequences consisting of a string of digits
1016 starting with one that is not zero is not straightforward. By experiment,
1017 the way Perl works seems to be as follows:
1018
1019 Outside a character class, the digits are read as a decimal number. If the
1020 number is less than 10, or if there are that many previous extracting
1021 left brackets, then it is a back reference. Otherwise, up to three octal
1022 digits are read to form an escaped byte. Thus \123 is likely to be octal
1023 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
1024 value is greater than 377, the least significant 8 bits are taken. Inside a
1025 character class, \ followed by a digit is always an octal number. */
1026
1027 case '1': case '2': case '3': case '4': case '5':
1028 case '6': case '7': case '8': case '9':
1029
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001030 {
1031 /* PYTHON: Try to compute an octal value for a character */
Guido van Rossum042ff9e1998-04-03 21:13:31 +00001032 for(c=0, i=0; ptr[i]!=0 && i<3; i++)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001033 {
1034 if (( pcre_ctypes[ ptr[i] ] & ctype_odigit) != 0)
Andrew M. Kuchling3b96d0b2000-08-02 13:41:18 +00001035 c = (c * 8 + ptr[i]-'0') & 255;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001036 else
Guido van Rossum042ff9e1998-04-03 21:13:31 +00001037 break; /* Non-octal character--break out of the loop */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001038 }
Guido van Rossum042ff9e1998-04-03 21:13:31 +00001039 /* It's a character if there were exactly 3 octal digits, or if
1040 we're inside a character class and there was at least one
1041 octal digit. */
1042 if ( (i == 3) || (isclass && i!=0) )
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001043 {
1044 ptr += i-1;
1045 break;
1046 }
1047 c = ptr[0]; /* Restore the first character after the \ */
1048 c -= '0'; i = 1;
1049 while (i<2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0)
1050 {
1051 c = c * 10 + ptr[1] - '0';
1052 ptr++; i++;
1053 }
1054 if (c > 255 - ESC_REF) *errorptr = "back reference too big";
1055 c = -(ESC_REF + c);
1056 }
1057 break;
1058
Guido van Rossum50700601997-12-08 17:15:20 +00001059 /* \0 always starts an octal number, but we may drop through to here with a
1060 larger first octal digit */
1061
1062 case '0':
1063 c -= '0';
1064 while(i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0 &&
1065 ptr[1] != '8' && ptr[1] != '9')
Andrew M. Kuchling94c34522000-06-01 03:02:48 +00001066 c = (c * 8 + *(++ptr) - '0') & 255;
Guido van Rossum50700601997-12-08 17:15:20 +00001067 break;
1068
1069 /* Special escapes not starting with a digit are straightforward */
1070
1071 case 'x':
1072 c = 0;
1073 while ( (pcre_ctypes[ptr[1]] & ctype_xdigit) != 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001074 {
Guido van Rossum50700601997-12-08 17:15:20 +00001075 ptr++;
1076 c = c * 16 + pcre_lcc[*ptr] -
1077 (((pcre_ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
1078 c &= 255;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001079 }
1080 break;
1081
1082
Guido van Rossum50700601997-12-08 17:15:20 +00001083 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1084 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
1085 for Perl compatibility, it is a literal. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001086
Guido van Rossum50700601997-12-08 17:15:20 +00001087 default:
1088 if ((options & PCRE_EXTRA) != 0) switch(c)
1089 {
1090 case 'X':
1091 c = -ESC_X; /* This could be a lookup if it ever got into Perl */
1092 break;
1093
1094 default:
1095 *errorptr = ERR3;
1096 break;
1097 }
1098 break;
1099 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001100 }
1101
1102*ptrptr = ptr;
1103return c;
1104}
1105
1106
1107
1108/*************************************************
Guido van Rossum50700601997-12-08 17:15:20 +00001109* Check for counted repeat *
1110*************************************************/
1111
1112/* This function is called when a '{' is encountered in a place where it might
1113start a quantifier. It looks ahead to see if it really is a quantifier or not.
1114It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
1115where the ddds are digits.
1116
1117Arguments:
1118 p pointer to the first char after '{'
1119
1120Returns: TRUE or FALSE
1121*/
1122
1123static BOOL
Guido van Rossum58132c61997-12-17 00:24:13 +00001124is_counted_repeat(const uschar *p)
Guido van Rossum50700601997-12-08 17:15:20 +00001125{
1126if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
1127while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
1128if (*p == '}') return TRUE;
1129
1130if (*p++ != ',') return FALSE;
1131if (*p == '}') return TRUE;
1132
1133if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
1134while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
1135return (*p == '}');
1136}
1137
1138
1139
1140/*************************************************
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001141* Read repeat counts *
1142*************************************************/
1143
Guido van Rossum50700601997-12-08 17:15:20 +00001144/* Read an item of the form {n,m} and return the values. This is called only
1145after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1146so the syntax is guaranteed to be correct, but we need to check the values.
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001147
1148Arguments:
1149 p pointer to first char after '{'
1150 minp pointer to int for min
1151 maxp pointer to int for max
1152 returned as -1 if no max
1153 errorptr points to pointer to error message
1154
1155Returns: pointer to '}' on success;
1156 current ptr on error, with errorptr set
1157*/
1158
Guido van Rossum58132c61997-12-17 00:24:13 +00001159static const uschar *
1160read_repeat_counts(const uschar *p, int *minp, int *maxp, const char **errorptr)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001161{
1162int min = 0;
1163int max = -1;
1164
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001165while ((pcre_ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
1166
1167if (*p == '}') max = min; else
1168 {
Guido van Rossum50700601997-12-08 17:15:20 +00001169 if (*(++p) != '}')
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001170 {
1171 max = 0;
1172 while((pcre_ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001173 if (max < min)
1174 {
Guido van Rossum50700601997-12-08 17:15:20 +00001175 *errorptr = ERR4;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001176 return p;
1177 }
1178 }
1179 }
1180
1181/* Do paranoid checks, then fill in the required variables, and pass back the
1182pointer to the terminating '}'. */
1183
Guido van Rossum50700601997-12-08 17:15:20 +00001184if (min > 65535 || max > 65535)
1185 *errorptr = ERR5;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001186else
1187 {
1188 *minp = min;
1189 *maxp = max;
1190 }
1191return p;
1192}
1193
1194
1195
1196/*************************************************
1197* Compile one branch *
1198*************************************************/
1199
1200/* Scan the pattern, compiling it into the code vector.
1201
1202Arguments:
Guido van Rossum50700601997-12-08 17:15:20 +00001203 options the option bits
1204 bracket points to number of brackets used
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001205 code points to the pointer to the current code point
1206 ptrptr points to the current pattern pointer
1207 errorptr points to pointer to error message
1208
1209Returns: TRUE on success
1210 FALSE, with *errorptr set on error
1211*/
1212
1213static BOOL
Guido van Rossum50700601997-12-08 17:15:20 +00001214compile_branch(int options, int *brackets, uschar **codeptr,
Guido van Rossum58132c61997-12-17 00:24:13 +00001215 const uschar **ptrptr, const char **errorptr, PyObject *dictionary)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001216{
1217int repeat_type, op_type;
1218int repeat_min, repeat_max;
1219int bravalue, length;
Guido van Rossumdda66961998-05-07 15:32:44 +00001220int greedy_default, greedy_non_default;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001221register int c;
1222register uschar *code = *codeptr;
Guido van Rossum58132c61997-12-17 00:24:13 +00001223const uschar *ptr = *ptrptr;
1224const uschar *oldptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001225uschar *previous = NULL;
Guido van Rossum50700601997-12-08 17:15:20 +00001226uschar class[32];
1227uschar *class_flag; /* Pointer to the single-byte flag for OP_CLASS_L */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001228
Guido van Rossumdda66961998-05-07 15:32:44 +00001229/* Set up the default and non-default settings for greediness */
1230
1231greedy_default = ((options & PCRE_UNGREEDY) != 0);
1232greedy_non_default = greedy_default ^ 1;
1233
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001234/* Switch on next character until the end of the branch */
1235
1236for (;; ptr++)
1237 {
Guido van Rossum50700601997-12-08 17:15:20 +00001238 BOOL negate_class;
1239 int class_charcount;
1240 int class_lastchar;
1241
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001242 c = *ptr;
Guido van Rossum50700601997-12-08 17:15:20 +00001243 if ((options & PCRE_EXTENDED) != 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001244 {
1245 if ((pcre_ctypes[c] & ctype_space) != 0) continue;
1246 if (c == '#')
1247 {
1248 while ((c = *(++ptr)) != 0 && c != '\n');
1249 continue;
1250 }
1251 }
1252
1253 switch(c)
1254 {
1255 /* The branch terminates at end of string, |, or ). */
1256
1257 case 0:
1258 case '|':
1259 case ')':
1260 *codeptr = code;
1261 *ptrptr = ptr;
1262 return TRUE;
1263
1264 /* Handle single-character metacharacters */
1265
1266 case '^':
1267 previous = NULL;
1268 *code++ = OP_CIRC;
1269 break;
1270
1271 case '$':
1272 previous = NULL;
1273 *code++ = OP_DOLL;
1274 break;
1275
1276 case '.':
1277 previous = code;
1278 *code++ = OP_ANY;
1279 break;
1280
Guido van Rossum50700601997-12-08 17:15:20 +00001281 /* Character classes. These always build a 32-byte bitmap of the permitted
1282 characters, except in the special case where there is only one character.
1283 For negated classes, we build the map as usual, then invert it at the end.
1284 */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001285
1286 case '[':
Guido van Rossum50700601997-12-08 17:15:20 +00001287 previous = code;
1288 if (options & PCRE_LOCALE)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001289 {
Guido van Rossum50700601997-12-08 17:15:20 +00001290 *code++ = OP_CLASS_L;
1291 /* Set the flag for localized classes (like \w) to 0 */
1292 class_flag = code;
1293 *class_flag = 0;
1294 }
1295 else
1296 {
1297 *code++ = OP_CLASS;
1298 class_flag = NULL;
1299 }
1300
Guido van Rossum042ff9e1998-04-03 21:13:31 +00001301 /* If the first character is '^', set the negation flag, and use a
1302 different opcode. This only matters if caseless matching is specified at
1303 runtime. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001304
Guido van Rossum50700601997-12-08 17:15:20 +00001305 if ((c = *(++ptr)) == '^')
1306 {
1307 negate_class = TRUE;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00001308 if (*(code-1)==OP_CLASS) *(code-1) = OP_NEGCLASS;
Guido van Rossum50700601997-12-08 17:15:20 +00001309 c = *(++ptr);
1310 }
1311 else negate_class = FALSE;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001312
Guido van Rossum50700601997-12-08 17:15:20 +00001313 /* Keep a count of chars so that we can optimize the case of just a single
1314 character. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001315
Guido van Rossum50700601997-12-08 17:15:20 +00001316 class_charcount = 0;
1317 class_lastchar = -1;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001318
Guido van Rossum50700601997-12-08 17:15:20 +00001319 /* Initialize the 32-char bit map to all zeros. We have to build the
1320 map in a temporary bit of store, in case the class contains only 1
1321 character, because in that case the compiled code doesn't use the
1322 bit map. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001323
Guido van Rossum50700601997-12-08 17:15:20 +00001324 memset(class, 0, 32 * sizeof(uschar));
1325
1326 /* Process characters until ] is reached. By writing this as a "do" it
1327 means that an initial ] is taken as a data character. */
1328
1329 do
1330 {
1331 if (c == 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001332 {
Guido van Rossum50700601997-12-08 17:15:20 +00001333 *errorptr = ERR6;
1334 goto FAILED;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001335 }
1336
Guido van Rossum50700601997-12-08 17:15:20 +00001337 /* Backslash may introduce a single character, or it may introduce one
1338 of the specials, which just set a flag. Escaped items are checked for
1339 validity in the pre-compiling pass. The sequence \b is a special case.
Guido van Rossum58132c61997-12-17 00:24:13 +00001340 Inside a class (and only there) it is treated as backspace. Elsewhere
Guido van Rossum50700601997-12-08 17:15:20 +00001341 it marks a word boundary. Other escapes have preset maps ready to
1342 or into the one we are building. We assume they have more than one
1343 character in them, so set class_count bigger than one. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001344
Guido van Rossum50700601997-12-08 17:15:20 +00001345 if (c == '\\')
1346 {
1347 c = check_escape(&ptr, errorptr, *brackets, options, TRUE);
1348 if (-c == ESC_b) c = '\b';
1349 else if (c < 0)
1350 {
1351 class_charcount = 10;
1352 switch (-c)
1353 {
1354 case ESC_d:
Guido van Rossum50700601997-12-08 17:15:20 +00001355 {
1356 for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_digit];
1357 }
1358 continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001359
Guido van Rossum50700601997-12-08 17:15:20 +00001360 case ESC_D:
Guido van Rossum50700601997-12-08 17:15:20 +00001361 {
1362 for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_digit];
1363 }
1364 continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001365
Guido van Rossum50700601997-12-08 17:15:20 +00001366 case ESC_w:
1367 if (options & PCRE_LOCALE)
1368 {
1369 *class_flag |= 1;
1370 }
1371 else
1372 {
1373 for (c = 0; c < 32; c++)
1374 class[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
1375 }
1376 continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001377
Guido van Rossum50700601997-12-08 17:15:20 +00001378 case ESC_W:
1379 if (options & PCRE_LOCALE)
1380 {
1381 *class_flag |= 2;
1382 }
1383 else
1384 {
1385 for (c = 0; c < 32; c++)
1386 class[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
1387 }
1388 continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001389
Guido van Rossum50700601997-12-08 17:15:20 +00001390 case ESC_s:
Guido van Rossum50700601997-12-08 17:15:20 +00001391 {
1392 for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_space];
1393 }
1394 continue;
1395
1396 case ESC_S:
Guido van Rossum50700601997-12-08 17:15:20 +00001397 {
1398 for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_space];
1399 }
1400 continue;
1401
1402 default:
1403 *errorptr = ERR7;
1404 goto FAILED;
1405 }
1406 }
1407 /* Fall through if single character */
1408 }
1409
1410 /* A single character may be followed by '-' to form a range. However,
1411 Perl does not permit ']' to be the end of the range. A '-' character
1412 here is treated as a literal. */
1413
1414 if (ptr[1] == '-' && ptr[2] != ']')
1415 {
1416 int d;
1417 ptr += 2;
1418 d = *ptr;
1419
1420 if (d == 0)
1421 {
1422 *errorptr = ERR6;
1423 goto FAILED;
1424 }
1425
1426 /* The second part of a range can be a single-character escape, but
1427 not any of the other escapes. */
1428
1429 if (d == '\\')
1430 {
1431 d = check_escape(&ptr, errorptr, *brackets, options, TRUE);
1432 if (d < 0)
1433 {
1434 if (d == -ESC_b) d = '\b'; else
1435 {
1436 *errorptr = ERR7;
1437 goto FAILED;
1438 }
1439 }
1440 }
1441
1442 if (d < c)
1443 {
1444 *errorptr = ERR8;
1445 goto FAILED;
1446 }
1447
1448 for (; c <= d; c++)
1449 {
1450 class[c/8] |= (1 << (c&7));
1451 if ((options & PCRE_CASELESS) != 0)
1452 {
1453 int uc = pcre_fcc[c]; /* flip case */
1454 class[uc/8] |= (1 << (uc&7));
1455 }
1456 class_charcount++; /* in case a one-char range */
1457 class_lastchar = c;
1458 }
1459 continue; /* Go get the next char in the class */
1460 }
1461
1462 /* Handle a lone single character - we can get here for a normal
1463 non-escape char, or after \ that introduces a single character. */
1464
1465 class [c/8] |= (1 << (c&7));
1466 if ((options & PCRE_CASELESS) != 0)
1467 {
1468 c = pcre_fcc[c]; /* flip case */
1469 class[c/8] |= (1 << (c&7));
1470 }
1471 class_charcount++;
1472 class_lastchar = c;
1473 }
1474
1475 /* Loop until ']' reached; the check for end of string happens inside the
1476 loop. This "while" is the end of the "do" above. */
1477
1478 while ((c = *(++ptr)) != ']');
1479
1480 /* If class_charcount is 1 and class_lastchar is not negative, we saw
1481 precisely one character. This doesn't need the whole 32-byte bit map.
1482 We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
1483 it's negative. */
1484
1485 if (class_charcount == 1 && class_lastchar >= 0)
1486 {
1487 if (negate_class)
1488 {
1489 code[-1] = OP_NOT;
1490 }
1491 else
1492 {
1493 code[-1] = OP_CHARS;
1494 *code++ = 1;
1495 }
1496 *code++ = class_lastchar;
1497 }
1498
1499 /* Otherwise, negate the 32-byte map if necessary, and copy it into
1500 the code vector. */
1501
1502 else
1503 {
1504 /* If this is a localized opcode, bump the code pointer up */
1505 if (class_flag) code++;
1506 if (negate_class)
1507 {
1508 if (class_flag) *class_flag = (*class_flag) ^ 63;
1509 for (c = 0; c < 32; c++) code[c] = ~class[c];
1510 }
1511 else
1512 memcpy(code, class, 32);
1513 code += 32;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001514 }
1515 break;
1516
1517 /* Various kinds of repeat */
1518
1519 case '{':
Guido van Rossum50700601997-12-08 17:15:20 +00001520 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001521 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
1522 if (*errorptr != NULL) goto FAILED;
1523 goto REPEAT;
1524
1525 case '*':
1526 repeat_min = 0;
1527 repeat_max = -1;
1528 goto REPEAT;
1529
1530 case '+':
1531 repeat_min = 1;
1532 repeat_max = -1;
1533 goto REPEAT;
1534
1535 case '?':
1536 repeat_min = 0;
1537 repeat_max = 1;
1538
1539 REPEAT:
1540 if (previous == NULL)
1541 {
Guido van Rossum50700601997-12-08 17:15:20 +00001542 *errorptr = ERR9;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001543 goto FAILED;
1544 }
1545
Guido van Rossumdda66961998-05-07 15:32:44 +00001546 /* If the next character is '?' this is a minimizing repeat, by default,
1547 but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001548 next character. */
1549
Guido van Rossumdda66961998-05-07 15:32:44 +00001550 if (ptr[1] == '?')
1551 { repeat_type = greedy_non_default; ptr++; }
1552 else repeat_type = greedy_default;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001553
Guido van Rossum50700601997-12-08 17:15:20 +00001554 /* If the maximum is zero then the minimum must also be zero; Perl allows
1555 this case, so we do too - by simply omitting the item altogether. */
1556
1557 if (repeat_max == 0) code = previous;
1558
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001559 /* If previous was a string of characters, chop off the last one and use it
1560 as the subject of the repeat. If there was only one character, we can
1561 abolish the previous item altogether. */
1562
Guido van Rossum50700601997-12-08 17:15:20 +00001563 else if (*previous == OP_CHARS)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001564 {
1565 int len = previous[1];
1566 if (len == 1)
1567 {
1568 c = previous[2];
1569 code = previous;
1570 }
1571 else
1572 {
1573 c = previous[len+1];
1574 previous[1]--;
1575 code--;
1576 }
1577 op_type = 0; /* Use single-char op codes */
1578 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
1579 }
1580
Guido van Rossum50700601997-12-08 17:15:20 +00001581 /* If previous was a single negated character ([^a] or similar), we use
1582 one of the special opcodes, replacing it. The code is shared with single-
1583 character repeats by adding a suitable offset into repeat_type. */
1584
1585 else if ((int)*previous == OP_NOT)
1586 {
1587 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
1588 c = previous[1];
1589 code = previous;
1590 goto OUTPUT_SINGLE_REPEAT;
1591 }
1592
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001593 /* If previous was a character type match (\d or similar), abolish it and
1594 create a suitable repeat item. The code is shared with single-character
1595 repeats by adding a suitable offset into repeat_type. */
1596
Guido van Rossum50700601997-12-08 17:15:20 +00001597 else if ((int)*previous < OP_CIRC || *previous == OP_ANY)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001598 {
1599 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
1600 c = *previous;
1601 code = previous;
1602
1603 OUTPUT_SINGLE_REPEAT:
1604 repeat_type += op_type; /* Combine both values for many cases */
1605
1606 /* A minimum of zero is handled either as the special case * or ?, or as
1607 an UPTO, with the maximum given. */
1608
1609 if (repeat_min == 0)
1610 {
1611 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
1612 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
1613 else
1614 {
1615 *code++ = OP_UPTO + repeat_type;
1616 *code++ = repeat_max >> 8;
1617 *code++ = (repeat_max & 255);
1618 }
1619 }
1620
1621 /* The case {1,} is handled as the special case + */
1622
1623 else if (repeat_min == 1 && repeat_max == -1)
1624 *code++ = OP_PLUS + repeat_type;
1625
1626 /* The case {n,n} is just an EXACT, while the general case {n,m} is
1627 handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
1628
1629 else
1630 {
1631 if (repeat_min != 1)
1632 {
1633 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
1634 *code++ = repeat_min >> 8;
1635 *code++ = (repeat_min & 255);
1636 }
1637
Thomas Wouters7e474022000-07-16 12:04:32 +00001638 /* If the minimum is 1 and the previous item was a character string,
1639 we either have to put back the item that got canceled if the string
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001640 length was 1, or add the character back onto the end of a longer
Guido van Rossumdda66961998-05-07 15:32:44 +00001641 string. For a character type nothing need be done; it will just get
1642 put back naturally. Note that the final character is always going to
1643 get added below. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001644
1645 else if (*previous == OP_CHARS)
1646 {
1647 if (code == previous) code += 2; else previous[1]++;
1648 }
1649
Guido van Rossumdda66961998-05-07 15:32:44 +00001650 /* For a single negated character we also have to put back the
Thomas Wouters7e474022000-07-16 12:04:32 +00001651 item that got canceled. */
Guido van Rossumdda66961998-05-07 15:32:44 +00001652
1653 else if (*previous == OP_NOT) code++;
1654
Guido van Rossum557dea11997-12-22 22:46:52 +00001655 /* If the maximum is unlimited, insert an OP_STAR. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001656
Guido van Rossum557dea11997-12-22 22:46:52 +00001657 if (repeat_max < 0)
1658 {
1659 *code++ = c;
1660 *code++ = OP_STAR + repeat_type;
1661 }
1662
1663 /* Else insert an UPTO if the max is greater than the min. */
1664
1665 else if (repeat_max != repeat_min)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001666 {
1667 *code++ = c;
1668 repeat_max -= repeat_min;
1669 *code++ = OP_UPTO + repeat_type;
1670 *code++ = repeat_max >> 8;
1671 *code++ = (repeat_max & 255);
1672 }
1673 }
1674
1675 /* The character or character type itself comes last in all cases. */
1676
1677 *code++ = c;
1678 }
1679
1680 /* If previous was a character class or a back reference, we put the repeat
1681 stuff after it. */
1682
Guido van Rossum042ff9e1998-04-03 21:13:31 +00001683 else if (*previous == OP_CLASS || *previous == OP_NEGCLASS ||
1684 *previous==OP_CLASS_L || *previous == OP_REF)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001685 {
1686 if (repeat_min == 0 && repeat_max == -1)
1687 *code++ = OP_CRSTAR + repeat_type;
1688 else if (repeat_min == 1 && repeat_max == -1)
1689 *code++ = OP_CRPLUS + repeat_type;
1690 else if (repeat_min == 0 && repeat_max == 1)
1691 *code++ = OP_CRQUERY + repeat_type;
1692 else
1693 {
1694 *code++ = OP_CRRANGE + repeat_type;
1695 *code++ = repeat_min >> 8;
1696 *code++ = repeat_min & 255;
1697 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
1698 *code++ = repeat_max >> 8;
1699 *code++ = repeat_max & 255;
1700 }
1701 }
1702
1703 /* If previous was a bracket group, we may have to replicate it in certain
1704 cases. If the maximum repeat count is unlimited, check that the bracket
1705 group cannot match the empty string, and diagnose an error if it can. */
1706
1707 else if ((int)*previous >= OP_BRA)
1708 {
1709 int i;
Guido van Rossum557dea11997-12-22 22:46:52 +00001710 int len = code - previous;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001711
1712 if (repeat_max == -1 && could_be_empty(previous))
1713 {
Guido van Rossum50700601997-12-08 17:15:20 +00001714 *errorptr = ERR10;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001715 goto FAILED;
1716 }
1717
1718 /* If the minimum is greater than zero, and the maximum is unlimited or
1719 equal to the minimum, the first copy remains where it is, and is
1720 replicated up to the minimum number of times. This case includes the +
1721 repeat, but of course no replication is needed in that case. */
1722
1723 if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))
1724 {
1725 for (i = 1; i < repeat_min; i++)
1726 {
Guido van Rossum557dea11997-12-22 22:46:52 +00001727 memcpy(code, previous, len);
1728 code += len;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001729 }
1730 }
1731
1732 /* If the minimum is zero, stick BRAZERO in front of the first copy.
1733 Then, if there is a fixed upper limit, replicated up to that many times,
1734 sticking BRAZERO in front of all the optional ones. */
1735
1736 else
1737 {
1738 if (repeat_min == 0)
1739 {
Guido van Rossum557dea11997-12-22 22:46:52 +00001740 memmove(previous+1, previous, len);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001741 code++;
1742 *previous++ = OP_BRAZERO + repeat_type;
1743 }
1744
1745 for (i = 1; i < repeat_min; i++)
1746 {
Guido van Rossum557dea11997-12-22 22:46:52 +00001747 memcpy(code, previous, len);
1748 code += len;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001749 }
1750
1751 for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)
1752 {
1753 *code++ = OP_BRAZERO + repeat_type;
Guido van Rossum557dea11997-12-22 22:46:52 +00001754 memcpy(code, previous, len);
1755 code += len;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001756 }
1757 }
1758
1759 /* If the maximum is unlimited, set a repeater in the final copy. */
1760
1761 if (repeat_max == -1) code[-3] = OP_KETRMAX + repeat_type;
1762 }
1763
1764 /* Else there's some kind of shambles */
1765
1766 else
1767 {
Guido van Rossum50700601997-12-08 17:15:20 +00001768 *errorptr = ERR11;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001769 goto FAILED;
1770 }
1771
1772 /* In all case we no longer have a previous item. */
1773
1774 previous = NULL;
1775 break;
1776
1777
1778 /* Start of nested bracket sub-expression, or comment or lookahead.
1779 First deal with special things that can come after a bracket; all are
1780 introduced by ?, and the appearance of any of them means that this is not a
1781 referencing group. They were checked for validity in the first pass over
1782 the string, so we don't have to check for syntax errors here. */
1783
1784 case '(':
1785 previous = code; /* Only real brackets can be repeated */
1786 if (*(++ptr) == '?')
1787 {
1788 bravalue = OP_BRA;
1789
1790 switch (*(++ptr))
1791 {
1792 case '#':
1793 case 'i':
Guido van Rossumbd49ac41997-12-10 23:05:53 +00001794 case 'L':
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001795 case 'm':
1796 case 's':
1797 case 'x':
1798 ptr++;
1799 while (*ptr != ')') ptr++;
1800 previous = NULL;
1801 continue;
1802
1803 case ':': /* Non-extracting bracket */
1804 ptr++;
1805 break;
1806
1807 case '=': /* Assertions can't be repeated */
1808 bravalue = OP_ASSERT;
1809 ptr++;
1810 previous = NULL;
1811 break;
1812
1813 case '!':
1814 bravalue = OP_ASSERT_NOT;
1815 ptr++;
1816 previous = NULL;
1817 break;
1818
1819 case ('P'):
1820 ptr++;
1821 if (*ptr=='<')
1822 {
1823 /* (?P<groupname>...) */
1824 int idlen;
1825 PyObject *string, *intobj;
1826
1827 ptr++;
1828 idlen = get_group_id(ptr, '>', errorptr);
1829 if (*errorptr) {
1830 goto FAILED;
1831 }
Guido van Rossum57ba4f31997-12-02 20:40:28 +00001832 string = PyString_FromStringAndSize((char*)ptr, idlen);
Guido van Rossum50700601997-12-08 17:15:20 +00001833 intobj = PyInt_FromLong( brackets[0] + 1 );
Guido van Rossum58132c61997-12-17 00:24:13 +00001834 if (intobj == NULL || string == NULL)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001835 {
1836 Py_XDECREF(string);
1837 Py_XDECREF(intobj);
1838 *errorptr = "exception raised";
1839 goto FAILED;
1840 }
1841 PyDict_SetItem(dictionary, string, intobj);
Guido van Rossum58132c61997-12-17 00:24:13 +00001842 Py_DECREF(string); Py_DECREF(intobj); /* XXX DECREF commented out! */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001843 ptr += idlen+1; /* Point to rest of expression */
1844 goto do_grouping_bracket;
1845 }
1846 if (*ptr=='=')
1847 {
1848 /* (?P=groupname) */
1849 int idlen, refnum;
1850 PyObject *string, *intobj;
1851
1852 ptr++;
1853 idlen = get_group_id(ptr, ')', errorptr);
1854 if (*errorptr) {
1855 goto FAILED;
1856 }
Guido van Rossum50700601997-12-08 17:15:20 +00001857 string = PyString_FromStringAndSize((char *)ptr, idlen);
Guido van Rossumc3861071997-10-08 02:07:40 +00001858 if (string==NULL) {
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001859 *errorptr = "exception raised";
1860 goto FAILED;
1861 }
1862 intobj = PyDict_GetItem(dictionary, string);
1863 if (intobj==NULL) {
Guido van Rossumc3861071997-10-08 02:07:40 +00001864 Py_DECREF(string);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001865 *errorptr = "?P= group identifier isn't defined";
1866 goto FAILED;
1867 }
1868
1869 refnum = PyInt_AsLong(intobj);
Guido van Rossum1eadb411997-12-15 17:33:24 +00001870 Py_DECREF(string);
Guido van Rossum58132c61997-12-17 00:24:13 +00001871 /* The caller doesn't own the reference to the value
1872 returned from PyDict_GetItem, so intobj is not
1873 DECREF'ed. */
1874
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001875 *code++ = OP_REF;
1876 *code++ = refnum;
1877 /* The continue will cause the top-level for() loop to
1878 be resumed, so ptr will be immediately incremented.
1879 Therefore, the following line adds just idlen, not
1880 idlen+1 */
1881 ptr += idlen;
1882 continue;
1883 }
1884 /* The character after ?P is neither < nor =, so
1885 report an error. Add more Python-extensions here. */
1886 *errorptr="unknown after (?P";
1887 goto FAILED;
Guido van Rossum50700601997-12-08 17:15:20 +00001888
1889 case '>': /* "Match once" brackets */
1890 if ((options & PCRE_EXTRA) != 0) /* Not yet standard */
1891 {
1892 bravalue = OP_ONCE;
1893 ptr++;
1894 previous = NULL;
1895 break;
1896 }
1897 /* Else fall through */
1898
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001899 default:
Guido van Rossum50700601997-12-08 17:15:20 +00001900 *errorptr = ERR12;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001901 goto FAILED;
1902 }
1903 }
1904
1905 /* Else we have a referencing group */
1906
1907 else
1908 {
1909 do_grouping_bracket:
Guido van Rossum50700601997-12-08 17:15:20 +00001910 if (++(*brackets) > EXTRACT_MAX)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001911 {
Guido van Rossum50700601997-12-08 17:15:20 +00001912 *errorptr = ERR13;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001913 goto FAILED;
1914 }
Guido van Rossum50700601997-12-08 17:15:20 +00001915 bravalue = OP_BRA + *brackets;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001916 }
1917
1918 /* Process nested bracketed re; at end pointer is on the bracket. We copy
1919 code into a non-register variable in order to be able to pass its address
1920 because some compilers complain otherwise. */
1921
1922 *code = bravalue;
1923 {
1924 uschar *mcode = code;
Guido van Rossum50700601997-12-08 17:15:20 +00001925 if (!compile_regex(options, brackets, &mcode, &ptr, errorptr, dictionary))
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001926 goto FAILED;
1927 code = mcode;
1928 }
1929
1930 if (*ptr != ')')
1931 {
Guido van Rossum50700601997-12-08 17:15:20 +00001932 *errorptr = ERR14;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001933 goto FAILED;
1934 }
1935 break;
1936
1937 /* Check \ for being a real metacharacter; if not, fall through and handle
1938 it as a data character at the start of a string. Escape items are checked
1939 for validity in the pre-compiling pass. */
1940
1941 case '\\':
1942 oldptr = ptr;
Guido van Rossum50700601997-12-08 17:15:20 +00001943 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001944
1945 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1946 are arranged to be the negation of the corresponding OP_values. For the
1947 back references, the values are ESC_REF plus the reference number. Only
1948 back references and those types that consume a character may be repeated.
1949 We can test for values between ESC_b and ESC_Z for the latter; this may
1950 have to change if any new ones are ever created. */
1951
1952 if (c < 0)
1953 {
1954 if (-c >= ESC_REF)
1955 {
Guido van Rossum50700601997-12-08 17:15:20 +00001956 int refnum = -c - ESC_REF;
1957 if (*brackets < refnum)
1958 {
1959 *errorptr = ERR15;
1960 goto FAILED;
1961 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001962 previous = code;
1963 *code++ = OP_REF;
1964 *code++ = refnum;
1965 }
1966 else
1967 {
Guido van Rossum50700601997-12-08 17:15:20 +00001968 previous = (-c > ESC_b && -c < ESC_X)? code : NULL;
1969 if ( (options & PCRE_LOCALE) != 0)
1970 {
1971 switch (c)
1972 {
1973 case (-ESC_b): c = -OP_WORD_BOUNDARY_L; break;
1974 case (-ESC_B): c = -OP_NOT_WORD_BOUNDARY_L; break;
1975 case (-ESC_w): c = -OP_WORDCHAR_L; break;
1976 case (-ESC_W): c = -OP_NOT_WORDCHAR_L; break;
1977 }
1978 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001979 *code++ = -c;
1980 }
1981 continue;
1982 }
1983
Guido van Rossum58132c61997-12-17 00:24:13 +00001984 /* Data character: Reset and fall through */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001985
1986 ptr = oldptr;
1987 c = '\\';
1988
1989 /* Handle a run of data characters until a metacharacter is encountered.
1990 The first character is guaranteed not to be whitespace or # when the
1991 extended flag is set. */
1992
Guido van Rossum50700601997-12-08 17:15:20 +00001993 NORMAL_CHAR:
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001994 default:
1995 previous = code;
1996 *code = OP_CHARS;
1997 code += 2;
1998 length = 0;
1999
2000 do
2001 {
Guido van Rossum50700601997-12-08 17:15:20 +00002002 if ((options & PCRE_EXTENDED) != 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002003 {
2004 if ((pcre_ctypes[c] & ctype_space) != 0) continue;
2005 if (c == '#')
2006 {
2007 while ((c = *(++ptr)) != 0 && c != '\n');
2008 if (c == 0) break;
2009 continue;
2010 }
2011 }
2012
2013 /* Backslash may introduce a data char or a metacharacter. Escaped items
2014 are checked for validity in the pre-compiling pass. Stop the string
2015 before a metaitem. */
2016
2017 if (c == '\\')
2018 {
2019 oldptr = ptr;
Guido van Rossum50700601997-12-08 17:15:20 +00002020 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002021 if (c < 0) { ptr = oldptr; break; }
2022 }
2023
2024 /* Ordinary character or single-char escape */
2025
2026 *code++ = c;
2027 length++;
2028 }
2029
2030 /* This "while" is the end of the "do" above. */
2031
2032 while (length < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
2033
2034 /* Compute the length and set it in the data vector, and advance to
2035 the next state. */
2036
2037 previous[1] = length;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00002038 if (length < 255) ptr--;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002039 break;
2040 }
2041 } /* end of big loop */
2042
2043/* Control never reaches here by falling through, only by a goto for all the
2044error states. Pass back the position in the pattern so that it can be displayed
2045to the user for diagnosing the error. */
2046
2047FAILED:
2048*ptrptr = ptr;
2049return FALSE;
2050}
2051
2052
2053
2054
2055/*************************************************
2056* Compile sequence of alternatives *
2057*************************************************/
2058
2059/* On entry, ptr is pointing past the bracket character, but on return
2060it points to the closing bracket, or vertical bar, or end of string.
2061The code variable is pointing at the byte into which the BRA operator has been
2062stored.
2063
2064Argument:
Guido van Rossum50700601997-12-08 17:15:20 +00002065 options the option bits
2066 brackets -> int containing the number of extracting brackets used
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002067 codeptr -> the address of the current code pointer
2068 ptrptr -> the address of the current pattern pointer
2069 errorptr -> pointer to error message
2070
2071Returns: TRUE on success
2072*/
2073
2074static BOOL
Guido van Rossum50700601997-12-08 17:15:20 +00002075compile_regex(int options, int *brackets, uschar **codeptr,
Guido van Rossum58132c61997-12-17 00:24:13 +00002076 const uschar **ptrptr, const char **errorptr, PyObject *dictionary)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002077{
Guido van Rossum58132c61997-12-17 00:24:13 +00002078const uschar *ptr = *ptrptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002079uschar *code = *codeptr;
2080uschar *start_bracket = code;
2081
2082for (;;)
2083 {
2084 int length;
2085 uschar *last_branch = code;
2086
2087 code += 3;
Guido van Rossum50700601997-12-08 17:15:20 +00002088 if (!compile_branch(options, brackets, &code, &ptr, errorptr, dictionary))
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002089 {
2090 *ptrptr = ptr;
2091 return FALSE;
2092 }
2093
2094 /* Fill in the length of the last branch */
2095
2096 length = code - last_branch;
2097 last_branch[1] = length >> 8;
2098 last_branch[2] = length & 255;
2099
2100 /* Reached end of expression, either ')' or end of pattern. Insert a
2101 terminating ket and the length of the whole bracketed item, and return,
2102 leaving the pointer at the terminating char. */
2103
2104 if (*ptr != '|')
2105 {
2106 length = code - start_bracket;
2107 *code++ = OP_KET;
2108 *code++ = length >> 8;
2109 *code++ = length & 255;
2110 *codeptr = code;
2111 *ptrptr = ptr;
2112 return TRUE;
2113 }
2114
2115 /* Another branch follows; insert an "or" node and advance the pointer. */
2116
2117 *code = OP_ALT;
2118 ptr++;
2119 }
2120/* Control never reaches here */
2121}
2122
2123
2124
2125/*************************************************
2126* Check for anchored expression *
2127*************************************************/
2128
2129/* Try to find out if this is an anchored regular expression. Consider each
2130alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
2131all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
2132it's anchored. However, if this is a multiline pattern, then only OP_SOD
2133counts, since OP_CIRC can match in the middle.
2134
2135A branch is also implicitly anchored if it starts with .* because that will try
2136the rest of the pattern at all possible matching points, so there is no point
2137trying them again.
2138
2139Argument: points to start of expression (the bracket)
2140Returns: TRUE or FALSE
2141*/
2142
2143static BOOL
Guido van Rossum58132c61997-12-17 00:24:13 +00002144is_anchored(register const uschar *code, BOOL multiline)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002145{
2146do {
2147 int op = (int)code[3];
Guido van Rossum50700601997-12-08 17:15:20 +00002148 if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002149 { if (!is_anchored(code+3, multiline)) return FALSE; }
2150 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
2151 { if (code[4] != OP_ANY) return FALSE; }
2152 else if (op != OP_SOD && (multiline || op != OP_CIRC)) return FALSE;
2153 code += (code[1] << 8) + code[2];
2154 }
2155while (*code == OP_ALT);
2156return TRUE;
2157}
2158
2159
2160
2161/*************************************************
2162* Check for start with \n line expression *
2163*************************************************/
2164
2165/* This is called for multiline expressions to try to find out if every branch
2166starts with ^ so that "first char" processing can be done to speed things up.
2167
2168Argument: points to start of expression (the bracket)
2169Returns: TRUE or FALSE
2170*/
2171
2172static BOOL
Guido van Rossum58132c61997-12-17 00:24:13 +00002173is_startline(const uschar *code)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002174{
2175do {
2176 if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
2177 { if (!is_startline(code+3)) return FALSE; }
2178 else if (code[3] != OP_CIRC) return FALSE;
2179 code += (code[1] << 8) + code[2];
2180 }
2181while (*code == OP_ALT);
2182return TRUE;
2183}
2184
2185
2186
2187/*************************************************
2188* Check for fixed first char *
2189*************************************************/
2190
2191/* Try to find out if there is a fixed first character. This is called for
2192unanchored expressions, as it speeds up their processing quite considerably.
2193Consider each alternative branch. If they all start with the same char, or with
2194a bracket all of whose alternatives start with the same char (recurse ad lib),
2195then we return that char, otherwise -1.
2196
2197Argument: points to start of expression (the bracket)
2198Returns: -1 or the fixed first char
2199*/
2200
2201static int
2202find_firstchar(uschar *code)
2203{
2204register int c = -1;
2205do
2206 {
2207 register int charoffset = 4;
2208
2209 if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
2210 {
2211 register int d;
2212 if ((d = find_firstchar(code+3)) < 0) return -1;
2213 if (c < 0) c = d; else if (c != d) return -1;
2214 }
2215
2216 else switch(code[3])
2217 {
2218 default:
2219 return -1;
2220
2221 case OP_EXACT: /* Fall through */
2222 charoffset++;
2223
2224 case OP_CHARS: /* Fall through */
2225 charoffset++;
2226
2227 case OP_PLUS:
2228 case OP_MINPLUS:
2229 if (c < 0) c = code[charoffset]; else if (c != code[charoffset]) return -1;
2230 break;
2231 }
2232 code += (code[1] << 8) + code[2];
2233 }
2234while (*code == OP_ALT);
2235return c;
2236}
2237
2238
2239
2240/*************************************************
2241* Compile a Regular Expression *
2242*************************************************/
2243
2244/* This function takes a string and returns a pointer to a block of store
2245holding a compiled version of the expression.
2246
2247Arguments:
2248 pattern the regular expression
2249 options various option bits
2250 errorptr pointer to pointer to error text
2251 erroroffset ptr offset in pattern where error was detected
2252
2253Returns: pointer to compiled data block, or NULL on error,
2254 with errorptr and erroroffset set
2255*/
2256
2257pcre *
Guido van Rossum58132c61997-12-17 00:24:13 +00002258pcre_compile(const char *pattern, int options, const char **errorptr,
Guido van Rossum50700601997-12-08 17:15:20 +00002259 int *erroroffset, PyObject *dictionary)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002260{
2261real_pcre *re;
2262int spaces = 0;
2263int length = 3; /* For initial BRA plus length */
2264int runlength;
2265int c, size;
Guido van Rossum50700601997-12-08 17:15:20 +00002266int bracount = 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002267int brastack[200];
Guido van Rossum50700601997-12-08 17:15:20 +00002268int top_backref = 0;
Guido van Rossum58132c61997-12-17 00:24:13 +00002269unsigned int brastackptr = 0;
2270uschar *code;
2271const uschar *ptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002272
2273#ifdef DEBUG
2274uschar *code_base, *code_end;
2275#endif
2276
Guido van Rossum50700601997-12-08 17:15:20 +00002277/* We can't pass back an error message if errorptr is NULL; I guess the best we
2278can do is just return NULL. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002279
Guido van Rossum50700601997-12-08 17:15:20 +00002280if (errorptr == NULL) return NULL;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002281*errorptr = NULL;
Guido van Rossum50700601997-12-08 17:15:20 +00002282
2283/* However, we can give a message for this error */
2284
2285if (erroroffset == NULL)
2286 {
2287 *errorptr = ERR16;
2288 return NULL;
2289 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002290*erroroffset = 0;
2291
2292if ((options & ~PUBLIC_OPTIONS) != 0)
2293 {
Guido van Rossum50700601997-12-08 17:15:20 +00002294 *errorptr = ERR17;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002295 return NULL;
2296 }
2297
Guido van Rossum557dea11997-12-22 22:46:52 +00002298DPRINTF(("------------------------------------------------------------------\n"));
2299DPRINTF(("%s\n", pattern));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002300
2301/* The first thing to do is to make a pass over the pattern to compute the
2302amount of store required to hold the compiled code. This does not have to be
2303perfect as long as errors are overestimates. At the same time we can detect any
2304internal flag settings. Make an attempt to correct for any counted white space
2305if an "extended" flag setting appears late in the pattern. We can't be so
2306clever for #-comments. */
2307
Guido van Rossum58132c61997-12-17 00:24:13 +00002308ptr = (const uschar *)(pattern - 1);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002309while ((c = *(++ptr)) != 0)
2310 {
Guido van Rossum50700601997-12-08 17:15:20 +00002311 int min, max;
2312 int class_charcount;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002313
2314 if ((pcre_ctypes[c] & ctype_space) != 0)
2315 {
Guido van Rossum50700601997-12-08 17:15:20 +00002316 if ((options & PCRE_EXTENDED) != 0) continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002317 spaces++;
2318 }
2319
Guido van Rossum50700601997-12-08 17:15:20 +00002320 if (c == '#' && (options & PCRE_EXTENDED) != 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002321 {
2322 while ((c = *(++ptr)) != 0 && c != '\n');
2323 continue;
2324 }
2325
2326 switch(c)
2327 {
2328 /* A backslashed item may be an escaped "normal" character or a
2329 character type. For a "normal" character, put the pointers and
2330 character back so that tests for whitespace etc. in the input
2331 are done correctly. */
2332
2333 case '\\':
2334 {
Guido van Rossum58132c61997-12-17 00:24:13 +00002335 const uschar *save_ptr = ptr;
Guido van Rossum50700601997-12-08 17:15:20 +00002336 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002337 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2338 if (c >= 0)
2339 {
2340 ptr = save_ptr;
2341 c = '\\';
2342 goto NORMAL_CHAR;
2343 }
2344 }
2345 length++;
2346
2347 /* A back reference needs an additional char, plus either one or 5
Guido van Rossum50700601997-12-08 17:15:20 +00002348 bytes for a repeat. We also need to keep the value of the highest
2349 back reference. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002350
2351 if (c <= -ESC_REF)
2352 {
Guido van Rossum50700601997-12-08 17:15:20 +00002353 int refnum = -c - ESC_REF;
2354 if (refnum > top_backref) top_backref = refnum;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002355 length++; /* For single back reference */
Guido van Rossum50700601997-12-08 17:15:20 +00002356 if (ptr[1] == '{' && is_counted_repeat(ptr+2))
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002357 {
2358 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
2359 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2360 if ((min == 0 && (max == 1 || max == -1)) ||
2361 (min == 1 && max == -1))
2362 length++;
2363 else length += 5;
2364 if (ptr[1] == '?') ptr++;
2365 }
2366 }
2367 continue;
2368
2369 case '^':
2370 case '.':
2371 case '$':
2372 case '*': /* These repeats won't be after brackets; */
2373 case '+': /* those are handled separately */
2374 case '?':
2375 length++;
2376 continue;
2377
2378 /* This covers the cases of repeats after a single char, metachar, class,
2379 or back reference. */
2380
2381 case '{':
Guido van Rossum50700601997-12-08 17:15:20 +00002382 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002383 ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);
2384 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2385 if ((min == 0 && (max == 1 || max == -1)) ||
2386 (min == 1 && max == -1))
2387 length++;
2388 else
2389 {
2390 length--; /* Uncount the original char or metachar */
2391 if (min == 1) length++; else if (min > 0) length += 4;
2392 if (max > 0) length += 4; else length += 2;
2393 }
2394 if (ptr[1] == '?') ptr++;
2395 continue;
2396
2397 /* An alternation contains an offset to the next branch or ket. */
2398 case '|':
2399 length += 3;
2400 continue;
2401
Guido van Rossum50700601997-12-08 17:15:20 +00002402 /* A character class uses 33 characters. Don't worry about character types
2403 that aren't allowed in classes - they'll get picked up during the compile.
2404 A character class that contains only one character uses 2 or 3 bytes,
2405 depending on whether it is negated or not. Notice this where we can. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002406
2407 case '[':
Guido van Rossum50700601997-12-08 17:15:20 +00002408 class_charcount = 0;
2409 if (*(++ptr) == '^') ptr++;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002410 do
2411 {
Guido van Rossum50700601997-12-08 17:15:20 +00002412 if (*ptr == '\\')
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002413 {
Guido van Rossum557dea11997-12-22 22:46:52 +00002414 int ch = check_escape(&ptr, errorptr, bracount, options, TRUE);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002415 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
Guido van Rossum557dea11997-12-22 22:46:52 +00002416 if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002417 }
Guido van Rossum50700601997-12-08 17:15:20 +00002418 else class_charcount++;
2419 ptr++;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002420 }
2421 while (*ptr != 0 && *ptr != ']');
2422
Guido van Rossum50700601997-12-08 17:15:20 +00002423 /* Repeats for negated single chars are handled by the general code */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002424
Guido van Rossum50700601997-12-08 17:15:20 +00002425 if (class_charcount == 1) length += 3; else
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002426 {
Guido van Rossum50700601997-12-08 17:15:20 +00002427 length += 33;
2428 if (options & PCRE_LOCALE) length++; /* Add a byte for the localization flag */
2429
2430 /* A repeat needs either 1 or 5 bytes. */
2431
Guido van Rossum557dea11997-12-22 22:46:52 +00002432 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))
Guido van Rossum50700601997-12-08 17:15:20 +00002433 {
2434 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
2435 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2436 if ((min == 0 && (max == 1 || max == -1)) ||
2437 (min == 1 && max == -1))
2438 length++;
2439 else length += 5;
2440 if (ptr[1] == '?') ptr++;
2441 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002442 }
2443 continue;
2444
2445 /* Brackets may be genuine groups or special things */
2446
2447 case '(':
2448
2449 /* Handle special forms of bracket, which all start (? */
2450
2451 if (ptr[1] == '?') switch (c = ptr[2])
2452 {
2453 /* Skip over comments entirely */
2454 case '#':
2455 ptr += 3;
2456 while (*ptr != 0 && *ptr != ')') ptr++;
2457 if (*ptr == 0)
2458 {
Guido van Rossum50700601997-12-08 17:15:20 +00002459 *errorptr = ERR18;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002460 goto PCRE_ERROR_RETURN;
2461 }
2462 continue;
2463
2464 /* Non-referencing groups and lookaheads just move the pointer on, and
Guido van Rossum50700601997-12-08 17:15:20 +00002465 then behave like a non-special bracket, except that they don't increment
2466 the count of extracting brackets. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002467
2468 case ':':
2469 case '=':
2470 case '!':
2471 ptr += 2;
2472 break;
2473
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002474 case ('P'):
2475 {
2476 int idlen;
2477 switch (*ptr++) {
2478 case ('<'):
2479 idlen = get_group_id(ptr++, '>', errorptr);
2480 if (*errorptr) goto PCRE_ERROR_RETURN;
2481 ptr += idlen+1;
2482 break;
2483 case ('='):
2484 idlen = get_group_id(ptr++, ')', errorptr);
2485 if (*errorptr) goto PCRE_ERROR_RETURN;
2486 ptr += idlen+1;
2487 length++;
2488 break;
2489 }
2490 }
2491 break;
2492
Guido van Rossum50700601997-12-08 17:15:20 +00002493 /* Ditto for the "once only" bracket, allowed only if the extra bit
2494 is set. */
2495
2496 case '>':
2497 if ((options & PCRE_EXTRA) != 0)
2498 {
2499 ptr += 2;
2500 break;
2501 }
Guido van Rossumdda66961998-05-07 15:32:44 +00002502 /* Else fall through */
Guido van Rossum50700601997-12-08 17:15:20 +00002503
2504 /* Else loop setting valid options until ) is met. Anything else is an
2505 error. */
2506
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002507 default:
2508 ptr += 2;
2509 for (;; ptr++)
2510 {
2511 if ((c = *ptr) == 'i')
2512 {
2513 options |= PCRE_CASELESS;
2514 continue;
2515 }
Guido van Rossumbd49ac41997-12-10 23:05:53 +00002516 else if ((c = *ptr) == 'L')
Guido van Rossum50700601997-12-08 17:15:20 +00002517 {
2518 options |= PCRE_LOCALE;
2519 continue;
2520 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002521 else if ((c = *ptr) == 'm')
2522 {
2523 options |= PCRE_MULTILINE;
2524 continue;
2525 }
Guido van Rossum50700601997-12-08 17:15:20 +00002526 else if (c == 's')
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002527 {
2528 options |= PCRE_DOTALL;
2529 continue;
2530 }
2531 else if (c == 'x')
2532 {
2533 options |= PCRE_EXTENDED;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002534 length -= spaces; /* Already counted spaces */
2535 continue;
2536 }
2537 else if (c == ')') break;
2538
Guido van Rossum50700601997-12-08 17:15:20 +00002539 *errorptr = ERR12;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002540 goto PCRE_ERROR_RETURN;
2541 }
2542 continue; /* End of this bracket handling */
2543 }
2544
Guido van Rossum50700601997-12-08 17:15:20 +00002545 /* Extracting brackets must be counted so we can process escapes in a
2546 Perlish way. */
2547
2548 else bracount++;
2549
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002550 /* Non-special forms of bracket. Save length for computing whole length
2551 at end if there's a repeat that requires duplication of the group. */
2552
2553 if (brastackptr >= sizeof(brastack)/sizeof(int))
2554 {
Guido van Rossum50700601997-12-08 17:15:20 +00002555 *errorptr = ERR19;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002556 goto PCRE_ERROR_RETURN;
2557 }
2558
2559 brastack[brastackptr++] = length;
2560 length += 3;
2561 continue;
2562
2563 /* Handle ket. Look for subsequent max/min; for certain sets of values we
Guido van Rossum557dea11997-12-22 22:46:52 +00002564 have to replicate this bracket up to that many times. If brastackptr is
2565 0 this is an unmatched bracket which will generate an error, but take care
2566 not to try to access brastack[-1]. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002567
2568 case ')':
2569 length += 3;
2570 {
Guido van Rossum557dea11997-12-22 22:46:52 +00002571 int minval = 1;
2572 int maxval = 1;
2573 int duplength = (brastackptr > 0)? length - brastack[--brastackptr] : 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002574
2575 /* Leave ptr at the final char; for read_repeat_counts this happens
2576 automatically; for the others we need an increment. */
2577
Guido van Rossum50700601997-12-08 17:15:20 +00002578 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002579 {
Guido van Rossum557dea11997-12-22 22:46:52 +00002580 ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002581 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2582 }
Guido van Rossum557dea11997-12-22 22:46:52 +00002583 else if (c == '*') { minval = 0; maxval = -1; ptr++; }
2584 else if (c == '+') { maxval = -1; ptr++; }
2585 else if (c == '?') { minval = 0; ptr++; }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002586
Guido van Rossum557dea11997-12-22 22:46:52 +00002587 /* If there is a minimum > 1 we have to replicate up to minval-1 times;
2588 if there is a limited maximum we have to replicate up to maxval-1 times
2589 and allow for a BRAZERO item before each optional copy, as we also have
2590 to do before the first copy if the minimum is zero. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002591
Guido van Rossum557dea11997-12-22 22:46:52 +00002592 if (minval == 0) length++;
2593 else if (minval > 1) length += (minval - 1) * duplength;
2594 if (maxval > minval) length += (maxval - minval) * (duplength + 1);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002595 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002596 continue;
2597
2598 /* Non-special character. For a run of such characters the length required
2599 is the number of characters + 2, except that the maximum run length is 255.
2600 We won't get a skipped space or a non-data escape or the start of a #
2601 comment as the first character, so the length can't be zero. */
2602
2603 NORMAL_CHAR:
2604 default:
2605 length += 2;
2606 runlength = 0;
2607 do
2608 {
2609 if ((pcre_ctypes[c] & ctype_space) != 0)
2610 {
Guido van Rossum50700601997-12-08 17:15:20 +00002611 if ((options & PCRE_EXTENDED) != 0) continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002612 spaces++;
2613 }
2614
Guido van Rossum50700601997-12-08 17:15:20 +00002615 if (c == '#' && (options & PCRE_EXTENDED) != 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002616 {
2617 while ((c = *(++ptr)) != 0 && c != '\n');
2618 continue;
2619 }
2620
2621 /* Backslash may introduce a data char or a metacharacter; stop the
2622 string before the latter. */
2623
2624 if (c == '\\')
2625 {
Guido van Rossum58132c61997-12-17 00:24:13 +00002626 const uschar *saveptr = ptr;
Guido van Rossum50700601997-12-08 17:15:20 +00002627 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002628 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2629 if (c < 0) { ptr = saveptr; break; }
2630 }
2631
2632 /* Ordinary character or single-char escape */
2633
2634 runlength++;
2635 }
2636
2637 /* This "while" is the end of the "do" above. */
2638
2639 while (runlength < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
2640
2641 ptr--;
2642 length += runlength;
2643 continue;
2644 }
2645 }
2646
2647length += 4; /* For final KET and END */
2648
2649if (length > 65539)
2650 {
Guido van Rossum50700601997-12-08 17:15:20 +00002651 *errorptr = ERR20;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002652 return NULL;
2653 }
2654
2655/* Compute the size of data block needed and get it, either from malloc or
Guido van Rossum557dea11997-12-22 22:46:52 +00002656externally provided function. We specify "code[0]" in the offsetof() expression
2657rather than just "code", because it has been reported that one broken compiler
2658fails on "code" because it is also an independent variable. It should make no
2659difference to the value of the offsetof(). */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002660
Guido van Rossum557dea11997-12-22 22:46:52 +00002661size = length + offsetof(real_pcre, code[0]);
Guido van Rossum50700601997-12-08 17:15:20 +00002662re = (real_pcre *)(pcre_malloc)(size+50);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002663
2664if (re == NULL)
2665 {
Guido van Rossum50700601997-12-08 17:15:20 +00002666 *errorptr = ERR21;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002667 return NULL;
2668 }
2669
Guido van Rossum557dea11997-12-22 22:46:52 +00002670/* Put in the magic number and the options. */
2671
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002672re->magic_number = MAGIC_NUMBER;
2673re->options = options;
2674
2675/* Set up a starting, non-extracting bracket, then compile the expression. On
2676error, *errorptr will be set non-NULL, so we don't need to look at the result
2677of the function here. */
2678
Guido van Rossum58132c61997-12-17 00:24:13 +00002679ptr = (const uschar *)pattern;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002680code = re->code;
2681*code = OP_BRA;
Guido van Rossum50700601997-12-08 17:15:20 +00002682bracount = 0;
2683(void)compile_regex(options, &bracount, &code, &ptr, errorptr, dictionary);
2684re->top_bracket = bracount;
2685re->top_backref = top_backref;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002686
2687/* If not reached end of pattern on success, there's an excess bracket. */
2688
Guido van Rossum50700601997-12-08 17:15:20 +00002689if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
2690
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002691/* Fill in the terminating state and check for disastrous overflow, but
2692if debugging, leave the test till after things are printed out. */
2693
2694*code++ = OP_END;
2695
Guido van Rossum50700601997-12-08 17:15:20 +00002696
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002697#ifndef DEBUG
Guido van Rossum50700601997-12-08 17:15:20 +00002698if (code - re->code > length) *errorptr = ERR23;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002699#endif
2700
2701/* Failed to compile */
2702
2703if (*errorptr != NULL)
2704 {
2705 (pcre_free)(re);
2706 PCRE_ERROR_RETURN:
Guido van Rossum58132c61997-12-17 00:24:13 +00002707 *erroroffset = ptr - (const uschar *)pattern;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002708 return NULL;
2709 }
2710
2711/* If the anchored option was not passed, set flag if we can determine that it
2712is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if
2713we can determine what the first character has to be, because that speeds up
2714unanchored matches no end. In the case of multiline matches, an alternative is
2715to set the PCRE_STARTLINE flag if all branches start with ^. */
2716
2717if ((options & PCRE_ANCHORED) == 0)
2718 {
2719 if (is_anchored(re->code, (options & PCRE_MULTILINE) != 0))
2720 re->options |= PCRE_ANCHORED;
2721 else
2722 {
Guido van Rossum557dea11997-12-22 22:46:52 +00002723 int ch = find_firstchar(re->code);
2724 if (ch >= 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002725 {
Guido van Rossum557dea11997-12-22 22:46:52 +00002726 re->first_char = ch;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002727 re->options |= PCRE_FIRSTSET;
2728 }
2729 else if (is_startline(re->code))
2730 re->options |= PCRE_STARTLINE;
2731 }
2732 }
2733
2734/* Print out the compiled data for debugging */
2735
2736#ifdef DEBUG
2737
Guido van Rossum50700601997-12-08 17:15:20 +00002738printf("Length = %d top_bracket = %d top_backref=%d\n",
2739 length, re->top_bracket, re->top_backref);
2740
2741if (re->options != 0)
2742 {
Guido van Rossumdda66961998-05-07 15:32:44 +00002743 printf("%s%s%s%s%s%s%s%s\n",
Guido van Rossum50700601997-12-08 17:15:20 +00002744 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
2745 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
2746 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
2747 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
2748 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
2749 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
Guido van Rossumdda66961998-05-07 15:32:44 +00002750 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
2751 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
Guido van Rossum50700601997-12-08 17:15:20 +00002752 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002753
2754if ((re->options & PCRE_FIRSTSET) != 0)
2755 {
2756 if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
2757 else printf("First char = \\x%02x\n", re->first_char);
2758 }
2759
2760code_end = code;
2761code_base = code = re->code;
2762
2763while (code < code_end)
2764 {
2765 int charlength;
2766
2767 printf("%3d ", code - code_base);
2768
2769 if (*code >= OP_BRA)
2770 {
2771 printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
2772 code += 2;
2773 }
2774
2775 else switch(*code)
2776 {
2777 case OP_CHARS:
2778 charlength = *(++code);
2779 printf("%3d ", charlength);
2780 while (charlength-- > 0)
2781 if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
2782 break;
2783
2784 case OP_KETRMAX:
2785 case OP_KETRMIN:
2786 case OP_ALT:
2787 case OP_KET:
2788 case OP_ASSERT:
2789 case OP_ASSERT_NOT:
Guido van Rossum50700601997-12-08 17:15:20 +00002790 case OP_ONCE:
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002791 printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2792 code += 2;
2793 break;
2794
2795 case OP_STAR:
2796 case OP_MINSTAR:
2797 case OP_PLUS:
2798 case OP_MINPLUS:
2799 case OP_QUERY:
2800 case OP_MINQUERY:
2801 case OP_TYPESTAR:
2802 case OP_TYPEMINSTAR:
2803 case OP_TYPEPLUS:
2804 case OP_TYPEMINPLUS:
2805 case OP_TYPEQUERY:
2806 case OP_TYPEMINQUERY:
2807 if (*code >= OP_TYPESTAR)
2808 printf(" %s", OP_names[code[1]]);
2809 else if (isprint(c = code[1])) printf(" %c", c);
2810 else printf(" \\x%02x", c);
2811 printf("%s", OP_names[*code++]);
2812 break;
2813
2814 case OP_EXACT:
2815 case OP_UPTO:
2816 case OP_MINUPTO:
2817 if (isprint(c = code[3])) printf(" %c{", c);
2818 else printf(" \\x%02x{", c);
Guido van Rossum557dea11997-12-22 22:46:52 +00002819 if (*code != OP_EXACT) printf("0,");
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002820 printf("%d}", (code[1] << 8) + code[2]);
2821 if (*code == OP_MINUPTO) printf("?");
2822 code += 3;
2823 break;
2824
2825 case OP_TYPEEXACT:
2826 case OP_TYPEUPTO:
2827 case OP_TYPEMINUPTO:
2828 printf(" %s{", OP_names[code[3]]);
2829 if (*code != OP_TYPEEXACT) printf(",");
2830 printf("%d}", (code[1] << 8) + code[2]);
2831 if (*code == OP_TYPEMINUPTO) printf("?");
2832 code += 3;
2833 break;
2834
Guido van Rossum50700601997-12-08 17:15:20 +00002835 case OP_NOT:
2836 if (isprint(c = *(++code))) printf(" [^%c]", c);
2837 else printf(" [^\\x%02x]", c);
2838 break;
2839
2840 case OP_NOTSTAR:
2841 case OP_NOTMINSTAR:
2842 case OP_NOTPLUS:
2843 case OP_NOTMINPLUS:
2844 case OP_NOTQUERY:
2845 case OP_NOTMINQUERY:
2846 if (isprint(c = code[1])) printf(" [^%c]", c);
2847 else printf(" [^\\x%02x]", c);
2848 printf("%s", OP_names[*code++]);
2849 break;
2850
2851 case OP_NOTEXACT:
2852 case OP_NOTUPTO:
2853 case OP_NOTMINUPTO:
2854 if (isprint(c = code[3])) printf(" [^%c]{", c);
2855 else printf(" [^\\x%02x]{", c);
2856 if (*code != OP_NOTEXACT) printf(",");
2857 printf("%d}", (code[1] << 8) + code[2]);
2858 if (*code == OP_NOTMINUPTO) printf("?");
2859 code += 3;
2860 break;
2861
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002862 case OP_REF:
2863 printf(" \\%d", *(++code));
Guido van Rossum557dea11997-12-22 22:46:52 +00002864 code ++;
2865 goto CLASS_REF_REPEAT;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002866
2867 case OP_CLASS:
Guido van Rossum042ff9e1998-04-03 21:13:31 +00002868 case OP_NEGCLASS:
Guido van Rossum50700601997-12-08 17:15:20 +00002869 case OP_CLASS_L:
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002870 {
2871 int i, min, max;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002872
Guido van Rossum50700601997-12-08 17:15:20 +00002873 if (*code==OP_CLASS_L)
2874 {
2875 code++;
2876 printf("Locflag = %i ", *code++);
Guido van Rossum042ff9e1998-04-03 21:13:31 +00002877 printf(" [");
Guido van Rossum50700601997-12-08 17:15:20 +00002878 }
2879 else
Guido van Rossum042ff9e1998-04-03 21:13:31 +00002880 {
2881 if (*code++ == OP_CLASS) printf(" [");
2882 else printf(" ^[");
2883 }
2884
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002885
Guido van Rossum50700601997-12-08 17:15:20 +00002886 for (i = 0; i < 256; i++)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002887 {
Guido van Rossum50700601997-12-08 17:15:20 +00002888 if ((code[i/8] & (1 << (i&7))) != 0)
2889 {
2890 int j;
2891 for (j = i+1; j < 256; j++)
2892 if ((code[j/8] & (1 << (j&7))) == 0) break;
2893 if (i == '-' || i == ']') printf("\\");
2894 if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
2895 if (--j > i)
2896 {
2897 printf("-");
2898 if (j == '-' || j == ']') printf("\\");
2899 if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
2900 }
2901 i = j;
2902 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002903 }
2904 printf("]");
Guido van Rossum50700601997-12-08 17:15:20 +00002905 code += 32;
2906 /* code ++;*/
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002907
Guido van Rossum557dea11997-12-22 22:46:52 +00002908 CLASS_REF_REPEAT:
2909
Guido van Rossum50700601997-12-08 17:15:20 +00002910 switch(*code)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002911 {
2912 case OP_CRSTAR:
2913 case OP_CRMINSTAR:
2914 case OP_CRPLUS:
2915 case OP_CRMINPLUS:
2916 case OP_CRQUERY:
2917 case OP_CRMINQUERY:
2918 printf("%s", OP_names[*code]);
2919 break;
2920
2921 case OP_CRRANGE:
2922 case OP_CRMINRANGE:
2923 min = (code[1] << 8) + code[2];
2924 max = (code[3] << 8) + code[4];
2925 if (max == 0) printf("{%d,}", min);
2926 else printf("{%d,%d}", min, max);
2927 if (*code == OP_CRMINRANGE) printf("?");
2928 code += 4;
2929 break;
2930
2931 default:
2932 code--;
2933 }
2934 }
2935 break;
2936
2937 /* Anything else is just a one-node item */
2938
2939 default:
2940 printf(" %s", OP_names[*code]);
2941 break;
2942 }
2943
2944 code++;
2945 printf("\n");
2946 }
2947printf("------------------------------------------------------------------\n");
2948
2949/* This check is done here in the debugging case so that the code that
2950was compiled can be seen. */
2951
2952if (code - re->code > length)
2953 {
Guido van Rossum50700601997-12-08 17:15:20 +00002954 printf("length=%i, code length=%i\n", length, code-re->code);
2955 *errorptr = ERR23;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002956 (pcre_free)(re);
2957 *erroroffset = ptr - (uschar *)pattern;
2958 return NULL;
2959 }
2960#endif
2961
2962return (pcre *)re;
2963}
2964
2965
2966
2967/*************************************************
2968* Match a character type *
2969*************************************************/
2970
2971/* Not used in all the places it might be as it's sometimes faster
2972to put the code inline.
2973
2974Arguments:
2975 type the character type
2976 c the character
Guido van Rossum50700601997-12-08 17:15:20 +00002977 dotall the dotall flag
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002978
2979Returns: TRUE if character is of the type
2980*/
2981
2982static BOOL
2983match_type(int type, int c, BOOL dotall)
2984{
2985
2986#ifdef DEBUG
2987if (isprint(c)) printf("matching subject %c against ", c);
2988 else printf("matching subject \\x%02x against ", c);
2989printf("%s\n", OP_names[type]);
2990#endif
2991
2992switch(type)
2993 {
2994 case OP_ANY: return dotall || c != '\n';
2995 case OP_NOT_DIGIT: return (pcre_ctypes[c] & ctype_digit) == 0;
2996 case OP_DIGIT: return (pcre_ctypes[c] & ctype_digit) != 0;
2997 case OP_NOT_WHITESPACE: return (pcre_ctypes[c] & ctype_space) == 0;
2998 case OP_WHITESPACE: return (pcre_ctypes[c] & ctype_space) != 0;
2999 case OP_NOT_WORDCHAR: return (pcre_ctypes[c] & ctype_word) == 0;
3000 case OP_WORDCHAR: return (pcre_ctypes[c] & ctype_word) != 0;
Guido van Rossum58132c61997-12-17 00:24:13 +00003001 case OP_NOT_WORDCHAR_L: return (c!='_' && !isalnum(c));
3002 case OP_WORDCHAR_L: return (c=='_' || isalnum(c));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003003 }
3004return FALSE;
3005}
3006
3007
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003008
3009/*************************************************
3010* Match a back-reference *
3011*************************************************/
3012
3013/* If a back reference hasn't been set, the match fails.
3014
3015Arguments:
3016 number reference number
3017 eptr points into the subject
3018 length length to be matched
3019 md points to match data block
3020
3021Returns: TRUE if matched
3022*/
3023
3024static BOOL
Guido van Rossum58132c61997-12-17 00:24:13 +00003025match_ref(int number, register const uschar *eptr, int length, match_data *md)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003026{
Guido van Rossum58132c61997-12-17 00:24:13 +00003027const uschar *p = md->start_subject + md->offset_vector[number];
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003028
3029#ifdef DEBUG
3030if (eptr >= md->end_subject)
3031 printf("matching subject <null>");
3032else
3033 {
3034 printf("matching subject ");
3035 pchars(eptr, length, TRUE, md);
3036 }
3037printf(" against backref ");
3038pchars(p, length, FALSE, md);
3039printf("\n");
3040#endif
3041
3042/* Always fail if not enough characters left */
3043
3044if (length > md->end_subject - p) return FALSE;
3045
Guido van Rossum58132c61997-12-17 00:24:13 +00003046/* Separate the caseless case for speed */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003047
3048if (md->caseless)
3049 { while (length-- > 0) if (pcre_lcc[*p++] != pcre_lcc[*eptr++]) return FALSE; }
3050else
3051 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
3052
3053return TRUE;
3054}
3055
3056static int free_stack(match_data *md)
3057{
3058/* Free any stack space that was allocated by the call to match(). */
Andrew M. Kuchlingfc9d2252000-02-18 19:16:45 +00003059if (md->off_num) PyMem_DEL(md->off_num);
3060if (md->offset_top) PyMem_DEL(md->offset_top);
3061if (md->r1) PyMem_DEL(md->r1);
3062if (md->r2) PyMem_DEL(md->r2);
3063if (md->eptr) PyMem_DEL((char *)md->eptr);
3064if (md->ecode) PyMem_DEL((char *)md->ecode);
Guido van Rossumc3861071997-10-08 02:07:40 +00003065return 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003066}
3067
3068static int grow_stack(match_data *md)
3069{
Guido van Rossum50700601997-12-08 17:15:20 +00003070 if (md->length != 0)
3071 {
3072 md->length = md->length + md->length/2;
3073 }
3074 else
3075 {
3076 int string_len = md->end_subject - md->start_subject + 1;
3077 if (string_len < 80) {md->length = string_len; }
3078 else {md->length = 80;}
3079 }
3080 PyMem_RESIZE(md->offset_top, int, md->length);
Guido van Rossum58132c61997-12-17 00:24:13 +00003081 PyMem_RESIZE(md->eptr, const uschar *, md->length);
3082 PyMem_RESIZE(md->ecode, const uschar *, md->length);
Guido van Rossum50700601997-12-08 17:15:20 +00003083 PyMem_RESIZE(md->off_num, int, md->length);
3084 PyMem_RESIZE(md->r1, int, md->length);
3085 PyMem_RESIZE(md->r2, int, md->length);
3086 if (md->offset_top == NULL || md->eptr == NULL || md->ecode == NULL ||
3087 md->off_num == NULL || md->r1 == NULL || md->r2 == NULL)
3088 {
Guido van Rossumdda66961998-05-07 15:32:44 +00003089 PyErr_NoMemory();
Guido van Rossum50700601997-12-08 17:15:20 +00003090 longjmp(md->error_env, 1);
3091 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003092 return 0;
3093}
3094
Guido van Rossum50700601997-12-08 17:15:20 +00003095
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003096/*************************************************
3097* Match from current position *
3098*************************************************/
3099
3100/* On entry ecode points to the first opcode, and eptr to the first character.
3101
3102Arguments:
3103 eptr pointer in subject
3104 ecode position in code
3105 offset_top current top pointer
3106 md pointer to "static" info for the match
3107
3108Returns: TRUE if matched
3109*/
3110
3111static BOOL
Guido van Rossum58132c61997-12-17 00:24:13 +00003112match(register const uschar *eptr, register const uschar *ecode, int offset_top,
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003113 match_data *md)
3114{
3115 int save_stack_position = md->point;
3116match_loop:
3117
3118#define SUCCEED goto succeed
3119#define FAIL goto fail
3120
3121for (;;)
3122 {
3123 int min, max, ctype;
3124 register int i;
3125 register int c;
Guido van Rossum58132c61997-12-17 00:24:13 +00003126 BOOL minimize = FALSE;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003127
3128 /* Opening bracket. Check the alternative branches in turn, failing if none
3129 match. We have to set the start offset if required and there is space
3130 in the offset vector so that it is available for subsequent back references
3131 if the bracket matches. However, if the bracket fails, we must put back the
3132 previous value of both offsets in case they were set by a previous copy of
3133 the same bracket. Don't worry about setting the flag for the error case here;
3134 that is handled in the code for KET. */
3135
3136 if ((int)*ecode >= OP_BRA)
3137 {
3138 int number = (*ecode - OP_BRA) << 1;
Guido van Rossum58132c61997-12-17 00:24:13 +00003139 int save_offset1 = 0, save_offset2 = 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003140
Guido van Rossum557dea11997-12-22 22:46:52 +00003141 DPRINTF(("start bracket %d\n", number/2));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003142
3143 if (number > 0 && number < md->offset_end)
3144 {
3145 save_offset1 = md->offset_vector[number];
3146 save_offset2 = md->offset_vector[number+1];
3147 md->offset_vector[number] = eptr - md->start_subject;
3148
Guido van Rossum557dea11997-12-22 22:46:52 +00003149 DPRINTF(("saving %d %d\n", save_offset1, save_offset2));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003150 }
3151
3152 /* Recurse for all the alternatives. */
3153
3154 do
3155 {
3156 if (match(eptr, ecode+3, offset_top, md)) SUCCEED;
3157 ecode += (ecode[1] << 8) + ecode[2];
3158 }
3159 while (*ecode == OP_ALT);
3160
Guido van Rossum557dea11997-12-22 22:46:52 +00003161 DPRINTF(("bracket %d failed\n", number/2));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003162
3163 if (number > 0 && number < md->offset_end)
3164 {
3165 md->offset_vector[number] = save_offset1;
3166 md->offset_vector[number+1] = save_offset2;
3167 }
3168
3169 FAIL;
3170 }
3171
3172 /* Other types of node can be handled by a switch */
3173
3174 switch(*ecode)
3175 {
3176 case OP_END:
3177 md->end_match_ptr = eptr; /* Record where we ended */
3178 md->end_offset_top = offset_top; /* and how many extracts were taken */
3179 SUCCEED;
3180
Guido van Rossum50700601997-12-08 17:15:20 +00003181 /* The equivalent of Prolog's "cut" - if the rest doesn't match, the
3182 whole thing doesn't match, so we have to get out via a longjmp(). */
3183
3184 case OP_CUT:
3185 if (match(eptr, ecode+1, offset_top, md)) SUCCEED;
3186 longjmp(md->fail_env, 1);
3187
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003188 /* Assertion brackets. Check the alternative branches in turn - the
3189 matching won't pass the KET for an assertion. If any one branch matches,
3190 the assertion is true. */
3191
3192 case OP_ASSERT:
3193 do
3194 {
3195 if (match(eptr, ecode+3, offset_top, md)) break;
3196 ecode += (ecode[1] << 8) + ecode[2];
3197 }
3198 while (*ecode == OP_ALT);
3199 if (*ecode == OP_KET) FAIL;
3200
3201 /* Continue from after the assertion, updating the offsets high water
3202 mark, since extracts may have been taken during the assertion. */
3203
3204 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3205 ecode += 3;
3206 offset_top = md->end_offset_top;
3207 continue;
3208
3209 /* Negative assertion: all branches must fail to match */
3210
3211 case OP_ASSERT_NOT:
3212 do
3213 {
3214 if (match(eptr, ecode+3, offset_top, md)) FAIL;
3215 ecode += (ecode[1] << 8) + ecode[2];
3216 }
3217 while (*ecode == OP_ALT);
3218 ecode += 3;
3219 continue;
3220
Guido van Rossum50700601997-12-08 17:15:20 +00003221 /* "Once" brackets are like assertion brackets except that after a match,
3222 the point in the subject string is not moved back. Thus there can never be
3223 a move back into the brackets. Check the alternative branches in turn - the
3224 matching won't pass the KET for this kind of subpattern. If any one branch
3225 matches, we carry on, leaving the subject pointer. */
3226
3227 case OP_ONCE:
3228 do
3229 {
3230 if (match(eptr, ecode+3, offset_top, md)) break;
3231 ecode += (ecode[1] << 8) + ecode[2];
3232 }
3233 while (*ecode == OP_ALT);
Guido van Rossum557dea11997-12-22 22:46:52 +00003234 if (*ecode == OP_KET) FAIL;
Guido van Rossum50700601997-12-08 17:15:20 +00003235
3236 /* Continue as from after the assertion, updating the offsets high water
3237 mark, since extracts may have been taken. */
3238
3239 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3240 ecode += 3;
3241 offset_top = md->end_offset_top;
3242 eptr = md->end_match_ptr;
3243 continue;
3244
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003245 /* An alternation is the end of a branch; scan along to find the end of the
3246 bracketed group and go to there. */
3247
3248 case OP_ALT:
3249 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3250 break;
3251
3252 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
3253 that it may occur zero times. It may repeat infinitely, or not at all -
3254 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
3255 repeat limits are compiled as a number of copies, with the optional ones
3256 preceded by BRAZERO or BRAMINZERO. */
3257
3258 case OP_BRAZERO:
3259 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003260 const uschar *next = ecode+1;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003261 if (match(eptr, next, offset_top, md)) SUCCEED;
3262 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3263 ecode = next + 3;
3264 }
3265 break;
3266
3267 case OP_BRAMINZERO:
3268 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003269 const uschar *next = ecode+1;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003270 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3271 if (match(eptr, next+3, offset_top, md)) SUCCEED;
3272 ecode++;
3273 }
3274 break;;
3275
3276 /* End of a group, repeated or non-repeating. If we are at the end of
3277 an assertion "group", stop matching and SUCCEED, but record the
3278 current high water mark for use by positive assertions. */
3279
3280 case OP_KET:
3281 case OP_KETRMIN:
3282 case OP_KETRMAX:
3283 {
Guido van Rossum50700601997-12-08 17:15:20 +00003284 int number;
Guido van Rossum58132c61997-12-17 00:24:13 +00003285 const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003286
Guido van Rossum50700601997-12-08 17:15:20 +00003287 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT || *prev == OP_ONCE)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003288 {
Guido van Rossum50700601997-12-08 17:15:20 +00003289 md->end_match_ptr = eptr; /* For ONCE */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003290 md->end_offset_top = offset_top;
3291 SUCCEED;
3292 }
3293
3294 /* In all other cases we have to check the group number back at the
3295 start and if necessary complete handling an extraction by setting the
3296 final offset and bumping the high water mark. */
3297
3298 number = (*prev - OP_BRA) << 1;
3299
Guido van Rossum557dea11997-12-22 22:46:52 +00003300 DPRINTF(("end bracket %d\n", number/2));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003301
3302 if (number > 0)
3303 {
3304 if (number >= md->offset_end) md->offset_overflow = TRUE; else
3305 {
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003306 md->offset_vector[number+1] = eptr - md->start_subject;
3307 if (offset_top <= number) offset_top = number + 2;
3308 }
3309 }
3310
3311 /* For a non-repeating ket, just advance to the next node and continue at
3312 this level. */
3313
3314 if (*ecode == OP_KET)
3315 {
3316 ecode += 3;
3317 break;
3318 }
3319
3320 /* The repeating kets try the rest of the pattern or restart from the
3321 preceding bracket, in the appropriate order. */
3322
3323 if (*ecode == OP_KETRMIN)
3324 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003325 const uschar *ptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003326 if (match(eptr, ecode+3, offset_top, md)) goto succeed;
3327 /* Handle alternation inside the BRA...KET; push the additional
Guido van Rossum58132c61997-12-17 00:24:13 +00003328 alternatives onto the stack */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003329 ptr=prev;
3330 do {
3331 ptr += (ptr[1]<<8)+ ptr[2];
3332 if (*ptr==OP_ALT)
3333 {
Guido van Rossum50700601997-12-08 17:15:20 +00003334 if (md->length == md->point)
3335 {
3336 grow_stack(md);
3337 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003338 md->offset_top[md->point] = offset_top;
3339 md->eptr[md->point] = eptr;
3340 md->ecode[md->point] = ptr+3;
3341 md->r1[md->point] = 0;
3342 md->r2[md->point] = 0;
3343 md->off_num[md->point] = 0;
3344 md->point++;
3345 }
3346 } while (*ptr==OP_ALT);
3347 ecode=prev+3; goto match_loop;
3348 }
3349 else /* OP_KETRMAX */
3350 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003351 const uschar *ptr;
3352 /*int points_pushed=0;*/
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003353
3354 /* Push one failure point, that will resume matching at the code after
3355 the KETRMAX opcode. */
Guido van Rossum50700601997-12-08 17:15:20 +00003356 if (md->length == md->point)
3357 {
3358 grow_stack(md);
3359 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003360 md->offset_top[md->point] = offset_top;
3361 md->eptr[md->point] = eptr;
3362 md->ecode[md->point] = ecode+3;
3363 md->r1[md->point] = md->offset_vector[number];
3364 md->r2[md->point] = md->offset_vector[number+1];
3365 md->off_num[md->point] = number;
3366 md->point++;
3367
3368 md->offset_vector[number] = eptr - md->start_subject;
3369 /* Handle alternation inside the BRA...KET; push each of the
Guido van Rossum58132c61997-12-17 00:24:13 +00003370 additional alternatives onto the stack */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003371 ptr=prev;
3372 do {
3373 ptr += (ptr[1]<<8)+ ptr[2];
3374 if (*ptr==OP_ALT)
3375 {
Guido van Rossum50700601997-12-08 17:15:20 +00003376 if (md->length == md->point)
3377 if (md->length == md->point)
3378 {
3379 grow_stack(md);
3380 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003381 md->offset_top[md->point] = offset_top;
3382 md->eptr[md->point] = eptr;
3383 md->ecode[md->point] = ptr+3;
3384 md->r1[md->point] = 0;
3385 md->r2[md->point] = 0;
3386 md->off_num[md->point] = 0;
3387 md->point++;
Guido van Rossum58132c61997-12-17 00:24:13 +00003388 /*points_pushed++;*/
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003389 }
3390 } while (*ptr==OP_ALT);
3391 /* Jump to the first (or only) alternative and resume trying to match */
3392 ecode=prev+3; goto match_loop;
3393 }
3394 }
Guido van Rossum58132c61997-12-17 00:24:13 +00003395
Guido van Rossum50700601997-12-08 17:15:20 +00003396 /* Start of subject unless notbol, or after internal newline if multiline */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003397
3398 case OP_CIRC:
Guido van Rossum50700601997-12-08 17:15:20 +00003399 if (md->notbol && eptr == md->start_subject) FAIL;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003400 if (md->multiline)
3401 {
3402 if (eptr != md->start_subject && eptr[-1] != '\n') FAIL;
3403 ecode++;
3404 break;
3405 }
3406 /* ... else fall through */
3407
3408 /* Start of subject assertion */
3409
3410 case OP_SOD:
3411 if (eptr != md->start_subject) FAIL;
3412 ecode++;
3413 break;
3414
Guido van Rossum50700601997-12-08 17:15:20 +00003415 /* Assert before internal newline if multiline, or before
3416 a terminating newline unless endonly is set, else end of subject unless
3417 noteol is set. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003418
3419 case OP_DOLL:
Guido van Rossum50700601997-12-08 17:15:20 +00003420 if (md->noteol && eptr >= md->end_subject) FAIL;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003421 if (md->multiline)
3422 {
3423 if (eptr < md->end_subject && *eptr != '\n') FAIL;
3424 ecode++;
3425 break;
3426 }
Guido van Rossum50700601997-12-08 17:15:20 +00003427 else if (!md->endonly)
3428 {
3429 if (eptr < md->end_subject - 1 ||
3430 (eptr == md->end_subject - 1 && *eptr != '\n')) FAIL;
3431 ecode++;
3432 break;
3433 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003434 /* ... else fall through */
3435
3436 /* End of subject assertion */
3437
3438 case OP_EOD:
3439 if (eptr < md->end_subject) FAIL;
3440 ecode++;
3441 break;
3442
3443 /* Word boundary assertions */
3444
3445 case OP_NOT_WORD_BOUNDARY:
3446 case OP_WORD_BOUNDARY:
3447 {
3448 BOOL prev_is_word = (eptr != md->start_subject) &&
3449 ((pcre_ctypes[eptr[-1]] & ctype_word) != 0);
3450 BOOL cur_is_word = (eptr < md->end_subject) &&
3451 ((pcre_ctypes[*eptr] & ctype_word) != 0);
3452 if ((*ecode++ == OP_WORD_BOUNDARY)?
3453 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3454 FAIL;
3455 }
3456 break;
3457
Guido van Rossum50700601997-12-08 17:15:20 +00003458 case OP_NOT_WORD_BOUNDARY_L:
3459 case OP_WORD_BOUNDARY_L:
3460 {
3461 BOOL prev_is_word = (eptr != md->start_subject) &&
Guido van Rossum58132c61997-12-17 00:24:13 +00003462 (isalnum(eptr[-1]) || eptr[-1]=='_');
Guido van Rossum50700601997-12-08 17:15:20 +00003463 BOOL cur_is_word = (eptr < md->end_subject) &&
Guido van Rossum58132c61997-12-17 00:24:13 +00003464 (isalnum(*eptr) || *eptr=='_');
Guido van Rossum50700601997-12-08 17:15:20 +00003465 if ((*ecode++ == OP_WORD_BOUNDARY_L)?
3466 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3467 FAIL;
3468 }
3469 break;
3470
3471
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003472 /* Match a single character type; inline for speed */
3473
3474 case OP_ANY:
3475 if (!md->dotall && eptr < md->end_subject && *eptr == '\n') FAIL;
3476 if (eptr++ >= md->end_subject) FAIL;
3477 ecode++;
3478 break;
3479
3480 case OP_NOT_DIGIT:
3481 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) != 0)
3482 FAIL;
3483 ecode++;
3484 break;
3485
3486 case OP_DIGIT:
3487 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) == 0)
3488 FAIL;
3489 ecode++;
3490 break;
3491
3492 case OP_NOT_WHITESPACE:
3493 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) != 0)
3494 FAIL;
3495 ecode++;
3496 break;
3497
3498 case OP_WHITESPACE:
3499 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) == 0)
3500 FAIL;
3501 ecode++;
3502 break;
3503
3504 case OP_NOT_WORDCHAR:
3505 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) != 0)
3506 FAIL;
3507 ecode++;
3508 break;
3509
3510 case OP_WORDCHAR:
3511 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) == 0)
3512 FAIL;
3513 ecode++;
3514 break;
3515
Guido van Rossum50700601997-12-08 17:15:20 +00003516 case OP_NOT_WORDCHAR_L:
Guido van Rossum58132c61997-12-17 00:24:13 +00003517 if (eptr >= md->end_subject || (*eptr=='_' || isalnum(*eptr) ))
Guido van Rossum557dea11997-12-22 22:46:52 +00003518 FAIL;
Guido van Rossum50700601997-12-08 17:15:20 +00003519 eptr++;
3520 ecode++;
3521 break;
3522
3523 case OP_WORDCHAR_L:
Guido van Rossum58132c61997-12-17 00:24:13 +00003524 if (eptr >= md->end_subject || (*eptr!='_' && !isalnum(*eptr) ))
Guido van Rossum557dea11997-12-22 22:46:52 +00003525 FAIL;
Guido van Rossum50700601997-12-08 17:15:20 +00003526 eptr++;
3527 ecode++;
3528 break;
3529
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003530 /* Match a back reference, possibly repeatedly. Look past the end of the
3531 item to see if there is repeat information following. The code is similar
3532 to that for character classes, but repeated for efficiency. Then obey
3533 similar code to character type repeats - written out again for speed.
3534 However, if the referenced string is the empty string, always treat
3535 it as matched, any number of times (otherwise there could be infinite
3536 loops). */
3537
3538 case OP_REF:
3539 {
3540 int length;
3541 int number = ecode[1] << 1; /* Doubled reference number */
3542 ecode += 2; /* Advance past the item */
3543
3544 if (number >= offset_top || md->offset_vector[number] < 0)
3545 {
3546 md->errorcode = PCRE_ERROR_BADREF;
3547 FAIL;
3548 }
3549
3550 length = md->offset_vector[number+1] - md->offset_vector[number];
3551
3552 switch (*ecode)
3553 {
3554 case OP_CRSTAR:
3555 case OP_CRMINSTAR:
3556 case OP_CRPLUS:
3557 case OP_CRMINPLUS:
3558 case OP_CRQUERY:
3559 case OP_CRMINQUERY:
3560 c = *ecode++ - OP_CRSTAR;
3561 minimize = (c & 1) != 0;
3562 min = rep_min[c]; /* Pick up values from tables; */
3563 max = rep_max[c]; /* zero for max => infinity */
3564 if (max == 0) max = INT_MAX;
3565 break;
3566
3567 case OP_CRRANGE:
3568 case OP_CRMINRANGE:
3569 minimize = (*ecode == OP_CRMINRANGE);
3570 min = (ecode[1] << 8) + ecode[2];
3571 max = (ecode[3] << 8) + ecode[4];
3572 if (max == 0) max = INT_MAX;
3573 ecode += 5;
3574 break;
3575
3576 default: /* No repeat follows */
3577 if (!match_ref(number, eptr, length, md)) FAIL;
3578 eptr += length;
3579 continue; /* With the main loop */
3580 }
3581
3582 /* If the length of the reference is zero, just continue with the
3583 main loop. */
3584
3585 if (length == 0) continue;
3586
3587 /* First, ensure the minimum number of matches are present. We get back
3588 the length of the reference string explicitly rather than passing the
3589 address of eptr, so that eptr can be a register variable. */
3590
3591 for (i = 1; i <= min; i++)
3592 {
3593 if (!match_ref(number, eptr, length, md)) FAIL;
3594 eptr += length;
3595 }
3596
3597 /* If min = max, continue at the same level without recursion.
3598 They are not both allowed to be zero. */
3599
3600 if (min == max) continue;
3601
3602 /* If minimizing, keep trying and advancing the pointer */
3603
3604 if (minimize)
3605 {
3606 for (i = min;; i++)
3607 {
3608 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3609 if (i >= max || !match_ref(number, eptr, length, md))
3610 FAIL;
3611 eptr += length;
3612 }
3613 /* Control never gets here */
3614 }
3615
3616 /* If maximizing, find the longest string and work backwards */
3617
3618 else
3619 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003620 const uschar *pp = eptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003621 for (i = min; i < max; i++)
3622 {
3623 if (!match_ref(number, eptr, length, md)) break;
3624 eptr += length;
3625 }
3626 while (eptr >= pp)
3627 {
3628 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3629 eptr -= length;
3630 }
3631 FAIL;
3632 }
3633 }
3634 /* Control never gets here */
3635
3636 /* Match a character class, possibly repeatedly. Look past the end of the
3637 item to see if there is repeat information following. Then obey similar
Guido van Rossum50700601997-12-08 17:15:20 +00003638 code to character type repeats - written out again for speed. If caseless
3639 matching was set at runtime but not at compile time, we have to check both
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003640 versions of a character, and we have to behave differently for positive and
3641 negative classes. This is the only time where OP_CLASS and OP_NEGCLASS are
3642 treated differently. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003643
3644 case OP_CLASS:
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003645 case OP_NEGCLASS:
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003646 {
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003647 BOOL nasty_case = *ecode == OP_NEGCLASS && md->runtime_caseless;
Guido van Rossum58132c61997-12-17 00:24:13 +00003648 const uschar *data = ecode + 1; /* Save for matching */
3649 ecode += 33; /* Advance past the item */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003650
3651 switch (*ecode)
3652 {
3653 case OP_CRSTAR:
3654 case OP_CRMINSTAR:
3655 case OP_CRPLUS:
3656 case OP_CRMINPLUS:
3657 case OP_CRQUERY:
3658 case OP_CRMINQUERY:
3659 c = *ecode++ - OP_CRSTAR;
3660 minimize = (c & 1) != 0;
3661 min = rep_min[c]; /* Pick up values from tables; */
3662 max = rep_max[c]; /* zero for max => infinity */
3663 if (max == 0) max = INT_MAX;
3664 break;
3665
3666 case OP_CRRANGE:
3667 case OP_CRMINRANGE:
3668 minimize = (*ecode == OP_CRMINRANGE);
3669 min = (ecode[1] << 8) + ecode[2];
3670 max = (ecode[3] << 8) + ecode[4];
3671 if (max == 0) max = INT_MAX;
3672 ecode += 5;
3673 break;
3674
3675 default: /* No repeat follows */
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003676 min = max = 1;
3677 break;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003678 }
3679
3680 /* First, ensure the minimum number of matches are present. */
3681
3682 for (i = 1; i <= min; i++)
Guido van Rossum50700601997-12-08 17:15:20 +00003683 {
3684 if (eptr >= md->end_subject) FAIL;
3685 c = *eptr++;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003686
3687 /* Either not runtime caseless, or it was a positive class. For
3688 runtime caseless, continue if either case is in the map. */
3689
3690 if (!nasty_case)
Guido van Rossum50700601997-12-08 17:15:20 +00003691 {
Guido van Rossum50700601997-12-08 17:15:20 +00003692 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003693 if (md->runtime_caseless)
3694 {
3695 c = pcre_fcc[c];
3696 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3697 }
Guido van Rossum50700601997-12-08 17:15:20 +00003698 }
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003699
3700 /* Runtime caseless and it was a negative class. Continue only if
3701 both cases are in the map. */
3702
3703 else
3704 {
3705 if ((data[c/8] & (1 << (c&7))) == 0) FAIL;
3706 c = pcre_fcc[c];
3707 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3708 }
3709
3710 FAIL;
Guido van Rossum50700601997-12-08 17:15:20 +00003711 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003712
3713 /* If max == min we can continue with the main loop without the
3714 need to recurse. */
3715
3716 if (min == max) continue;
3717
3718 /* If minimizing, keep testing the rest of the expression and advancing
3719 the pointer while it matches the class. */
3720
3721 if (minimize)
3722 {
3723 for (i = min;; i++)
3724 {
3725 if (match(eptr, ecode, offset_top, md)) SUCCEED;
Guido van Rossum50700601997-12-08 17:15:20 +00003726 if (i >= max || eptr >= md->end_subject) FAIL;
3727 c = *eptr++;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003728
3729 /* Either not runtime caseless, or it was a positive class. For
3730 runtime caseless, continue if either case is in the map. */
3731
3732 if (!nasty_case)
Guido van Rossum50700601997-12-08 17:15:20 +00003733 {
Guido van Rossum50700601997-12-08 17:15:20 +00003734 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003735 if (md->runtime_caseless)
3736 {
3737 c = pcre_fcc[c];
3738 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3739 }
Guido van Rossum50700601997-12-08 17:15:20 +00003740 }
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003741
3742 /* Runtime caseless and it was a negative class. Continue only if
3743 both cases are in the map. */
3744
3745 else
3746 {
3747 if ((data[c/8] & (1 << (c&7))) == 0) return FALSE;
3748 c = pcre_fcc[c];
3749 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3750 }
3751
Guido van Rossum50700601997-12-08 17:15:20 +00003752 FAIL;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003753 }
3754 /* Control never gets here */
3755 }
3756
3757 /* If maximizing, find the longest possible run, then work backwards. */
3758
3759 else
3760 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003761 const uschar *pp = eptr;
Guido van Rossum50700601997-12-08 17:15:20 +00003762 for (i = min; i < max; eptr++, i++)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003763 {
Guido van Rossum50700601997-12-08 17:15:20 +00003764 if (eptr >= md->end_subject) break;
3765 c = *eptr;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003766
3767 /* Either not runtime caseless, or it was a positive class. For
3768 runtime caseless, continue if either case is in the map. */
3769
3770 if (!nasty_case)
Guido van Rossum50700601997-12-08 17:15:20 +00003771 {
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003772 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3773 if (md->runtime_caseless)
3774 {
3775 c = pcre_fcc[c];
3776 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3777 }
3778 }
3779
3780 /* Runtime caseless and it was a negative class. Continue only if
3781 both cases are in the map. */
3782
3783 else
3784 {
3785 if ((data[c/8] & (1 << (c&7))) == 0) break;
Guido van Rossum50700601997-12-08 17:15:20 +00003786 c = pcre_fcc[c];
3787 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3788 }
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003789
Guido van Rossum50700601997-12-08 17:15:20 +00003790 break;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003791 }
Guido van Rossum50700601997-12-08 17:15:20 +00003792
3793 while (eptr >= pp)
3794 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
3795 FAIL;
3796 }
3797 }
3798 /* Control never gets here */
3799
3800 /* OP_CLASS_L opcode: handles localized character classes */
3801
3802 case OP_CLASS_L:
3803 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003804 const uschar *data = ecode + 1; /* Save for matching */
3805 const uschar locale_flag = *data;
Guido van Rossum50700601997-12-08 17:15:20 +00003806 ecode++; data++; /* The localization support adds an extra byte */
3807
3808 ecode += 33; /* Advance past the item */
3809
3810 switch (*ecode)
3811 {
3812 case OP_CRSTAR:
3813 case OP_CRMINSTAR:
3814 case OP_CRPLUS:
3815 case OP_CRMINPLUS:
3816 case OP_CRQUERY:
3817 case OP_CRMINQUERY:
3818 c = *ecode++ - OP_CRSTAR;
3819 minimize = (c & 1) != 0;
3820 min = rep_min[c]; /* Pick up values from tables; */
3821 max = rep_max[c]; /* zero for max => infinity */
3822 if (max == 0) max = INT_MAX;
3823 break;
3824
3825 case OP_CRRANGE:
3826 case OP_CRMINRANGE:
3827 minimize = (*ecode == OP_CRMINRANGE);
3828 min = (ecode[1] << 8) + ecode[2];
3829 max = (ecode[3] << 8) + ecode[4];
3830 if (max == 0) max = INT_MAX;
3831 ecode += 5;
3832 break;
3833
3834 default: /* No repeat follows */
3835 if (eptr >= md->end_subject) FAIL;
3836 c = *eptr++;
3837 if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */
Guido van Rossum58132c61997-12-17 00:24:13 +00003838 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3839 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003840#if 0
3841 if ( (locale_flag & 4) && isdigit(c) ) continue; /* Locale \d */
3842 if ( (locale_flag & 8) && !isdigit(c) ) continue; /* Locale \D */
3843 if ( (locale_flag & 16) && isspace(c) ) continue; /* Locale \s */
3844 if ( (locale_flag & 32) && !isspace(c) ) continue; /* Locale \S */
3845#endif
3846
3847 if (md->runtime_caseless)
3848 {
3849 c = pcre_fcc[c];
3850 if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */
3851
Guido van Rossum58132c61997-12-17 00:24:13 +00003852 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3853 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003854 }
3855 FAIL;
3856 }
3857
3858 /* First, ensure the minimum number of matches are present. */
3859
3860 for (i = 1; i <= min; i++)
3861 {
3862 if (eptr >= md->end_subject) FAIL;
3863 c = *eptr++;
3864 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum58132c61997-12-17 00:24:13 +00003865 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3866 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003867
3868 if (md->runtime_caseless)
3869 {
3870 c = pcre_fcc[c];
3871 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum58132c61997-12-17 00:24:13 +00003872 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3873 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003874 }
3875 FAIL;
3876 }
3877
3878 /* If max == min we can continue with the main loop without the
3879 need to recurse. */
3880
3881 if (min == max) continue;
3882
3883 /* If minimizing, keep testing the rest of the expression and advancing
3884 the pointer while it matches the class. */
3885
3886 if (minimize)
3887 {
3888 for (i = min;; i++)
3889 {
3890 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3891 if (i >= max || eptr >= md->end_subject) FAIL;
3892 c = *eptr++;
3893 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum58132c61997-12-17 00:24:13 +00003894 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3895 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003896
3897 if (md->runtime_caseless)
3898 {
3899 c = pcre_fcc[c];
3900 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum58132c61997-12-17 00:24:13 +00003901 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3902 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003903 }
3904 FAIL;
3905 }
3906 /* Control never gets here */
3907 }
3908
3909 /* If maximizing, find the longest possible run, then work backwards. */
3910
3911 else
3912 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003913 const uschar *pp = eptr;
Guido van Rossum50700601997-12-08 17:15:20 +00003914 for (i = min; i < max; eptr++, i++)
3915 {
3916 if (eptr >= md->end_subject) break;
3917 c = *eptr;
3918 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum58132c61997-12-17 00:24:13 +00003919 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3920 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003921 if (md->runtime_caseless)
3922 {
3923 c = pcre_fcc[c];
3924 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum58132c61997-12-17 00:24:13 +00003925 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3926 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003927 }
3928 break;
3929 }
3930
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003931 while (eptr >= pp)
3932 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
3933 FAIL;
3934 }
3935 }
3936 /* Control never gets here */
3937
3938 /* Match a run of characters */
3939
3940 case OP_CHARS:
3941 {
3942 register int length = ecode[1];
3943 ecode += 2;
3944
Guido van Rossum557dea11997-12-22 22:46:52 +00003945#ifdef DEBUG /* Sigh. Some compilers never learn. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003946 if (eptr >= md->end_subject)
3947 printf("matching subject <null> against pattern ");
3948 else
3949 {
3950 printf("matching subject ");
3951 pchars(eptr, length, TRUE, md);
3952 printf(" against pattern ");
3953 }
3954 pchars(ecode, length, FALSE, md);
3955 printf("\n");
Guido van Rossum57ba4f31997-12-02 20:40:28 +00003956#endif
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003957
3958 if (length > md->end_subject - eptr) FAIL;
3959 if (md->caseless)
3960 {
3961 while (length-- > 0) if (pcre_lcc[*ecode++] != pcre_lcc[*eptr++]) FAIL;
3962 }
3963 else
3964 {
3965 while (length-- > 0) if (*ecode++ != *eptr++) FAIL;
3966 }
3967 }
3968 break;
3969
3970 /* Match a single character repeatedly; different opcodes share code. */
3971
3972 case OP_EXACT:
3973 min = max = (ecode[1] << 8) + ecode[2];
3974 ecode += 3;
3975 goto REPEATCHAR;
3976
3977 case OP_UPTO:
3978 case OP_MINUPTO:
3979 min = 0;
3980 max = (ecode[1] << 8) + ecode[2];
3981 minimize = *ecode == OP_MINUPTO;
3982 ecode += 3;
3983 goto REPEATCHAR;
3984
3985 case OP_STAR:
3986 case OP_MINSTAR:
3987 case OP_PLUS:
3988 case OP_MINPLUS:
3989 case OP_QUERY:
3990 case OP_MINQUERY:
3991 c = *ecode++ - OP_STAR;
3992 minimize = (c & 1) != 0;
3993 min = rep_min[c]; /* Pick up values from tables; */
3994 max = rep_max[c]; /* zero for max => infinity */
3995 if (max == 0) max = INT_MAX;
3996
3997 /* Common code for all repeated single-character matches. We can give
3998 up quickly if there are fewer than the minimum number of characters left in
3999 the subject. */
4000
4001 REPEATCHAR:
4002 if (min > md->end_subject - eptr) FAIL;
4003 c = *ecode++;
4004
4005 /* The code is duplicated for the caseless and caseful cases, for speed,
4006 since matching characters is likely to be quite common. First, ensure the
4007 minimum number of matches are present. If min = max, continue at the same
4008 level without recursing. Otherwise, if minimizing, keep trying the rest of
4009 the expression and advancing one matching character if failing, up to the
4010 maximum. Alternatively, if maximizing, find the maximum number of
4011 characters and work backwards. */
4012
Guido van Rossum557dea11997-12-22 22:46:52 +00004013 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
4014 max, eptr));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004015
4016 if (md->caseless)
4017 {
4018 c = pcre_lcc[c];
4019 for (i = 1; i <= min; i++) if (c != pcre_lcc[*eptr++]) FAIL;
4020 if (min == max) continue;
4021 if (minimize)
4022 {
4023 for (i = min;; i++)
4024 {
4025 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4026 if (i >= max || eptr >= md->end_subject || c != pcre_lcc[*eptr++])
4027 FAIL;
4028 }
4029 /* Control never gets here */
4030 }
4031 else
4032 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004033 const uschar *pp = eptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004034 for (i = min; i < max; i++)
4035 {
4036 if (eptr >= md->end_subject || c != pcre_lcc[*eptr]) break;
4037 eptr++;
4038 }
4039 while (eptr >= pp)
4040 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4041 FAIL;
4042 }
Guido van Rossum50700601997-12-08 17:15:20 +00004043 /* Control never gets here */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004044 }
4045
4046 /* Caseful comparisons */
4047
4048 else
4049 {
4050 for (i = 1; i <= min; i++) if (c != *eptr++) FAIL;
4051 if (min == max) continue;
4052 if (minimize)
4053 {
4054 for (i = min;; i++)
4055 {
4056 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4057 if (i >= max || eptr >= md->end_subject || c != *eptr++) FAIL;
4058 }
4059 /* Control never gets here */
4060 }
4061 else
4062 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004063 const uschar *pp = eptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004064 for (i = min; i < max; i++)
4065 {
4066 if (eptr >= md->end_subject || c != *eptr) break;
4067 eptr++;
4068 }
4069 while (eptr >= pp)
4070 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4071 FAIL;
4072 }
4073 }
4074 /* Control never gets here */
4075
Guido van Rossum50700601997-12-08 17:15:20 +00004076 /* Match a negated single character */
4077
4078 case OP_NOT:
Guido van Rossum557dea11997-12-22 22:46:52 +00004079 if (eptr >= md->end_subject) FAIL;
Guido van Rossum50700601997-12-08 17:15:20 +00004080 ecode++;
4081 if (md->caseless)
4082 {
4083 if (pcre_lcc[*ecode++] == pcre_lcc[*eptr++]) FAIL;
4084 }
4085 else
4086 {
4087 if (*ecode++ == *eptr++) FAIL;
4088 }
4089 break;
4090
4091 /* Match a negated single character repeatedly. This is almost a repeat of
4092 the code for a repeated single character, but I haven't found a nice way of
4093 commoning these up that doesn't require a test of the positive/negative
4094 option for each character match. Maybe that wouldn't add very much to the
4095 time taken, but character matching *is* what this is all about... */
4096
4097 case OP_NOTEXACT:
4098 min = max = (ecode[1] << 8) + ecode[2];
4099 ecode += 3;
4100 goto REPEATNOTCHAR;
4101
4102 case OP_NOTUPTO:
4103 case OP_NOTMINUPTO:
4104 min = 0;
4105 max = (ecode[1] << 8) + ecode[2];
4106 minimize = *ecode == OP_NOTMINUPTO;
4107 ecode += 3;
4108 goto REPEATNOTCHAR;
4109
4110 case OP_NOTSTAR:
4111 case OP_NOTMINSTAR:
4112 case OP_NOTPLUS:
4113 case OP_NOTMINPLUS:
4114 case OP_NOTQUERY:
4115 case OP_NOTMINQUERY:
4116 c = *ecode++ - OP_NOTSTAR;
4117 minimize = (c & 1) != 0;
4118 min = rep_min[c]; /* Pick up values from tables; */
4119 max = rep_max[c]; /* zero for max => infinity */
4120 if (max == 0) max = INT_MAX;
4121
4122 /* Common code for all repeated single-character matches. We can give
4123 up quickly if there are fewer than the minimum number of characters left in
4124 the subject. */
4125
4126 REPEATNOTCHAR:
4127 if (min > md->end_subject - eptr) FAIL;
4128 c = *ecode++;
4129
4130 /* The code is duplicated for the caseless and caseful cases, for speed,
4131 since matching characters is likely to be quite common. First, ensure the
4132 minimum number of matches are present. If min = max, continue at the same
4133 level without recursing. Otherwise, if minimizing, keep trying the rest of
4134 the expression and advancing one matching character if failing, up to the
4135 maximum. Alternatively, if maximizing, find the maximum number of
4136 characters and work backwards. */
4137
Guido van Rossum557dea11997-12-22 22:46:52 +00004138 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
4139 max, eptr));
Guido van Rossum50700601997-12-08 17:15:20 +00004140
4141 if (md->caseless)
4142 {
4143 c = pcre_lcc[c];
4144 for (i = 1; i <= min; i++) if (c == pcre_lcc[*eptr++]) FAIL;
4145 if (min == max) continue;
4146 if (minimize)
4147 {
4148 for (i = min;; i++)
4149 {
4150 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4151 if (i >= max || eptr >= md->end_subject || c == pcre_lcc[*eptr++])
4152 FAIL;
4153 }
4154 /* Control never gets here */
4155 }
4156 else
4157 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004158 const uschar *pp = eptr;
Guido van Rossum50700601997-12-08 17:15:20 +00004159 for (i = min; i < max; i++)
4160 {
4161 if (eptr >= md->end_subject || c == pcre_lcc[*eptr]) break;
4162 eptr++;
4163 }
4164 while (eptr >= pp)
4165 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4166 FAIL;
4167 }
4168 /* Control never gets here */
4169 }
4170
4171 /* Caseful comparisons */
4172
4173 else
4174 {
4175 for (i = 1; i <= min; i++) if (c == *eptr++) FAIL;
4176 if (min == max) continue;
4177 if (minimize)
4178 {
4179 for (i = min;; i++)
4180 {
4181 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4182 if (i >= max || eptr >= md->end_subject || c == *eptr++) FAIL;
4183 }
4184 /* Control never gets here */
4185 }
4186 else
4187 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004188 const uschar *pp = eptr;
Guido van Rossum50700601997-12-08 17:15:20 +00004189 for (i = min; i < max; i++)
4190 {
4191 if (eptr >= md->end_subject || c == *eptr) break;
4192 eptr++;
4193 }
4194 while (eptr >= pp)
4195 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4196 FAIL;
4197 }
4198 }
4199 /* Control never gets here */
4200
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004201 /* Match a single character type repeatedly; several different opcodes
4202 share code. This is very similar to the code for single characters, but we
4203 repeat it in the interests of efficiency. */
4204
4205 case OP_TYPEEXACT:
4206 min = max = (ecode[1] << 8) + ecode[2];
4207 minimize = TRUE;
4208 ecode += 3;
4209 goto REPEATTYPE;
4210
4211 case OP_TYPEUPTO:
4212 case OP_TYPEMINUPTO:
4213 min = 0;
4214 max = (ecode[1] << 8) + ecode[2];
4215 minimize = *ecode == OP_TYPEMINUPTO;
4216 ecode += 3;
4217 goto REPEATTYPE;
4218
4219 case OP_TYPESTAR:
4220 case OP_TYPEMINSTAR:
4221 case OP_TYPEPLUS:
4222 case OP_TYPEMINPLUS:
4223 case OP_TYPEQUERY:
4224 case OP_TYPEMINQUERY:
4225 c = *ecode++ - OP_TYPESTAR;
4226 minimize = (c & 1) != 0;
4227 min = rep_min[c]; /* Pick up values from tables; */
4228 max = rep_max[c]; /* zero for max => infinity */
4229 if (max == 0) max = INT_MAX;
4230
4231 /* Common code for all repeated single character type matches */
4232
4233 REPEATTYPE:
4234 ctype = *ecode++; /* Code for the character type */
4235
4236 /* First, ensure the minimum number of matches are present. Use inline
4237 code for maximizing the speed, and do the type test once at the start
4238 (i.e. keep it out of the loop). Also test that there are at least the
4239 minimum number of characters before we start. */
4240
4241 if (min > md->end_subject - eptr) FAIL;
4242 if (min > 0) switch(ctype)
4243 {
4244 case OP_ANY:
4245 if (!md->dotall)
4246 { for (i = 1; i <= min; i++) if (*eptr++ == '\n') FAIL; }
4247 else eptr += min;
4248 break;
4249
4250 case OP_NOT_DIGIT:
4251 for (i = 1; i <= min; i++)
4252 if ((pcre_ctypes[*eptr++] & ctype_digit) != 0) FAIL;
4253 break;
4254
4255 case OP_DIGIT:
4256 for (i = 1; i <= min; i++)
4257 if ((pcre_ctypes[*eptr++] & ctype_digit) == 0) FAIL;
4258 break;
4259
4260 case OP_NOT_WHITESPACE:
4261 for (i = 1; i <= min; i++)
4262 if ((pcre_ctypes[*eptr++] & ctype_space) != 0) FAIL;
4263 break;
4264
4265 case OP_WHITESPACE:
4266 for (i = 1; i <= min; i++)
4267 if ((pcre_ctypes[*eptr++] & ctype_space) == 0) FAIL;
4268 break;
4269
4270 case OP_NOT_WORDCHAR:
4271 for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) != 0)
4272 FAIL;
4273 break;
4274
4275 case OP_WORDCHAR:
4276 for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) == 0)
4277 FAIL;
4278 break;
Guido van Rossum50700601997-12-08 17:15:20 +00004279
4280 case OP_NOT_WORDCHAR_L:
Guido van Rossum58132c61997-12-17 00:24:13 +00004281 for (i = 1; i <= min; i++, eptr++) if (*eptr=='_' || isalnum(*eptr))
Guido van Rossum557dea11997-12-22 22:46:52 +00004282 FAIL;
Guido van Rossum50700601997-12-08 17:15:20 +00004283 break;
4284
4285 case OP_WORDCHAR_L:
Guido van Rossum58132c61997-12-17 00:24:13 +00004286 for (i = 1; i <= min; i++, eptr++) if (*eptr!='_' && !isalnum(*eptr))
Guido van Rossum557dea11997-12-22 22:46:52 +00004287 FAIL;
Guido van Rossum50700601997-12-08 17:15:20 +00004288 break;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004289 }
4290
4291 /* If min = max, continue at the same level without recursing */
4292
4293 if (min == max) continue;
4294
4295 /* If minimizing, we have to test the rest of the pattern before each
4296 subsequent match, so inlining isn't much help; just use the function. */
4297
4298 if (minimize)
4299 {
4300 for (i = min;; i++)
4301 {
4302 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4303 if (i >= max || eptr >= md->end_subject ||
4304 !match_type(ctype, *eptr++, md->dotall))
4305 FAIL;
4306 }
4307 /* Control never gets here */
4308 }
4309
4310 /* If maximizing it is worth using inline code for speed, doing the type
4311 test once at the start (i.e. keep it out of the loop). */
4312
4313 else
4314 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004315 const uschar *pp = eptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004316 switch(ctype)
4317 {
4318 case OP_ANY:
4319 if (!md->dotall)
4320 {
4321 for (i = min; i < max; i++)
4322 {
4323 if (eptr >= md->end_subject || *eptr == '\n') break;
4324 eptr++;
4325 }
4326 }
4327 else
4328 {
4329 c = max - min;
4330 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
4331 eptr += c;
4332 }
4333 break;
4334
4335 case OP_NOT_DIGIT:
4336 for (i = min; i < max; i++)
4337 {
4338 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) != 0)
4339 break;
4340 eptr++;
4341 }
4342 break;
4343
4344 case OP_DIGIT:
4345 for (i = min; i < max; i++)
4346 {
4347 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) == 0)
4348 break;
4349 eptr++;
4350 }
4351 break;
4352
4353 case OP_NOT_WHITESPACE:
4354 for (i = min; i < max; i++)
4355 {
4356 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) != 0)
4357 break;
4358 eptr++;
4359 }
4360 break;
4361
4362 case OP_WHITESPACE:
4363 for (i = min; i < max; i++)
4364 {
4365 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) == 0)
4366 break;
4367 eptr++;
4368 }
4369 break;
4370
4371 case OP_NOT_WORDCHAR:
4372 for (i = min; i < max; i++)
4373 {
4374 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) != 0)
4375 break;
4376 eptr++;
4377 }
4378 break;
4379
4380 case OP_WORDCHAR:
4381 for (i = min; i < max; i++)
4382 {
Guido van Rossum50700601997-12-08 17:15:20 +00004383 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) == 0)
4384 break;
4385 eptr++;
4386 }
4387 break;
4388 case OP_NOT_WORDCHAR_L:
4389 for (i = min; i < max; i++)
4390 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004391 if (eptr >= md->end_subject || (*eptr=='_' || isalnum(*eptr) ) )
Guido van Rossum50700601997-12-08 17:15:20 +00004392 break;
4393 eptr++;
4394 }
4395 break;
4396
4397 case OP_WORDCHAR_L:
4398 for (i = min; i < max; i++)
4399 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004400 if (eptr >= md->end_subject || (*eptr!='_' && !isalnum(*eptr) ) )
Guido van Rossum50700601997-12-08 17:15:20 +00004401 break;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004402 eptr++;
4403 }
4404 break;
4405 }
4406
4407 while (eptr >= pp)
4408 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4409 FAIL;
4410 }
4411 /* Control never gets here */
4412
4413 /* There's been some horrible disaster. */
4414
4415 default:
Guido van Rossum557dea11997-12-22 22:46:52 +00004416 DPRINTF(("Unknown opcode %d\n", *ecode));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004417 md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
4418 FAIL;
4419 }
4420
4421 /* Do not stick any code in here without much thought; it is assumed
4422 that "continue" in the code above comes out to here to repeat the main
4423 loop. */
4424
4425 } /* End of main loop */
4426/* Control never reaches here */
4427
4428fail:
4429 if (md->point > save_stack_position)
4430 {
4431 /* If there are still points remaining on the stack, pop the next one off */
Guido van Rossumc3861071997-10-08 02:07:40 +00004432 int off_num;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004433
4434 md->point--;
4435 offset_top = md->offset_top[md->point];
4436 eptr = md->eptr[md->point];
4437 ecode = md->ecode[md->point];
4438 off_num = md->off_num[md->point];
4439 md->offset_vector[off_num] = md->r1[md->point];
4440 md->offset_vector[off_num+1] = md->r2[md->point];
4441 goto match_loop;
4442 }
4443 /* Failure, and nothing left on the stack, so end this function call */
4444
4445 /* Restore the top of the stack to where it was before this function
4446 call. This lets us use one stack for everything; recursive calls
4447 can push and pop information, and may increase the stack. When
4448 the call returns, the parent function can resume pushing and
4449 popping wherever it was. */
4450
4451 md->point = save_stack_position;
4452 return FALSE;
4453
4454succeed:
4455 return TRUE;
4456}
4457
4458
Guido van Rossum50700601997-12-08 17:15:20 +00004459
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004460/*************************************************
Guido van Rossum557dea11997-12-22 22:46:52 +00004461* Segregate setjmp() *
4462*************************************************/
4463
4464/* The -Wall option of gcc gives warnings for all local variables when setjmp()
4465is used, even if the coding conforms to the rules of ANSI C. To avoid this, we
4466hide it in a separate function. This is called only when PCRE_EXTRA is set,
4467since it's needed only for the extension \X option, and with any luck, a good
4468compiler will spot the tail recursion and compile it efficiently.
4469
4470Arguments:
4471 eptr pointer in subject
4472 ecode position in code
4473 offset_top current top pointer
4474 md pointer to "static" info for the match
4475
4476Returns: TRUE if matched
4477*/
4478
4479static BOOL
4480match_with_setjmp(const uschar *eptr, const uschar *ecode, int offset_top,
4481 match_data *match_block)
4482{
4483return setjmp(match_block->fail_env) == 0 &&
4484 match(eptr, ecode, offset_top, match_block);
4485}
4486
4487
4488
4489/*************************************************
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004490* Execute a Regular Expression *
4491*************************************************/
4492
4493/* This function applies a compiled re to a subject string and picks out
4494portions of the string if it matches. Two elements in the vector are set for
4495each substring: the offsets to the start and end of the substring.
4496
4497Arguments:
Guido van Rossum50700601997-12-08 17:15:20 +00004498 external_re points to the compiled expression
4499 external_extra points to "hints" from pcre_study() or is NULL
4500 subject points to the subject string
4501 length length of subject string (may contain binary zeros)
4502 options option bits
4503 offsets points to a vector of ints to be filled in with offsets
4504 offsetcount the number of elements in the vector
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004505
Guido van Rossum50700601997-12-08 17:15:20 +00004506Returns: > 0 => success; value is the number of elements filled in
4507 = 0 => success, but offsets is not big enough
4508 -1 => failed to match
4509 < -1 => some kind of unexpected problem
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004510*/
4511
4512int
Guido van Rossum50700601997-12-08 17:15:20 +00004513pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
Guido van Rossum816671c1998-03-10 04:55:29 +00004514 const char *subject, int length, int start_pos, int options,
4515 int *offsets, int offsetcount)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004516{
Guido van Rossum58132c61997-12-17 00:24:13 +00004517 /* The "volatile" directives are to make gcc -Wall stop complaining
4518 that these variables can be clobbered by the longjmp. Hopefully
4519 they won't cost too much performance. */
Guido van Rossum042ff9e1998-04-03 21:13:31 +00004520volatile int resetcount, ocount;
4521volatile int first_char = -1;
Tim Peters54925f92000-07-05 22:56:52 +00004522const uschar * volatile start_bits = NULL;
4523const uschar * volatile start_match = (const uschar *)subject + start_pos;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004524match_data match_block;
Guido van Rossum58132c61997-12-17 00:24:13 +00004525const uschar *end_subject;
4526const real_pcre *re = (const real_pcre *)external_re;
4527const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00004528volatile BOOL using_temporary_offsets = FALSE;
4529volatile BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4530volatile BOOL startline = (re->options & PCRE_STARTLINE) != 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004531
4532if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4533
4534if (re == NULL || subject == NULL ||
4535 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4536if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
4537
Guido van Rossum58132c61997-12-17 00:24:13 +00004538match_block.start_subject = (const uschar *)subject;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004539match_block.end_subject = match_block.start_subject + length;
4540end_subject = match_block.end_subject;
4541
Guido van Rossum50700601997-12-08 17:15:20 +00004542match_block.caseless = ((re->options | options) & PCRE_CASELESS) != 0;
4543match_block.runtime_caseless = match_block.caseless &&
4544 (re->options & PCRE_CASELESS) == 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004545
Guido van Rossum50700601997-12-08 17:15:20 +00004546match_block.multiline = ((re->options | options) & PCRE_MULTILINE) != 0;
4547match_block.dotall = ((re->options | options) & PCRE_DOTALL) != 0;
4548match_block.endonly = ((re->options | options) & PCRE_DOLLAR_ENDONLY) != 0;
4549
4550match_block.notbol = (options & PCRE_NOTBOL) != 0;
4551match_block.noteol = (options & PCRE_NOTEOL) != 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004552
4553match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
4554
4555/* Set the stack state to empty */
4556 match_block.off_num = match_block.offset_top = NULL;
4557 match_block.r1 = match_block.r2 = NULL;
4558 match_block.eptr = match_block.ecode = NULL;
4559 match_block.point = match_block.length = 0;
4560
Guido van Rossum50700601997-12-08 17:15:20 +00004561/* If the expression has got more back references than the offsets supplied can
4562hold, we get a temporary bit of working store to use during the matching.
Guido van Rossum557dea11997-12-22 22:46:52 +00004563Otherwise, we can use the vector supplied, rounding down its size to a multiple
4564of 2. */
Guido van Rossum50700601997-12-08 17:15:20 +00004565
Guido van Rossum557dea11997-12-22 22:46:52 +00004566ocount = offsetcount & (-2);
4567if (re->top_backref > 0 && re->top_backref >= ocount/2)
Guido van Rossum50700601997-12-08 17:15:20 +00004568 {
4569 ocount = re->top_backref * 2 + 2;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00004570 match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
Guido van Rossum50700601997-12-08 17:15:20 +00004571 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
Guido van Rossum557dea11997-12-22 22:46:52 +00004572 using_temporary_offsets = TRUE;
4573 DPRINTF(("Got memory to hold back references\n"));
Guido van Rossum50700601997-12-08 17:15:20 +00004574 }
4575else match_block.offset_vector = offsets;
4576
4577match_block.offset_end = ocount;
4578match_block.offset_overflow = FALSE;
4579
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004580/* Compute the minimum number of offsets that we need to reset each time. Doing
4581this makes a huge difference to execution time when there aren't many brackets
4582in the pattern. */
4583
4584resetcount = 2 + re->top_bracket * 2;
Guido van Rossum50700601997-12-08 17:15:20 +00004585if (resetcount > offsetcount) resetcount = ocount;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004586
4587/* If MULTILINE is set at exec time but was not set at compile time, and the
4588anchored flag is set, we must re-check because a setting provoked by ^ in the
4589pattern is not right in multi-line mode. Calling is_anchored() again here does
4590the right check, because multiline is now set. If it now yields FALSE, the
4591expression must have had ^ starting some of its branches. Check to see if
4592that is true for *all* branches, and if so, set the startline flag. */
4593
Guido van Rossum557dea11997-12-22 22:46:52 +00004594if (match_block.multiline && anchored && (re->options & PCRE_MULTILINE) == 0 &&
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004595 !is_anchored(re->code, match_block.multiline))
4596 {
4597 anchored = FALSE;
4598 if (is_startline(re->code)) startline = TRUE;
4599 }
4600
4601/* Set up the first character to match, if available. The first_char value is
4602never set for an anchored regular expression, but the anchoring may be forced
4603at run time, so we have to test for anchoring. The first char may be unset for
4604an unanchored pattern, of course. If there's no first char and the pattern was
4605studied, the may be a bitmap of possible first characters. However, we can
4606use this only if the caseless state of the studying was correct. */
4607
4608if (!anchored)
4609 {
4610 if ((re->options & PCRE_FIRSTSET) != 0)
4611 {
4612 first_char = re->first_char;
4613 if (match_block.caseless) first_char = pcre_lcc[first_char];
4614 }
4615 else
4616 if (!startline && extra != NULL &&
4617 (extra->options & PCRE_STUDY_MAPPED) != 0 &&
4618 ((extra->options & PCRE_STUDY_CASELESS) != 0) == match_block.caseless)
4619 start_bits = extra->start_bits;
4620 }
4621
4622/* Loop for unanchored matches; for anchored regexps the loop runs just once. */
4623
4624do
4625 {
Guido van Rossum557dea11997-12-22 22:46:52 +00004626 int rc;
Guido van Rossum50700601997-12-08 17:15:20 +00004627 register int *iptr = match_block.offset_vector;
4628 register int *iend = iptr + resetcount;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004629
4630 /* Reset the maximum number of extractions we might see. */
4631
4632 while (iptr < iend) *iptr++ = -1;
4633
4634 /* Advance to a unique first char if possible */
4635
4636 if (first_char >= 0)
4637 {
4638 if (match_block.caseless)
4639 while (start_match < end_subject && pcre_lcc[*start_match] != first_char)
4640 start_match++;
4641 else
4642 while (start_match < end_subject && *start_match != first_char)
4643 start_match++;
4644 }
4645
4646 /* Or to just after \n for a multiline match if possible */
4647
4648 else if (startline)
4649 {
4650 if (start_match > match_block.start_subject)
4651 {
4652 while (start_match < end_subject && start_match[-1] != '\n')
4653 start_match++;
4654 }
4655 }
4656
4657 /* Or to a non-unique first char */
4658
4659 else if (start_bits != NULL)
4660 {
4661 while (start_match < end_subject)
4662 {
4663 register int c = *start_match;
Guido van Rossum50700601997-12-08 17:15:20 +00004664 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004665 }
4666 }
4667
Guido van Rossum557dea11997-12-22 22:46:52 +00004668#ifdef DEBUG /* Sigh. Some compilers never learn. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004669 printf(">>>> Match against: ");
4670 pchars(start_match, end_subject - start_match, TRUE, &match_block);
4671 printf("\n");
Guido van Rossum57ba4f31997-12-02 20:40:28 +00004672#endif
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004673
4674 /* When a match occurs, substrings will be set for all internal extractions;
4675 we just need to set up the whole thing as substring 0 before returning. If
Guido van Rossum50700601997-12-08 17:15:20 +00004676 there were too many extractions, set the return code to zero. In the case
4677 where we had to get some local store to hold offsets for backreferences, copy
4678 those back references that we can. In this case there need not be overflow
4679 if certain parts of the pattern were not used.
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004680
Guido van Rossum50700601997-12-08 17:15:20 +00004681 Before starting the match, we have to set up a longjmp() target to enable
Guido van Rossum557dea11997-12-22 22:46:52 +00004682 the "cut" operation to fail a match completely without backtracking. This
4683 is done in a separate function to avoid compiler warnings. We need not do
4684 it unless PCRE_EXTRA is set, since only in that case is the "cut" operation
4685 enabled. */
Guido van Rossum50700601997-12-08 17:15:20 +00004686
4687 /* To handle errors such as running out of memory for the failure
4688 stack, we need to save this location via setjmp(), so
4689 error-handling code can call longjmp() to jump out of deeply-nested code. */
4690 if (setjmp(match_block.error_env)==0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004691 {
Guido van Rossum50700601997-12-08 17:15:20 +00004692
Guido van Rossum557dea11997-12-22 22:46:52 +00004693 if ((re->options & PCRE_EXTRA) != 0)
Guido van Rossum50700601997-12-08 17:15:20 +00004694 {
Guido van Rossum557dea11997-12-22 22:46:52 +00004695 if (!match_with_setjmp(start_match, re->code, 2, &match_block))
4696 continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004697 }
Guido van Rossum557dea11997-12-22 22:46:52 +00004698 else if (!match(start_match, re->code, 2, &match_block)) continue;
4699
4700 /* Copy the offset information from temporary store if necessary */
4701
4702 if (using_temporary_offsets)
4703 {
4704 if (offsetcount >= 4)
4705 {
4706 memcpy(offsets + 2, match_block.offset_vector + 2,
4707 (offsetcount - 2) * sizeof(int));
4708 DPRINTF(("Copied offsets from temporary memory\n"));
4709 }
4710 if (match_block.end_offset_top > offsetcount)
4711 match_block.offset_overflow = TRUE;
4712
4713 DPRINTF(("Freeing temporary memory\n"));
4714 (pcre_free)(match_block.offset_vector);
4715 }
4716
4717 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
4718
4719 if (match_block.offset_end < 2) rc = 0; else
4720 {
4721 offsets[0] = start_match - match_block.start_subject;
4722 offsets[1] = match_block.end_match_ptr - match_block.start_subject;
4723 }
4724
4725 DPRINTF((">>>> returning %d\n", rc));
4726 free_stack(&match_block);
4727 return rc;
Guido van Rossum50700601997-12-08 17:15:20 +00004728 } /* End of (if setjmp(match_block.error_env)...) */
Guido van Rossum042ff9e1998-04-03 21:13:31 +00004729 free_stack(&match_block);
4730
Guido van Rossum50700601997-12-08 17:15:20 +00004731 /* Return an error code; pcremodule.c will preserve the exception */
4732 if (PyErr_Occurred()) return PCRE_ERROR_NOMEMORY;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004733 }
4734while (!anchored &&
4735 match_block.errorcode == PCRE_ERROR_NOMATCH &&
4736 start_match++ < end_subject);
4737
Guido van Rossum557dea11997-12-22 22:46:52 +00004738if (using_temporary_offsets)
4739 {
4740 DPRINTF(("Freeing temporary memory\n"));
4741 (pcre_free)(match_block.offset_vector);
4742 }
4743
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004744#ifdef DEBUG
4745printf(">>>> returning %d\n", match_block.errorcode);
4746#endif
Guido van Rossum50700601997-12-08 17:15:20 +00004747
Barry Warsawb80667d1999-01-27 21:41:08 +00004748 free_stack(&match_block);
Guido van Rossum557dea11997-12-22 22:46:52 +00004749 return match_block.errorcode;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004750}
4751
4752/* End of pcre.c */