blob: 55908e71ee3f7e2444faef4e5fedbc63e954d511 [file] [log] [blame]
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001
2/*************************************************
3* Perl-Compatible Regular Expressions *
4*************************************************/
5
6/* DO NOT EDIT THIS FILE! */
7
8/* This file is automatically written by the merge-files.py script
9included with the PCRE distribution for Python; it's produced from
10several C files, and code is removed in the process. If you want to
11modify the code or track down bugs, it will be much easier to work
12with the code in its original, multiple-file form. Don't edit this
13file by hand, or submit patches to it.
14
15The Python-specific PCRE distribution can be retrieved from
16 http://starship.skyport.net/crew/amk/regex/
17
Guido van Rossum58132c61997-12-17 00:24:13 +000018The unmodified original PCRE distribution is available at
19ftp://ftp.cus.cam.ac.uk/pub/software/programs/pcre/, and is originally
20written by: Philip Hazel <ph10@cam.ac.uk>
Guido van Rossum51b3aa31997-10-06 14:43:11 +000021
22Extensively modified by the Python String-SIG: <string-sig@python.org>
23Send bug reports to: <string-sig@python.org>
24(They'll figure out if it's a bug in PCRE or in the Python-specific
25changes.)
26
27 Copyright (c) 1997 University of Cambridge
28
29-----------------------------------------------------------------------------
30Permission is granted to anyone to use this software for any purpose on any
31computer system, and to redistribute it freely, subject to the following
32restrictions:
33
341. This software is distributed in the hope that it will be useful,
35 but WITHOUT ANY WARRANTY; without even the implied warranty of
36 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
37
382. The origin of this software must not be misrepresented, either by
39 explicit claim or by omission.
40
413. Altered versions must be plainly marked as such, and must not be
42 misrepresented as being the original software.
43-----------------------------------------------------------------------------
44*/
45
46
47#define FOR_PYTHON
Guido van Rossum58132c61997-12-17 00:24:13 +000048#include "pcre-int.h"
Guido van Rossum51b3aa31997-10-06 14:43:11 +000049#include "Python.h"
Guido van Rossum50700601997-12-08 17:15:20 +000050#include "mymalloc.h"
51#include <ctype.h>
Guido van Rossum51b3aa31997-10-06 14:43:11 +000052#include "graminit.h"
53
54/*************************************************
55* Perl-Compatible Regular Expressions *
56*************************************************/
57
58/* This file is automatically written by the makechartables auxiliary
59program. If you edit it by hand, you might like to edit the Makefile to
60prevent its ever being regenerated. */
61
62/* This table is a lower casing table. */
63
64unsigned char pcre_lcc[] = {
65 0, 1, 2, 3, 4, 5, 6, 7,
66 8, 9, 10, 11, 12, 13, 14, 15,
67 16, 17, 18, 19, 20, 21, 22, 23,
68 24, 25, 26, 27, 28, 29, 30, 31,
69 32, 33, 34, 35, 36, 37, 38, 39,
70 40, 41, 42, 43, 44, 45, 46, 47,
71 48, 49, 50, 51, 52, 53, 54, 55,
72 56, 57, 58, 59, 60, 61, 62, 63,
73 64, 97, 98, 99,100,101,102,103,
74 104,105,106,107,108,109,110,111,
75 112,113,114,115,116,117,118,119,
76 120,121,122, 91, 92, 93, 94, 95,
77 96, 97, 98, 99,100,101,102,103,
78 104,105,106,107,108,109,110,111,
79 112,113,114,115,116,117,118,119,
80 120,121,122,123,124,125,126,127,
81 128,129,130,131,132,133,134,135,
82 136,137,138,139,140,141,142,143,
83 144,145,146,147,148,149,150,151,
84 152,153,154,155,156,157,158,159,
85 160,161,162,163,164,165,166,167,
86 168,169,170,171,172,173,174,175,
87 176,177,178,179,180,181,182,183,
88 184,185,186,187,188,189,190,191,
89 192,193,194,195,196,197,198,199,
90 200,201,202,203,204,205,206,207,
91 208,209,210,211,212,213,214,215,
92 216,217,218,219,220,221,222,223,
93 224,225,226,227,228,229,230,231,
94 232,233,234,235,236,237,238,239,
95 240,241,242,243,244,245,246,247,
96 248,249,250,251,252,253,254,255 };
97
Guido van Rossum50700601997-12-08 17:15:20 +000098/* This table is a case flipping table. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +000099
Guido van Rossum50700601997-12-08 17:15:20 +0000100unsigned char pcre_fcc[] = {
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000101 0, 1, 2, 3, 4, 5, 6, 7,
102 8, 9, 10, 11, 12, 13, 14, 15,
103 16, 17, 18, 19, 20, 21, 22, 23,
104 24, 25, 26, 27, 28, 29, 30, 31,
105 32, 33, 34, 35, 36, 37, 38, 39,
106 40, 41, 42, 43, 44, 45, 46, 47,
107 48, 49, 50, 51, 52, 53, 54, 55,
108 56, 57, 58, 59, 60, 61, 62, 63,
Guido van Rossum50700601997-12-08 17:15:20 +0000109 64, 97, 98, 99,100,101,102,103,
110 104,105,106,107,108,109,110,111,
111 112,113,114,115,116,117,118,119,
112 120,121,122, 91, 92, 93, 94, 95,
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000113 96, 65, 66, 67, 68, 69, 70, 71,
114 72, 73, 74, 75, 76, 77, 78, 79,
115 80, 81, 82, 83, 84, 85, 86, 87,
116 88, 89, 90,123,124,125,126,127,
117 128,129,130,131,132,133,134,135,
118 136,137,138,139,140,141,142,143,
119 144,145,146,147,148,149,150,151,
120 152,153,154,155,156,157,158,159,
121 160,161,162,163,164,165,166,167,
122 168,169,170,171,172,173,174,175,
123 176,177,178,179,180,181,182,183,
124 184,185,186,187,188,189,190,191,
125 192,193,194,195,196,197,198,199,
126 200,201,202,203,204,205,206,207,
127 208,209,210,211,212,213,214,215,
128 216,217,218,219,220,221,222,223,
129 224,225,226,227,228,229,230,231,
130 232,233,234,235,236,237,238,239,
131 240,241,242,243,244,245,246,247,
132 248,249,250,251,252,253,254,255 };
133
Guido van Rossum50700601997-12-08 17:15:20 +0000134/* This table contains bit maps for digits, letters, 'word' chars, and
135white space. Each map is 32 bytes long and the bits run from the least
136significant end of each byte. */
137
138unsigned char pcre_cbits[] = {
139 0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
140 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
141 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
142 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
143
144 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
145 0xfe,0xff,0xff,0x07,0xfe,0xff,0xff,0x07,
146 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
147 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
148
149 0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
150 0xfe,0xff,0xff,0x87,0xfe,0xff,0xff,0x07,
151 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
152 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
153
154 0x00,0x3e,0x00,0x00,0x01,0x00,0x00,0x00,
155 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
156 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
157 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 };
158
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000159/* This table identifies various classes of character by individual bits:
Guido van Rossum50700601997-12-08 17:15:20 +0000160 0x01 white space character
161 0x02 letter
162 0x04 decimal digit
163 0x08 hexadecimal digit
164 0x10 alphanumeric or '_'
165 0x80 regular expression metacharacter or binary zero
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000166*/
167
168unsigned char pcre_ctypes[] = {
169 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
170 0x00,0x01,0x01,0x01,0x01,0x01,0x00,0x00, /* 8- 15 */
171 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
172 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
173 0x01,0x00,0x00,0x00,0x80,0x00,0x00,0x00, /* - ' */
174 0x80,0x80,0x80,0x80,0x00,0x00,0x80,0x00, /* ( - / */
Guido van Rossum50700601997-12-08 17:15:20 +0000175 0x3c,0x3c,0x3c,0x3c,0x3c,0x3c,0x3c,0x3c, /* 0 - 7 */
176 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x80, /* 8 - ? */
177 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* @ - G */
178 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* H - O */
179 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* P - W */
180 0x12,0x12,0x12,0x80,0x00,0x00,0x80,0x10, /* X - _ */
181 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* ` - g */
182 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* h - o */
183 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* p - w */
184 0x12,0x12,0x12,0x80,0x80,0x00,0x00,0x00, /* x -127 */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000185 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
186 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
187 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
188 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
189 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
190 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
191 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
192 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
193 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
194 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
195 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
196 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
197 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
198 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
199 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
200 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
201
Guido van Rossum50700601997-12-08 17:15:20 +0000202/* End of chartables.c */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000203/*************************************************
204* Perl-Compatible Regular Expressions *
205*************************************************/
206
207/*
208This is a library of functions to support regular expressions whose syntax
209and semantics are as close as possible to those of the Perl 5 language. See
210the file Tech.Notes for some information on the internals.
211
212Written by: Philip Hazel <ph10@cam.ac.uk>
213
214 Copyright (c) 1997 University of Cambridge
215
216-----------------------------------------------------------------------------
217Permission is granted to anyone to use this software for any purpose on any
218computer system, and to redistribute it freely, subject to the following
219restrictions:
220
2211. This software is distributed in the hope that it will be useful,
222 but WITHOUT ANY WARRANTY; without even the implied warranty of
223 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
224
2252. The origin of this software must not be misrepresented, either by
226 explicit claim or by omission.
227
2283. Altered versions must be plainly marked as such, and must not be
229 misrepresented as being the original software.
230-----------------------------------------------------------------------------
231*/
232
233
234/* Include the internals header, which itself includes Standard C headers plus
235the external pcre header. */
236
237
238
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000239
240/*************************************************
241* Create bitmap of starting chars *
242*************************************************/
243
244/* This function scans a compiled unanchored expression and attempts to build a
245bitmap of the set of initial characters. If it can't, it returns FALSE. As time
246goes by, we may be able to get more clever at doing this.
247
248Arguments:
249 code points to an expression
250 start_bits points to a 32-byte table, initialized to 0
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000251
252Returns: TRUE if table built, FALSE otherwise
253*/
254
255static BOOL
Guido van Rossum58132c61997-12-17 00:24:13 +0000256set_start_bits(const uschar *code, uschar *start_bits)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000257{
Guido van Rossum50700601997-12-08 17:15:20 +0000258register int c;
259
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000260do
261 {
Guido van Rossum58132c61997-12-17 00:24:13 +0000262 const uschar *tcode = code + 3;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000263 BOOL try_next = TRUE;
264
265 while (try_next)
266 {
267 try_next = FALSE;
268
269 if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
270 {
Guido van Rossum50700601997-12-08 17:15:20 +0000271 if (!set_start_bits(tcode, start_bits)) return FALSE;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000272 }
273
274 else switch(*tcode)
275 {
276 default:
277 return FALSE;
278
279 /* BRAZERO does the bracket, but carries on. */
280
281 case OP_BRAZERO:
282 case OP_BRAMINZERO:
Guido van Rossum50700601997-12-08 17:15:20 +0000283 if (!set_start_bits(++tcode, start_bits)) return FALSE;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000284 do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
285 tcode += 3;
286 try_next = TRUE;
287 break;
288
289 /* Single-char * or ? sets the bit and tries the next item */
290
291 case OP_STAR:
292 case OP_MINSTAR:
293 case OP_QUERY:
294 case OP_MINQUERY:
Guido van Rossum50700601997-12-08 17:15:20 +0000295 start_bits[tcode[1]/8] |= (1 << (tcode[1]&7));
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000296 tcode += 2;
297 try_next = TRUE;
298 break;
299
300 /* Single-char upto sets the bit and tries the next */
301
302 case OP_UPTO:
303 case OP_MINUPTO:
Guido van Rossum50700601997-12-08 17:15:20 +0000304 start_bits[tcode[3]/8] |= (1 << (tcode[3]&7));
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000305 tcode += 4;
306 try_next = TRUE;
307 break;
308
309 /* At least one single char sets the bit and stops */
310
311 case OP_EXACT: /* Fall through */
312 tcode++;
313
314 case OP_CHARS: /* Fall through */
315 tcode++;
316
317 case OP_PLUS:
318 case OP_MINPLUS:
Guido van Rossum50700601997-12-08 17:15:20 +0000319 start_bits[tcode[1]/8] |= (1 << (tcode[1]&7));
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000320 break;
321
322 /* Single character type sets the bits and stops */
323
324 case OP_NOT_DIGIT:
Guido van Rossum50700601997-12-08 17:15:20 +0000325 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_digit];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000326 break;
327
328 case OP_DIGIT:
Guido van Rossum50700601997-12-08 17:15:20 +0000329 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_digit];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000330 break;
331
332 case OP_NOT_WHITESPACE:
Guido van Rossum50700601997-12-08 17:15:20 +0000333 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_space];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000334 break;
335
336 case OP_WHITESPACE:
Guido van Rossum50700601997-12-08 17:15:20 +0000337 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_space];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000338 break;
339
340 case OP_NOT_WORDCHAR:
Guido van Rossum50700601997-12-08 17:15:20 +0000341 for (c = 0; c < 32; c++)
342 start_bits[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000343 break;
344
345 case OP_WORDCHAR:
Guido van Rossum50700601997-12-08 17:15:20 +0000346 for (c = 0; c < 32; c++)
347 start_bits[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000348 break;
349
350 /* One or more character type fudges the pointer and restarts, knowing
351 it will hit a single character type and stop there. */
352
353 case OP_TYPEPLUS:
354 case OP_TYPEMINPLUS:
355 tcode++;
356 try_next = TRUE;
357 break;
358
359 case OP_TYPEEXACT:
360 tcode += 3;
361 try_next = TRUE;
362 break;
363
364 /* Zero or more repeats of character types set the bits and then
365 try again. */
366
367 case OP_TYPEUPTO:
368 case OP_TYPEMINUPTO:
369 tcode += 2; /* Fall through */
370
371 case OP_TYPESTAR:
372 case OP_TYPEMINSTAR:
373 case OP_TYPEQUERY:
374 case OP_TYPEMINQUERY:
375 switch(tcode[1])
376 {
377 case OP_NOT_DIGIT:
Guido van Rossum50700601997-12-08 17:15:20 +0000378 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_digit];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000379 break;
380
381 case OP_DIGIT:
Guido van Rossum50700601997-12-08 17:15:20 +0000382 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_digit];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000383 break;
384
385 case OP_NOT_WHITESPACE:
Guido van Rossum50700601997-12-08 17:15:20 +0000386 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_space];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000387 break;
388
389 case OP_WHITESPACE:
Guido van Rossum50700601997-12-08 17:15:20 +0000390 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_space];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000391 break;
392
393 case OP_NOT_WORDCHAR:
Guido van Rossum50700601997-12-08 17:15:20 +0000394 for (c = 0; c < 32; c++)
395 start_bits[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000396 break;
397
398 case OP_WORDCHAR:
Guido van Rossum50700601997-12-08 17:15:20 +0000399 for (c = 0; c < 32; c++)
400 start_bits[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000401 break;
402 }
403
404 tcode += 2;
405 try_next = TRUE;
406 break;
407
408 /* Character class: set the bits and either carry on or not,
409 according to the repeat count. */
410
411 case OP_CLASS:
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000412 {
Guido van Rossum50700601997-12-08 17:15:20 +0000413 tcode++;
414 for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
415 tcode += 32;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000416 switch (*tcode)
417 {
418 case OP_CRSTAR:
419 case OP_CRMINSTAR:
420 case OP_CRQUERY:
421 case OP_CRMINQUERY:
422 tcode++;
423 try_next = TRUE;
424 break;
425
426 case OP_CRRANGE:
427 case OP_CRMINRANGE:
428 if (((tcode[1] << 8) + tcode[2]) == 0)
429 {
430 tcode += 5;
431 try_next = TRUE;
432 }
433 break;
434 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000435 }
436 break; /* End of class handling */
437
438 } /* End of switch */
439 } /* End of try_next loop */
440
441 code += (code[1] << 8) + code[2]; /* Advance to next branch */
442 }
443while (*code == OP_ALT);
444return TRUE;
445}
446
447
448
449/*************************************************
450* Study a compiled expression *
451*************************************************/
452
453/* This function is handed a compiled expression that it must study to produce
454information that will speed up the matching. It returns a pcre_extra block
455which then gets handed back to pcre_exec().
456
457Arguments:
458 re points to the compiled expression
459 options contains option bits
460 errorptr points to where to place error messages;
461 set NULL unless error
462
463Returns: pointer to a pcre_extra block,
464 NULL on error or if no optimization possible
465*/
466
467pcre_extra *
Guido van Rossum58132c61997-12-17 00:24:13 +0000468pcre_study(const pcre *external_re, int options, const char **errorptr)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000469{
470BOOL caseless;
471uschar start_bits[32];
472real_pcre_extra *extra;
Guido van Rossum58132c61997-12-17 00:24:13 +0000473const real_pcre *re = (const real_pcre *)external_re;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000474
475*errorptr = NULL;
476
477if (re == NULL || re->magic_number != MAGIC_NUMBER)
478 {
479 *errorptr = "argument is not a compiled regular expression";
480 return NULL;
481 }
482
483if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
484 {
485 *errorptr = "unknown or incorrect option bit(s) set";
486 return NULL;
487 }
488
Guido van Rossum50700601997-12-08 17:15:20 +0000489/* Caseless can either be from the compiled regex or from options. */
490
491caseless = ((re->options | options) & PCRE_CASELESS) != 0;
492
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000493/* For an anchored pattern, or an unchored pattern that has a first char, or a
494multiline pattern that matches only at "line starts", no further processing at
495present. */
496
497if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
498 return NULL;
499
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000500/* See if we can find a fixed set of initial characters for the pattern. */
501
Guido van Rossum50700601997-12-08 17:15:20 +0000502memset(start_bits, 0, 32 * sizeof(uschar));
503if (!set_start_bits(re->code, start_bits)) return NULL;
504
505/* If this studying is caseless, scan the created bit map and duplicate the
506bits for any letters. */
507
508if (caseless)
509 {
510 register int c;
511 for (c = 0; c < 256; c++)
512 {
513 if ((start_bits[c/8] & (1 << (c&7))) != 0 &&
514 (pcre_ctypes[c] & ctype_letter) != 0)
515 {
516 int d = pcre_fcc[c];
517 start_bits[d/8] |= (1 << (d&7));
518 }
519 }
520 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000521
522/* Get an "extra" block and put the information therein. */
523
524extra = (real_pcre_extra *)(pcre_malloc)(sizeof(real_pcre_extra));
525
526if (extra == NULL)
527 {
528 *errorptr = "failed to get memory";
529 return NULL;
530 }
Guido van Rossum50700601997-12-08 17:15:20 +0000531
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000532extra->options = PCRE_STUDY_MAPPED | (caseless? PCRE_STUDY_CASELESS : 0);
Guido van Rossum50700601997-12-08 17:15:20 +0000533memcpy(extra->start_bits, start_bits, sizeof(start_bits));
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000534
535return (pcre_extra *)extra;
536}
537
Guido van Rossum50700601997-12-08 17:15:20 +0000538/* End of study.c */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000539/*************************************************
540* Perl-Compatible Regular Expressions *
541*************************************************/
542
543/*
544This is a library of functions to support regular expressions whose syntax
545and semantics are as close as possible to those of the Perl 5 language. See
546the file Tech.Notes for some information on the internals.
547
548Written by: Philip Hazel <ph10@cam.ac.uk>
549
550 Copyright (c) 1997 University of Cambridge
551
552-----------------------------------------------------------------------------
553Permission is granted to anyone to use this software for any purpose on any
554computer system, and to redistribute it freely, subject to the following
555restrictions:
556
5571. This software is distributed in the hope that it will be useful,
558 but WITHOUT ANY WARRANTY; without even the implied warranty of
559 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
560
5612. The origin of this software must not be misrepresented, either by
562 explicit claim or by omission.
563
5643. Altered versions must be plainly marked as such, and must not be
565 misrepresented as being the original software.
566-----------------------------------------------------------------------------
567*/
568
569
570/* Define DEBUG to get debugging output on stdout. */
571
572/* #define DEBUG */
573
574
575/* Include the internals header, which itself includes Standard C headers plus
576the external pcre header. */
577
578
Guido van Rossum50700601997-12-08 17:15:20 +0000579
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000580#ifndef Py_eval_input
581/* For Python 1.4, graminit.h has to be explicitly included */
582#define Py_eval_input eval_input
Guido van Rossum50700601997-12-08 17:15:20 +0000583
584#endif /* FOR_PYTHON */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000585
586/* Min and max values for the common repeats; for the maxima, 0 => infinity */
587
588static char rep_min[] = { 0, 0, 1, 1, 0, 0 };
589static char rep_max[] = { 0, 0, 0, 0, 1, 1 };
590
591/* Text forms of OP_ values and things, for debugging */
592
593#ifdef DEBUG
Guido van Rossum58132c61997-12-17 00:24:13 +0000594static const char *OP_names[] = {
595 "End", "\\A", "\\B", "\\b", "\\D", "\\d",
Guido van Rossum50700601997-12-08 17:15:20 +0000596 "\\S", "\\s", "\\W", "\\w", "Cut", "\\Z",
597 "localized \\B", "localized \\b", "localized \\W", "localized \\w",
598 "^", "$", "Any", "chars",
599 "not",
600 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000601 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
602 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
603 "*", "*?", "+", "+?", "?", "??", "{", "{",
Guido van Rossum50700601997-12-08 17:15:20 +0000604 "class", "classL", "Ref",
605 "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", "Once",
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000606 "Brazero", "Braminzero", "Bra"
607};
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000608#endif
609
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000610/* Table for handling escaped characters in the range '0'-'z'. Positive returns
611are simple data values; negative values are for special things like \d and so
612on. Zero means further processing is needed (for things like \x), or the escape
613is invalid. */
614
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000615static short int escapes[] = {
616 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
617 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
618 '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
619 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
620 0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
621 0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
622 '`', 7, -ESC_b, 0, -ESC_d, 0, '\f', 0, /* ` - g */
623 0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
624 0, 0, '\r', -ESC_s, '\t', 0, '\v', -ESC_w, /* p - w */
625 0, 0, 0 /* x - z */
626};
627
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000628/* Definition to allow mutual recursion */
629
Guido van Rossum58132c61997-12-17 00:24:13 +0000630static BOOL compile_regex(int, int *, uschar **, const uschar **,
631 const char **, PyObject *);
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000632
633/* Structure for passing "static" information around between the functions
634doing the matching, so that they are thread-safe. */
635
636typedef struct match_data {
637 int errorcode; /* As it says */
638 int *offset_vector; /* Offset vector */
639 int offset_end; /* One past the end */
640 BOOL offset_overflow; /* Set if too many extractions */
641 BOOL caseless; /* Case-independent flag */
Guido van Rossum50700601997-12-08 17:15:20 +0000642 BOOL runtime_caseless; /* Caseless forced at run time */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000643 BOOL multiline; /* Multiline flag */
Guido van Rossum50700601997-12-08 17:15:20 +0000644 BOOL notbol; /* NOTBOL flag */
645 BOOL noteol; /* NOTEOL flag */
646 BOOL dotall; /* Dot matches any char */
647 BOOL endonly; /* Dollar not before final \n */
Guido van Rossum58132c61997-12-17 00:24:13 +0000648 const uschar *start_subject; /* Start of the subject string */
649 const uschar *end_subject; /* End of the subject string */
Guido van Rossum50700601997-12-08 17:15:20 +0000650 jmp_buf fail_env; /* Environment for longjump() break out */
Guido van Rossum58132c61997-12-17 00:24:13 +0000651 const uschar *end_match_ptr; /* Subject position at end match */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000652 int end_offset_top; /* Highwater mark at end of match */
Guido van Rossum50700601997-12-08 17:15:20 +0000653 jmp_buf error_env; /* For longjmp() if an error occurs deep inside a
654 matching operation */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000655 int length; /* Length of the allocated stacks */
656 int point; /* Point to add next item pushed onto stacks */
657 /* Pointers to the 6 stacks */
658 int *off_num, *offset_top, *r1, *r2;
Guido van Rossum58132c61997-12-17 00:24:13 +0000659 const uschar **eptr, **ecode;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000660} match_data;
661
662
663
664/*************************************************
Guido van Rossum50700601997-12-08 17:15:20 +0000665* Global variables *
666*************************************************/
667
668/* PCRE is thread-clean and doesn't use any global variables in the normal
669sense. However, it calls memory allocation and free functions via the two
670indirections below, which are can be changed by the caller, but are shared
671between all threads. */
672
673void *(*pcre_malloc)(size_t) = malloc;
674void (*pcre_free)(void *) = free;
675
676
677
678
679/*************************************************
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000680* Return version string *
681*************************************************/
682
Guido van Rossum58132c61997-12-17 00:24:13 +0000683const char *
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000684pcre_version(void)
685{
686return PCRE_VERSION;
687}
688
689
690
691
692/*************************************************
693* Return info about a compiled pattern *
694*************************************************/
695
696/* This function picks potentially useful data out of the private
697structure.
698
699Arguments:
700 external_re points to compiled code
701 optptr where to pass back the options
702 first_char where to pass back the first character,
703 or -1 if multiline and all branches start ^,
704 or -2 otherwise
705
706Returns: number of identifying extraction brackets
707 or negative values on error
708*/
709
710int
Guido van Rossum50700601997-12-08 17:15:20 +0000711pcre_info(const pcre *external_re, int *optptr, int *first_char)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000712{
Guido van Rossum58132c61997-12-17 00:24:13 +0000713const real_pcre *re = (real_pcre *)external_re;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000714if (re == NULL) return PCRE_ERROR_NULL;
715if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
716if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);
717if (first_char != NULL)
718 *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
719 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
720return re->top_bracket;
721}
722
723
724
725
726#ifdef DEBUG
727/*************************************************
728* Debugging function to print chars *
729*************************************************/
730
731/* Print a sequence of chars in printable format, stopping at the end of the
732subject if the requested.
733
734Arguments:
735 p points to characters
736 length number to print
737 is_subject TRUE if printing from within md->start_subject
738 md pointer to matching data block, if is_subject is TRUE
739
740Returns: nothing
741*/
742
743static pchars(uschar *p, int length, BOOL is_subject, match_data *md)
744{
745int c;
746if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
747while (length-- > 0)
748 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
749}
750#endif
751
752
753
754
755/*************************************************
756* Check subpattern for empty operand *
757*************************************************/
758
759/* This function checks a bracketed subpattern to see if any of the paths
760through it could match an empty string. This is used to diagnose an error if
761such a subpattern is followed by a quantifier with an unlimited upper bound.
762
763Argument:
764 code points to the opening bracket
765
766Returns: TRUE or FALSE
767*/
768
769static BOOL
770could_be_empty(uschar *code)
771{
772do {
773 uschar *cc = code + 3;
774
775 /* Scan along the opcodes for this branch; as soon as we find something
776 that matches a non-empty string, break out and advance to test the next
777 branch. If we get to the end of the branch, return TRUE for the whole
778 sub-expression. */
779
780 for (;;)
781 {
782 /* Test an embedded subpattern; if it could not be empty, break the
783 loop. Otherwise carry on in the branch. */
784
Guido van Rossum50700601997-12-08 17:15:20 +0000785 if ((int)(*cc) >= OP_BRA || (int)(*cc) == OP_ONCE)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000786 {
787 if (!could_be_empty(cc)) break;
788 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
789 cc += 3;
790 }
791
792 else switch (*cc)
793 {
794 /* Reached end of a branch: the subpattern may match the empty string */
795
796 case OP_ALT:
797 case OP_KET:
798 case OP_KETRMAX:
799 case OP_KETRMIN:
800 return TRUE;
801
802 /* Skip over assertive subpatterns */
803
804 case OP_ASSERT:
805 case OP_ASSERT_NOT:
806 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
807 cc += 3;
808 break;
809
810 /* Skip over things that don't match chars */
811
812 case OP_SOD:
813 case OP_EOD:
814 case OP_CIRC:
815 case OP_DOLL:
816 case OP_BRAZERO:
817 case OP_BRAMINZERO:
818 case OP_NOT_WORD_BOUNDARY:
819 case OP_WORD_BOUNDARY:
Guido van Rossum50700601997-12-08 17:15:20 +0000820 case OP_NOT_WORD_BOUNDARY_L:
821 case OP_WORD_BOUNDARY_L:
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000822 cc++;
823 break;
824
825 /* Skip over simple repeats with zero lower bound */
826
827 case OP_STAR:
828 case OP_MINSTAR:
829 case OP_QUERY:
830 case OP_MINQUERY:
Guido van Rossum50700601997-12-08 17:15:20 +0000831 case OP_NOTSTAR:
832 case OP_NOTMINSTAR:
833 case OP_NOTQUERY:
834 case OP_NOTMINQUERY:
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000835 case OP_TYPESTAR:
836 case OP_TYPEMINSTAR:
837 case OP_TYPEQUERY:
838 case OP_TYPEMINQUERY:
839 cc += 2;
840 break;
841
842 /* Skip over UPTOs (lower bound is zero) */
843
844 case OP_UPTO:
845 case OP_MINUPTO:
846 case OP_TYPEUPTO:
847 case OP_TYPEMINUPTO:
848 cc += 4;
849 break;
850
851 /* Check a class or a back reference for a zero minimum */
852
853 case OP_CLASS:
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000854 case OP_REF:
Guido van Rossum50700601997-12-08 17:15:20 +0000855 case OP_CLASS_L:
856 switch(*cc)
857 {
858 case (OP_REF): cc += 2; break;
859 case (OP_CLASS): cc += 1+32; break;
860 case (OP_CLASS_L): cc += 1+1+32; break;
861 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000862
863 switch (*cc)
864 {
865 case OP_CRSTAR:
866 case OP_CRMINSTAR:
867 case OP_CRQUERY:
868 case OP_CRMINQUERY:
869 cc++;
870 break;
871
872 case OP_CRRANGE:
873 case OP_CRMINRANGE:
874 if ((cc[1] << 8) + cc[2] != 0) goto NEXT_BRANCH;
875 cc += 3;
876 break;
877
878 default:
879 goto NEXT_BRANCH;
880 }
881 break;
882
883 /* Anything else matches at least one character */
884
885 default:
886 goto NEXT_BRANCH;
887 }
888 }
889
890 NEXT_BRANCH:
891 code += (code[1] << 8) + code[2];
892 }
893while (*code == OP_ALT);
894
895/* No branches match the empty string */
896
897return FALSE;
898}
899
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000900/* Determine the length of a group ID in an expression like
901 (?P<foo_123>...)
902Arguments:
903 ptr pattern position pointer (say that 3 times fast)
904 finalchar the character that will mark the end of the ID
905 errorptr points to the pointer to the error message
906*/
907
908static int
Guido van Rossum58132c61997-12-17 00:24:13 +0000909get_group_id(const uschar *ptr, char finalchar, const char **errorptr)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000910{
Guido van Rossum58132c61997-12-17 00:24:13 +0000911 const uschar *start = ptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000912
913 /* If the first character is not in \w, or is in \w but is a digit,
914 report an error */
915 if (!(pcre_ctypes[*ptr] & ctype_word) ||
916 (pcre_ctypes[*ptr++] & ctype_digit))
917 {
918 *errorptr = "(?P identifier must start with a letter or underscore";
919 return 0;
920 }
921
922 /* Increment ptr until we either hit a null byte, the desired
923 final character, or a non-word character */
924 for(; (*ptr != 0) && (*ptr != finalchar) &&
925 (pcre_ctypes[*ptr] & ctype_word); ptr++)
926 {
Guido van Rossumc3861071997-10-08 02:07:40 +0000927 /* Empty loop body */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000928 }
929 if (*ptr==finalchar)
930 return ptr-start;
931 if (*ptr==0)
932 {
933 *errorptr = "unterminated (?P identifier";
934 return 0;
935 }
936 *errorptr = "illegal character in (?P identifier";
937 return 0;
938}
939
940/*************************************************
941* Handle escapes *
942*************************************************/
943
944/* This function is called when a \ has been encountered. It either returns a
945positive value for a simple escape such as \n, or a negative value which
946encodes one of the more complicated things such as \d. On entry, ptr is
947pointing at the \. On exit, it is on the final character of the escape
948sequence.
949
950Arguments:
951 ptrptr points to the pattern position pointer
952 errorptr points to the pointer to the error message
Guido van Rossum50700601997-12-08 17:15:20 +0000953 bracount number of previous extracting brackets
954 options the options bits
955 isclass TRUE if inside a character class
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000956
957Returns: zero or positive => a data character
958 negative => a special escape sequence
959 on error, errorptr is set
960*/
961
962static int
Guido van Rossum58132c61997-12-17 00:24:13 +0000963check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
964 int options, BOOL isclass)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000965{
Guido van Rossum58132c61997-12-17 00:24:13 +0000966const uschar *ptr = *ptrptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000967int c = *(++ptr) & 255; /* Ensure > 0 on signed-char systems */
968int i;
969
Guido van Rossum50700601997-12-08 17:15:20 +0000970if (c == 0) *errorptr = ERR1;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000971
972/* Digits or letters may have special meaning; all others are literals. */
973
974else if (c < '0' || c > 'z') {}
975
976/* Do an initial lookup in a table. A non-zero result is something that can be
977returned immediately. Otherwise further processing may be required. */
978
979else if ((i = escapes[c - '0']) != 0) c = i;
980
981/* Escapes that need further processing, or are illegal. */
982
Guido van Rossum50700601997-12-08 17:15:20 +0000983else
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000984 {
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000985
Guido van Rossum50700601997-12-08 17:15:20 +0000986 switch (c)
987 {
988 /* The handling of escape sequences consisting of a string of digits
989 starting with one that is not zero is not straightforward. By experiment,
990 the way Perl works seems to be as follows:
991
992 Outside a character class, the digits are read as a decimal number. If the
993 number is less than 10, or if there are that many previous extracting
994 left brackets, then it is a back reference. Otherwise, up to three octal
995 digits are read to form an escaped byte. Thus \123 is likely to be octal
996 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
997 value is greater than 377, the least significant 8 bits are taken. Inside a
998 character class, \ followed by a digit is always an octal number. */
999
1000 case '1': case '2': case '3': case '4': case '5':
1001 case '6': case '7': case '8': case '9':
1002
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001003 {
1004 /* PYTHON: Try to compute an octal value for a character */
1005 for(c=0, i=0; c!=-1 && ptr[i]!=0 && i<3; i++)
1006 {
1007 if (( pcre_ctypes[ ptr[i] ] & ctype_odigit) != 0)
1008 c = c * 8 + ptr[i]-'0';
1009 else
1010 c = -1; /* Non-octal character */
1011 }
1012 /* Aha! There were 3 octal digits, so it must be a character */
1013 if (c != -1 && i == 3)
1014 {
1015 ptr += i-1;
1016 break;
1017 }
1018 c = ptr[0]; /* Restore the first character after the \ */
1019 c -= '0'; i = 1;
1020 while (i<2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0)
1021 {
1022 c = c * 10 + ptr[1] - '0';
1023 ptr++; i++;
1024 }
1025 if (c > 255 - ESC_REF) *errorptr = "back reference too big";
1026 c = -(ESC_REF + c);
1027 }
1028 break;
1029
Guido van Rossum50700601997-12-08 17:15:20 +00001030 /* \0 always starts an octal number, but we may drop through to here with a
1031 larger first octal digit */
1032
1033 case '0':
1034 c -= '0';
1035 while(i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0 &&
1036 ptr[1] != '8' && ptr[1] != '9')
1037 c = c * 8 + *(++ptr) - '0';
1038 break;
1039
1040 /* Special escapes not starting with a digit are straightforward */
1041
1042 case 'x':
1043 c = 0;
1044 while ( (pcre_ctypes[ptr[1]] & ctype_xdigit) != 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001045 {
Guido van Rossum50700601997-12-08 17:15:20 +00001046 ptr++;
1047 c = c * 16 + pcre_lcc[*ptr] -
1048 (((pcre_ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
1049 c &= 255;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001050 }
1051 break;
1052
1053
Guido van Rossum50700601997-12-08 17:15:20 +00001054 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1055 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
1056 for Perl compatibility, it is a literal. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001057
Guido van Rossum50700601997-12-08 17:15:20 +00001058 default:
1059 if ((options & PCRE_EXTRA) != 0) switch(c)
1060 {
1061 case 'X':
1062 c = -ESC_X; /* This could be a lookup if it ever got into Perl */
1063 break;
1064
1065 default:
1066 *errorptr = ERR3;
1067 break;
1068 }
1069 break;
1070 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001071 }
1072
1073*ptrptr = ptr;
1074return c;
1075}
1076
1077
1078
1079/*************************************************
Guido van Rossum50700601997-12-08 17:15:20 +00001080* Check for counted repeat *
1081*************************************************/
1082
1083/* This function is called when a '{' is encountered in a place where it might
1084start a quantifier. It looks ahead to see if it really is a quantifier or not.
1085It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
1086where the ddds are digits.
1087
1088Arguments:
1089 p pointer to the first char after '{'
1090
1091Returns: TRUE or FALSE
1092*/
1093
1094static BOOL
Guido van Rossum58132c61997-12-17 00:24:13 +00001095is_counted_repeat(const uschar *p)
Guido van Rossum50700601997-12-08 17:15:20 +00001096{
1097if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
1098while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
1099if (*p == '}') return TRUE;
1100
1101if (*p++ != ',') return FALSE;
1102if (*p == '}') return TRUE;
1103
1104if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
1105while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
1106return (*p == '}');
1107}
1108
1109
1110
1111/*************************************************
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001112* Read repeat counts *
1113*************************************************/
1114
Guido van Rossum50700601997-12-08 17:15:20 +00001115/* Read an item of the form {n,m} and return the values. This is called only
1116after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1117so the syntax is guaranteed to be correct, but we need to check the values.
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001118
1119Arguments:
1120 p pointer to first char after '{'
1121 minp pointer to int for min
1122 maxp pointer to int for max
1123 returned as -1 if no max
1124 errorptr points to pointer to error message
1125
1126Returns: pointer to '}' on success;
1127 current ptr on error, with errorptr set
1128*/
1129
Guido van Rossum58132c61997-12-17 00:24:13 +00001130static const uschar *
1131read_repeat_counts(const uschar *p, int *minp, int *maxp, const char **errorptr)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001132{
1133int min = 0;
1134int max = -1;
1135
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001136while ((pcre_ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
1137
1138if (*p == '}') max = min; else
1139 {
Guido van Rossum50700601997-12-08 17:15:20 +00001140 if (*(++p) != '}')
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001141 {
1142 max = 0;
1143 while((pcre_ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001144 if (max < min)
1145 {
Guido van Rossum50700601997-12-08 17:15:20 +00001146 *errorptr = ERR4;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001147 return p;
1148 }
1149 }
1150 }
1151
1152/* Do paranoid checks, then fill in the required variables, and pass back the
1153pointer to the terminating '}'. */
1154
Guido van Rossum50700601997-12-08 17:15:20 +00001155if (min > 65535 || max > 65535)
1156 *errorptr = ERR5;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001157else
1158 {
1159 *minp = min;
1160 *maxp = max;
1161 }
1162return p;
1163}
1164
1165
1166
1167/*************************************************
1168* Compile one branch *
1169*************************************************/
1170
1171/* Scan the pattern, compiling it into the code vector.
1172
1173Arguments:
Guido van Rossum50700601997-12-08 17:15:20 +00001174 options the option bits
1175 bracket points to number of brackets used
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001176 code points to the pointer to the current code point
1177 ptrptr points to the current pattern pointer
1178 errorptr points to pointer to error message
1179
1180Returns: TRUE on success
1181 FALSE, with *errorptr set on error
1182*/
1183
1184static BOOL
Guido van Rossum50700601997-12-08 17:15:20 +00001185compile_branch(int options, int *brackets, uschar **codeptr,
Guido van Rossum58132c61997-12-17 00:24:13 +00001186 const uschar **ptrptr, const char **errorptr, PyObject *dictionary)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001187{
1188int repeat_type, op_type;
1189int repeat_min, repeat_max;
1190int bravalue, length;
1191register int c;
1192register uschar *code = *codeptr;
Guido van Rossum58132c61997-12-17 00:24:13 +00001193const uschar *ptr = *ptrptr;
1194const uschar *oldptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001195uschar *previous = NULL;
Guido van Rossum50700601997-12-08 17:15:20 +00001196uschar class[32];
1197uschar *class_flag; /* Pointer to the single-byte flag for OP_CLASS_L */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001198
1199/* Switch on next character until the end of the branch */
1200
1201for (;; ptr++)
1202 {
Guido van Rossum50700601997-12-08 17:15:20 +00001203 BOOL negate_class;
1204 int class_charcount;
1205 int class_lastchar;
1206
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001207 c = *ptr;
Guido van Rossum50700601997-12-08 17:15:20 +00001208 if ((options & PCRE_EXTENDED) != 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001209 {
1210 if ((pcre_ctypes[c] & ctype_space) != 0) continue;
1211 if (c == '#')
1212 {
1213 while ((c = *(++ptr)) != 0 && c != '\n');
1214 continue;
1215 }
1216 }
1217
1218 switch(c)
1219 {
1220 /* The branch terminates at end of string, |, or ). */
1221
1222 case 0:
1223 case '|':
1224 case ')':
1225 *codeptr = code;
1226 *ptrptr = ptr;
1227 return TRUE;
1228
1229 /* Handle single-character metacharacters */
1230
1231 case '^':
1232 previous = NULL;
1233 *code++ = OP_CIRC;
1234 break;
1235
1236 case '$':
1237 previous = NULL;
1238 *code++ = OP_DOLL;
1239 break;
1240
1241 case '.':
1242 previous = code;
1243 *code++ = OP_ANY;
1244 break;
1245
Guido van Rossum50700601997-12-08 17:15:20 +00001246 /* Character classes. These always build a 32-byte bitmap of the permitted
1247 characters, except in the special case where there is only one character.
1248 For negated classes, we build the map as usual, then invert it at the end.
1249 */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001250
1251 case '[':
Guido van Rossum50700601997-12-08 17:15:20 +00001252 previous = code;
1253 if (options & PCRE_LOCALE)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001254 {
Guido van Rossum50700601997-12-08 17:15:20 +00001255 *code++ = OP_CLASS_L;
1256 /* Set the flag for localized classes (like \w) to 0 */
1257 class_flag = code;
1258 *class_flag = 0;
1259 }
1260 else
1261 {
1262 *code++ = OP_CLASS;
1263 class_flag = NULL;
1264 }
1265
1266 /* If the first character is '^', set the negation flag */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001267
Guido van Rossum50700601997-12-08 17:15:20 +00001268 if ((c = *(++ptr)) == '^')
1269 {
1270 negate_class = TRUE;
1271 c = *(++ptr);
1272 }
1273 else negate_class = FALSE;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001274
Guido van Rossum50700601997-12-08 17:15:20 +00001275 /* Keep a count of chars so that we can optimize the case of just a single
1276 character. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001277
Guido van Rossum50700601997-12-08 17:15:20 +00001278 class_charcount = 0;
1279 class_lastchar = -1;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001280
Guido van Rossum50700601997-12-08 17:15:20 +00001281 /* Initialize the 32-char bit map to all zeros. We have to build the
1282 map in a temporary bit of store, in case the class contains only 1
1283 character, because in that case the compiled code doesn't use the
1284 bit map. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001285
Guido van Rossum50700601997-12-08 17:15:20 +00001286 memset(class, 0, 32 * sizeof(uschar));
1287
1288 /* Process characters until ] is reached. By writing this as a "do" it
1289 means that an initial ] is taken as a data character. */
1290
1291 do
1292 {
1293 if (c == 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001294 {
Guido van Rossum50700601997-12-08 17:15:20 +00001295 *errorptr = ERR6;
1296 goto FAILED;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001297 }
1298
Guido van Rossum50700601997-12-08 17:15:20 +00001299 /* Backslash may introduce a single character, or it may introduce one
1300 of the specials, which just set a flag. Escaped items are checked for
1301 validity in the pre-compiling pass. The sequence \b is a special case.
Guido van Rossum58132c61997-12-17 00:24:13 +00001302 Inside a class (and only there) it is treated as backspace. Elsewhere
Guido van Rossum50700601997-12-08 17:15:20 +00001303 it marks a word boundary. Other escapes have preset maps ready to
1304 or into the one we are building. We assume they have more than one
1305 character in them, so set class_count bigger than one. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001306
Guido van Rossum50700601997-12-08 17:15:20 +00001307 if (c == '\\')
1308 {
1309 c = check_escape(&ptr, errorptr, *brackets, options, TRUE);
1310 if (-c == ESC_b) c = '\b';
1311 else if (c < 0)
1312 {
1313 class_charcount = 10;
1314 switch (-c)
1315 {
1316 case ESC_d:
Guido van Rossum50700601997-12-08 17:15:20 +00001317 {
1318 for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_digit];
1319 }
1320 continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001321
Guido van Rossum50700601997-12-08 17:15:20 +00001322 case ESC_D:
Guido van Rossum50700601997-12-08 17:15:20 +00001323 {
1324 for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_digit];
1325 }
1326 continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001327
Guido van Rossum50700601997-12-08 17:15:20 +00001328 case ESC_w:
1329 if (options & PCRE_LOCALE)
1330 {
1331 *class_flag |= 1;
1332 }
1333 else
1334 {
1335 for (c = 0; c < 32; c++)
1336 class[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
1337 }
1338 continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001339
Guido van Rossum50700601997-12-08 17:15:20 +00001340 case ESC_W:
1341 if (options & PCRE_LOCALE)
1342 {
1343 *class_flag |= 2;
1344 }
1345 else
1346 {
1347 for (c = 0; c < 32; c++)
1348 class[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
1349 }
1350 continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001351
Guido van Rossum50700601997-12-08 17:15:20 +00001352 case ESC_s:
Guido van Rossum50700601997-12-08 17:15:20 +00001353 {
1354 for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_space];
1355 }
1356 continue;
1357
1358 case ESC_S:
Guido van Rossum50700601997-12-08 17:15:20 +00001359 {
1360 for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_space];
1361 }
1362 continue;
1363
1364 default:
1365 *errorptr = ERR7;
1366 goto FAILED;
1367 }
1368 }
1369 /* Fall through if single character */
1370 }
1371
1372 /* A single character may be followed by '-' to form a range. However,
1373 Perl does not permit ']' to be the end of the range. A '-' character
1374 here is treated as a literal. */
1375
1376 if (ptr[1] == '-' && ptr[2] != ']')
1377 {
1378 int d;
1379 ptr += 2;
1380 d = *ptr;
1381
1382 if (d == 0)
1383 {
1384 *errorptr = ERR6;
1385 goto FAILED;
1386 }
1387
1388 /* The second part of a range can be a single-character escape, but
1389 not any of the other escapes. */
1390
1391 if (d == '\\')
1392 {
1393 d = check_escape(&ptr, errorptr, *brackets, options, TRUE);
1394 if (d < 0)
1395 {
1396 if (d == -ESC_b) d = '\b'; else
1397 {
1398 *errorptr = ERR7;
1399 goto FAILED;
1400 }
1401 }
1402 }
1403
1404 if (d < c)
1405 {
1406 *errorptr = ERR8;
1407 goto FAILED;
1408 }
1409
1410 for (; c <= d; c++)
1411 {
1412 class[c/8] |= (1 << (c&7));
1413 if ((options & PCRE_CASELESS) != 0)
1414 {
1415 int uc = pcre_fcc[c]; /* flip case */
1416 class[uc/8] |= (1 << (uc&7));
1417 }
1418 class_charcount++; /* in case a one-char range */
1419 class_lastchar = c;
1420 }
1421 continue; /* Go get the next char in the class */
1422 }
1423
1424 /* Handle a lone single character - we can get here for a normal
1425 non-escape char, or after \ that introduces a single character. */
1426
1427 class [c/8] |= (1 << (c&7));
1428 if ((options & PCRE_CASELESS) != 0)
1429 {
1430 c = pcre_fcc[c]; /* flip case */
1431 class[c/8] |= (1 << (c&7));
1432 }
1433 class_charcount++;
1434 class_lastchar = c;
1435 }
1436
1437 /* Loop until ']' reached; the check for end of string happens inside the
1438 loop. This "while" is the end of the "do" above. */
1439
1440 while ((c = *(++ptr)) != ']');
1441
1442 /* If class_charcount is 1 and class_lastchar is not negative, we saw
1443 precisely one character. This doesn't need the whole 32-byte bit map.
1444 We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
1445 it's negative. */
1446
1447 if (class_charcount == 1 && class_lastchar >= 0)
1448 {
1449 if (negate_class)
1450 {
1451 code[-1] = OP_NOT;
1452 }
1453 else
1454 {
1455 code[-1] = OP_CHARS;
1456 *code++ = 1;
1457 }
1458 *code++ = class_lastchar;
1459 }
1460
1461 /* Otherwise, negate the 32-byte map if necessary, and copy it into
1462 the code vector. */
1463
1464 else
1465 {
1466 /* If this is a localized opcode, bump the code pointer up */
1467 if (class_flag) code++;
1468 if (negate_class)
1469 {
1470 if (class_flag) *class_flag = (*class_flag) ^ 63;
1471 for (c = 0; c < 32; c++) code[c] = ~class[c];
1472 }
1473 else
1474 memcpy(code, class, 32);
1475 code += 32;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001476 }
1477 break;
1478
1479 /* Various kinds of repeat */
1480
1481 case '{':
Guido van Rossum50700601997-12-08 17:15:20 +00001482 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001483 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
1484 if (*errorptr != NULL) goto FAILED;
1485 goto REPEAT;
1486
1487 case '*':
1488 repeat_min = 0;
1489 repeat_max = -1;
1490 goto REPEAT;
1491
1492 case '+':
1493 repeat_min = 1;
1494 repeat_max = -1;
1495 goto REPEAT;
1496
1497 case '?':
1498 repeat_min = 0;
1499 repeat_max = 1;
1500
1501 REPEAT:
1502 if (previous == NULL)
1503 {
Guido van Rossum50700601997-12-08 17:15:20 +00001504 *errorptr = ERR9;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001505 goto FAILED;
1506 }
1507
1508 /* If the next character is '?' this is a minimizing repeat. Advance to the
1509 next character. */
1510
1511 if (ptr[1] == '?') { repeat_type = 1; ptr++; } else repeat_type = 0;
1512
Guido van Rossum50700601997-12-08 17:15:20 +00001513 /* If the maximum is zero then the minimum must also be zero; Perl allows
1514 this case, so we do too - by simply omitting the item altogether. */
1515
1516 if (repeat_max == 0) code = previous;
1517
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001518 /* If previous was a string of characters, chop off the last one and use it
1519 as the subject of the repeat. If there was only one character, we can
1520 abolish the previous item altogether. */
1521
Guido van Rossum50700601997-12-08 17:15:20 +00001522 else if (*previous == OP_CHARS)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001523 {
1524 int len = previous[1];
1525 if (len == 1)
1526 {
1527 c = previous[2];
1528 code = previous;
1529 }
1530 else
1531 {
1532 c = previous[len+1];
1533 previous[1]--;
1534 code--;
1535 }
1536 op_type = 0; /* Use single-char op codes */
1537 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
1538 }
1539
Guido van Rossum50700601997-12-08 17:15:20 +00001540 /* If previous was a single negated character ([^a] or similar), we use
1541 one of the special opcodes, replacing it. The code is shared with single-
1542 character repeats by adding a suitable offset into repeat_type. */
1543
1544 else if ((int)*previous == OP_NOT)
1545 {
1546 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
1547 c = previous[1];
1548 code = previous;
1549 goto OUTPUT_SINGLE_REPEAT;
1550 }
1551
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001552 /* If previous was a character type match (\d or similar), abolish it and
1553 create a suitable repeat item. The code is shared with single-character
1554 repeats by adding a suitable offset into repeat_type. */
1555
Guido van Rossum50700601997-12-08 17:15:20 +00001556 else if ((int)*previous < OP_CIRC || *previous == OP_ANY)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001557 {
1558 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
1559 c = *previous;
1560 code = previous;
1561
1562 OUTPUT_SINGLE_REPEAT:
1563 repeat_type += op_type; /* Combine both values for many cases */
1564
1565 /* A minimum of zero is handled either as the special case * or ?, or as
1566 an UPTO, with the maximum given. */
1567
1568 if (repeat_min == 0)
1569 {
1570 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
1571 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
1572 else
1573 {
1574 *code++ = OP_UPTO + repeat_type;
1575 *code++ = repeat_max >> 8;
1576 *code++ = (repeat_max & 255);
1577 }
1578 }
1579
1580 /* The case {1,} is handled as the special case + */
1581
1582 else if (repeat_min == 1 && repeat_max == -1)
1583 *code++ = OP_PLUS + repeat_type;
1584
1585 /* The case {n,n} is just an EXACT, while the general case {n,m} is
1586 handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
1587
1588 else
1589 {
1590 if (repeat_min != 1)
1591 {
1592 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
1593 *code++ = repeat_min >> 8;
1594 *code++ = (repeat_min & 255);
1595 }
1596
1597 /* If the mininum is 1 and the previous item was a character string,
1598 we either have to put back the item that got cancelled if the string
1599 length was 1, or add the character back onto the end of a longer
1600 string. For a character type nothing need be done; it will just get put
1601 back naturally. */
1602
1603 else if (*previous == OP_CHARS)
1604 {
1605 if (code == previous) code += 2; else previous[1]++;
1606 }
1607
1608 /* Insert an UPTO if the max is greater than the min. */
1609
1610 if (repeat_max != repeat_min)
1611 {
1612 *code++ = c;
1613 repeat_max -= repeat_min;
1614 *code++ = OP_UPTO + repeat_type;
1615 *code++ = repeat_max >> 8;
1616 *code++ = (repeat_max & 255);
1617 }
1618 }
1619
1620 /* The character or character type itself comes last in all cases. */
1621
1622 *code++ = c;
1623 }
1624
1625 /* If previous was a character class or a back reference, we put the repeat
1626 stuff after it. */
1627
Guido van Rossum50700601997-12-08 17:15:20 +00001628 else if (*previous == OP_CLASS || *previous==OP_CLASS_L || *previous == OP_REF)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001629 {
1630 if (repeat_min == 0 && repeat_max == -1)
1631 *code++ = OP_CRSTAR + repeat_type;
1632 else if (repeat_min == 1 && repeat_max == -1)
1633 *code++ = OP_CRPLUS + repeat_type;
1634 else if (repeat_min == 0 && repeat_max == 1)
1635 *code++ = OP_CRQUERY + repeat_type;
1636 else
1637 {
1638 *code++ = OP_CRRANGE + repeat_type;
1639 *code++ = repeat_min >> 8;
1640 *code++ = repeat_min & 255;
1641 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
1642 *code++ = repeat_max >> 8;
1643 *code++ = repeat_max & 255;
1644 }
1645 }
1646
1647 /* If previous was a bracket group, we may have to replicate it in certain
1648 cases. If the maximum repeat count is unlimited, check that the bracket
1649 group cannot match the empty string, and diagnose an error if it can. */
1650
1651 else if ((int)*previous >= OP_BRA)
1652 {
1653 int i;
1654 int length = code - previous;
1655
1656 if (repeat_max == -1 && could_be_empty(previous))
1657 {
Guido van Rossum50700601997-12-08 17:15:20 +00001658 *errorptr = ERR10;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001659 goto FAILED;
1660 }
1661
1662 /* If the minimum is greater than zero, and the maximum is unlimited or
1663 equal to the minimum, the first copy remains where it is, and is
1664 replicated up to the minimum number of times. This case includes the +
1665 repeat, but of course no replication is needed in that case. */
1666
1667 if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))
1668 {
1669 for (i = 1; i < repeat_min; i++)
1670 {
1671 memcpy(code, previous, length);
1672 code += length;
1673 }
1674 }
1675
1676 /* If the minimum is zero, stick BRAZERO in front of the first copy.
1677 Then, if there is a fixed upper limit, replicated up to that many times,
1678 sticking BRAZERO in front of all the optional ones. */
1679
1680 else
1681 {
1682 if (repeat_min == 0)
1683 {
1684 memmove(previous+1, previous, length);
1685 code++;
1686 *previous++ = OP_BRAZERO + repeat_type;
1687 }
1688
1689 for (i = 1; i < repeat_min; i++)
1690 {
1691 memcpy(code, previous, length);
1692 code += length;
1693 }
1694
1695 for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)
1696 {
1697 *code++ = OP_BRAZERO + repeat_type;
1698 memcpy(code, previous, length);
1699 code += length;
1700 }
1701 }
1702
1703 /* If the maximum is unlimited, set a repeater in the final copy. */
1704
1705 if (repeat_max == -1) code[-3] = OP_KETRMAX + repeat_type;
1706 }
1707
1708 /* Else there's some kind of shambles */
1709
1710 else
1711 {
Guido van Rossum50700601997-12-08 17:15:20 +00001712 *errorptr = ERR11;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001713 goto FAILED;
1714 }
1715
1716 /* In all case we no longer have a previous item. */
1717
1718 previous = NULL;
1719 break;
1720
1721
1722 /* Start of nested bracket sub-expression, or comment or lookahead.
1723 First deal with special things that can come after a bracket; all are
1724 introduced by ?, and the appearance of any of them means that this is not a
1725 referencing group. They were checked for validity in the first pass over
1726 the string, so we don't have to check for syntax errors here. */
1727
1728 case '(':
1729 previous = code; /* Only real brackets can be repeated */
1730 if (*(++ptr) == '?')
1731 {
1732 bravalue = OP_BRA;
1733
1734 switch (*(++ptr))
1735 {
1736 case '#':
1737 case 'i':
Guido van Rossumbd49ac41997-12-10 23:05:53 +00001738 case 'L':
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001739 case 'm':
1740 case 's':
1741 case 'x':
1742 ptr++;
1743 while (*ptr != ')') ptr++;
1744 previous = NULL;
1745 continue;
1746
1747 case ':': /* Non-extracting bracket */
1748 ptr++;
1749 break;
1750
1751 case '=': /* Assertions can't be repeated */
1752 bravalue = OP_ASSERT;
1753 ptr++;
1754 previous = NULL;
1755 break;
1756
1757 case '!':
1758 bravalue = OP_ASSERT_NOT;
1759 ptr++;
1760 previous = NULL;
1761 break;
1762
1763 case ('P'):
1764 ptr++;
1765 if (*ptr=='<')
1766 {
1767 /* (?P<groupname>...) */
1768 int idlen;
1769 PyObject *string, *intobj;
1770
1771 ptr++;
1772 idlen = get_group_id(ptr, '>', errorptr);
1773 if (*errorptr) {
1774 goto FAILED;
1775 }
Guido van Rossum57ba4f31997-12-02 20:40:28 +00001776 string = PyString_FromStringAndSize((char*)ptr, idlen);
Guido van Rossum50700601997-12-08 17:15:20 +00001777 intobj = PyInt_FromLong( brackets[0] + 1 );
Guido van Rossum58132c61997-12-17 00:24:13 +00001778 if (intobj == NULL || string == NULL)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001779 {
1780 Py_XDECREF(string);
1781 Py_XDECREF(intobj);
1782 *errorptr = "exception raised";
1783 goto FAILED;
1784 }
1785 PyDict_SetItem(dictionary, string, intobj);
Guido van Rossum58132c61997-12-17 00:24:13 +00001786 Py_DECREF(string); Py_DECREF(intobj); /* XXX DECREF commented out! */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001787 ptr += idlen+1; /* Point to rest of expression */
1788 goto do_grouping_bracket;
1789 }
1790 if (*ptr=='=')
1791 {
1792 /* (?P=groupname) */
1793 int idlen, refnum;
1794 PyObject *string, *intobj;
1795
1796 ptr++;
1797 idlen = get_group_id(ptr, ')', errorptr);
1798 if (*errorptr) {
1799 goto FAILED;
1800 }
Guido van Rossum50700601997-12-08 17:15:20 +00001801 string = PyString_FromStringAndSize((char *)ptr, idlen);
Guido van Rossumc3861071997-10-08 02:07:40 +00001802 if (string==NULL) {
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001803 *errorptr = "exception raised";
1804 goto FAILED;
1805 }
1806 intobj = PyDict_GetItem(dictionary, string);
1807 if (intobj==NULL) {
Guido van Rossumc3861071997-10-08 02:07:40 +00001808 Py_DECREF(string);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001809 *errorptr = "?P= group identifier isn't defined";
1810 goto FAILED;
1811 }
1812
1813 refnum = PyInt_AsLong(intobj);
Guido van Rossum1eadb411997-12-15 17:33:24 +00001814 Py_DECREF(string);
Guido van Rossum58132c61997-12-17 00:24:13 +00001815 /* The caller doesn't own the reference to the value
1816 returned from PyDict_GetItem, so intobj is not
1817 DECREF'ed. */
1818
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001819 *code++ = OP_REF;
1820 *code++ = refnum;
1821 /* The continue will cause the top-level for() loop to
1822 be resumed, so ptr will be immediately incremented.
1823 Therefore, the following line adds just idlen, not
1824 idlen+1 */
1825 ptr += idlen;
1826 continue;
1827 }
1828 /* The character after ?P is neither < nor =, so
1829 report an error. Add more Python-extensions here. */
1830 *errorptr="unknown after (?P";
1831 goto FAILED;
1832 break;
Guido van Rossum50700601997-12-08 17:15:20 +00001833
1834 case '>': /* "Match once" brackets */
1835 if ((options & PCRE_EXTRA) != 0) /* Not yet standard */
1836 {
1837 bravalue = OP_ONCE;
1838 ptr++;
1839 previous = NULL;
1840 break;
1841 }
1842 /* Else fall through */
1843
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001844 default:
Guido van Rossum50700601997-12-08 17:15:20 +00001845 *errorptr = ERR12;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001846 goto FAILED;
1847 }
1848 }
1849
1850 /* Else we have a referencing group */
1851
1852 else
1853 {
1854 do_grouping_bracket:
Guido van Rossum50700601997-12-08 17:15:20 +00001855 if (++(*brackets) > EXTRACT_MAX)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001856 {
Guido van Rossum50700601997-12-08 17:15:20 +00001857 *errorptr = ERR13;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001858 goto FAILED;
1859 }
Guido van Rossum50700601997-12-08 17:15:20 +00001860 bravalue = OP_BRA + *brackets;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001861 }
1862
1863 /* Process nested bracketed re; at end pointer is on the bracket. We copy
1864 code into a non-register variable in order to be able to pass its address
1865 because some compilers complain otherwise. */
1866
1867 *code = bravalue;
1868 {
1869 uschar *mcode = code;
Guido van Rossum50700601997-12-08 17:15:20 +00001870 if (!compile_regex(options, brackets, &mcode, &ptr, errorptr, dictionary))
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001871 goto FAILED;
1872 code = mcode;
1873 }
1874
1875 if (*ptr != ')')
1876 {
Guido van Rossum50700601997-12-08 17:15:20 +00001877 *errorptr = ERR14;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001878 goto FAILED;
1879 }
1880 break;
1881
1882 /* Check \ for being a real metacharacter; if not, fall through and handle
1883 it as a data character at the start of a string. Escape items are checked
1884 for validity in the pre-compiling pass. */
1885
1886 case '\\':
1887 oldptr = ptr;
Guido van Rossum50700601997-12-08 17:15:20 +00001888 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001889
1890 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1891 are arranged to be the negation of the corresponding OP_values. For the
1892 back references, the values are ESC_REF plus the reference number. Only
1893 back references and those types that consume a character may be repeated.
1894 We can test for values between ESC_b and ESC_Z for the latter; this may
1895 have to change if any new ones are ever created. */
1896
1897 if (c < 0)
1898 {
1899 if (-c >= ESC_REF)
1900 {
Guido van Rossum50700601997-12-08 17:15:20 +00001901 int refnum = -c - ESC_REF;
1902 if (*brackets < refnum)
1903 {
1904 *errorptr = ERR15;
1905 goto FAILED;
1906 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001907 previous = code;
1908 *code++ = OP_REF;
1909 *code++ = refnum;
1910 }
1911 else
1912 {
Guido van Rossum50700601997-12-08 17:15:20 +00001913 previous = (-c > ESC_b && -c < ESC_X)? code : NULL;
1914 if ( (options & PCRE_LOCALE) != 0)
1915 {
1916 switch (c)
1917 {
1918 case (-ESC_b): c = -OP_WORD_BOUNDARY_L; break;
1919 case (-ESC_B): c = -OP_NOT_WORD_BOUNDARY_L; break;
1920 case (-ESC_w): c = -OP_WORDCHAR_L; break;
1921 case (-ESC_W): c = -OP_NOT_WORDCHAR_L; break;
1922 }
1923 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001924 *code++ = -c;
1925 }
1926 continue;
1927 }
1928
Guido van Rossum58132c61997-12-17 00:24:13 +00001929 /* Data character: Reset and fall through */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001930
1931 ptr = oldptr;
1932 c = '\\';
1933
1934 /* Handle a run of data characters until a metacharacter is encountered.
1935 The first character is guaranteed not to be whitespace or # when the
1936 extended flag is set. */
1937
Guido van Rossum50700601997-12-08 17:15:20 +00001938 NORMAL_CHAR:
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001939 default:
1940 previous = code;
1941 *code = OP_CHARS;
1942 code += 2;
1943 length = 0;
1944
1945 do
1946 {
Guido van Rossum50700601997-12-08 17:15:20 +00001947 if ((options & PCRE_EXTENDED) != 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001948 {
1949 if ((pcre_ctypes[c] & ctype_space) != 0) continue;
1950 if (c == '#')
1951 {
1952 while ((c = *(++ptr)) != 0 && c != '\n');
1953 if (c == 0) break;
1954 continue;
1955 }
1956 }
1957
1958 /* Backslash may introduce a data char or a metacharacter. Escaped items
1959 are checked for validity in the pre-compiling pass. Stop the string
1960 before a metaitem. */
1961
1962 if (c == '\\')
1963 {
1964 oldptr = ptr;
Guido van Rossum50700601997-12-08 17:15:20 +00001965 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001966 if (c < 0) { ptr = oldptr; break; }
1967 }
1968
1969 /* Ordinary character or single-char escape */
1970
1971 *code++ = c;
1972 length++;
1973 }
1974
1975 /* This "while" is the end of the "do" above. */
1976
1977 while (length < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
1978
1979 /* Compute the length and set it in the data vector, and advance to
1980 the next state. */
1981
1982 previous[1] = length;
1983 ptr--;
1984 break;
1985 }
1986 } /* end of big loop */
1987
1988/* Control never reaches here by falling through, only by a goto for all the
1989error states. Pass back the position in the pattern so that it can be displayed
1990to the user for diagnosing the error. */
1991
1992FAILED:
1993*ptrptr = ptr;
1994return FALSE;
1995}
1996
1997
1998
1999
2000/*************************************************
2001* Compile sequence of alternatives *
2002*************************************************/
2003
2004/* On entry, ptr is pointing past the bracket character, but on return
2005it points to the closing bracket, or vertical bar, or end of string.
2006The code variable is pointing at the byte into which the BRA operator has been
2007stored.
2008
2009Argument:
Guido van Rossum50700601997-12-08 17:15:20 +00002010 options the option bits
2011 brackets -> int containing the number of extracting brackets used
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002012 codeptr -> the address of the current code pointer
2013 ptrptr -> the address of the current pattern pointer
2014 errorptr -> pointer to error message
2015
2016Returns: TRUE on success
2017*/
2018
2019static BOOL
Guido van Rossum50700601997-12-08 17:15:20 +00002020compile_regex(int options, int *brackets, uschar **codeptr,
Guido van Rossum58132c61997-12-17 00:24:13 +00002021 const uschar **ptrptr, const char **errorptr, PyObject *dictionary)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002022{
Guido van Rossum58132c61997-12-17 00:24:13 +00002023const uschar *ptr = *ptrptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002024uschar *code = *codeptr;
2025uschar *start_bracket = code;
2026
2027for (;;)
2028 {
2029 int length;
2030 uschar *last_branch = code;
2031
2032 code += 3;
Guido van Rossum50700601997-12-08 17:15:20 +00002033 if (!compile_branch(options, brackets, &code, &ptr, errorptr, dictionary))
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002034 {
2035 *ptrptr = ptr;
2036 return FALSE;
2037 }
2038
2039 /* Fill in the length of the last branch */
2040
2041 length = code - last_branch;
2042 last_branch[1] = length >> 8;
2043 last_branch[2] = length & 255;
2044
2045 /* Reached end of expression, either ')' or end of pattern. Insert a
2046 terminating ket and the length of the whole bracketed item, and return,
2047 leaving the pointer at the terminating char. */
2048
2049 if (*ptr != '|')
2050 {
2051 length = code - start_bracket;
2052 *code++ = OP_KET;
2053 *code++ = length >> 8;
2054 *code++ = length & 255;
2055 *codeptr = code;
2056 *ptrptr = ptr;
2057 return TRUE;
2058 }
2059
2060 /* Another branch follows; insert an "or" node and advance the pointer. */
2061
2062 *code = OP_ALT;
2063 ptr++;
2064 }
2065/* Control never reaches here */
2066}
2067
2068
2069
2070/*************************************************
2071* Check for anchored expression *
2072*************************************************/
2073
2074/* Try to find out if this is an anchored regular expression. Consider each
2075alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
2076all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
2077it's anchored. However, if this is a multiline pattern, then only OP_SOD
2078counts, since OP_CIRC can match in the middle.
2079
2080A branch is also implicitly anchored if it starts with .* because that will try
2081the rest of the pattern at all possible matching points, so there is no point
2082trying them again.
2083
2084Argument: points to start of expression (the bracket)
2085Returns: TRUE or FALSE
2086*/
2087
2088static BOOL
Guido van Rossum58132c61997-12-17 00:24:13 +00002089is_anchored(register const uschar *code, BOOL multiline)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002090{
2091do {
2092 int op = (int)code[3];
Guido van Rossum50700601997-12-08 17:15:20 +00002093 if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002094 { if (!is_anchored(code+3, multiline)) return FALSE; }
2095 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
2096 { if (code[4] != OP_ANY) return FALSE; }
2097 else if (op != OP_SOD && (multiline || op != OP_CIRC)) return FALSE;
2098 code += (code[1] << 8) + code[2];
2099 }
2100while (*code == OP_ALT);
2101return TRUE;
2102}
2103
2104
2105
2106/*************************************************
2107* Check for start with \n line expression *
2108*************************************************/
2109
2110/* This is called for multiline expressions to try to find out if every branch
2111starts with ^ so that "first char" processing can be done to speed things up.
2112
2113Argument: points to start of expression (the bracket)
2114Returns: TRUE or FALSE
2115*/
2116
2117static BOOL
Guido van Rossum58132c61997-12-17 00:24:13 +00002118is_startline(const uschar *code)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002119{
2120do {
2121 if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
2122 { if (!is_startline(code+3)) return FALSE; }
2123 else if (code[3] != OP_CIRC) return FALSE;
2124 code += (code[1] << 8) + code[2];
2125 }
2126while (*code == OP_ALT);
2127return TRUE;
2128}
2129
2130
2131
2132/*************************************************
2133* Check for fixed first char *
2134*************************************************/
2135
2136/* Try to find out if there is a fixed first character. This is called for
2137unanchored expressions, as it speeds up their processing quite considerably.
2138Consider each alternative branch. If they all start with the same char, or with
2139a bracket all of whose alternatives start with the same char (recurse ad lib),
2140then we return that char, otherwise -1.
2141
2142Argument: points to start of expression (the bracket)
2143Returns: -1 or the fixed first char
2144*/
2145
2146static int
2147find_firstchar(uschar *code)
2148{
2149register int c = -1;
2150do
2151 {
2152 register int charoffset = 4;
2153
2154 if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
2155 {
2156 register int d;
2157 if ((d = find_firstchar(code+3)) < 0) return -1;
2158 if (c < 0) c = d; else if (c != d) return -1;
2159 }
2160
2161 else switch(code[3])
2162 {
2163 default:
2164 return -1;
2165
2166 case OP_EXACT: /* Fall through */
2167 charoffset++;
2168
2169 case OP_CHARS: /* Fall through */
2170 charoffset++;
2171
2172 case OP_PLUS:
2173 case OP_MINPLUS:
2174 if (c < 0) c = code[charoffset]; else if (c != code[charoffset]) return -1;
2175 break;
2176 }
2177 code += (code[1] << 8) + code[2];
2178 }
2179while (*code == OP_ALT);
2180return c;
2181}
2182
2183
2184
2185/*************************************************
2186* Compile a Regular Expression *
2187*************************************************/
2188
2189/* This function takes a string and returns a pointer to a block of store
2190holding a compiled version of the expression.
2191
2192Arguments:
2193 pattern the regular expression
2194 options various option bits
2195 errorptr pointer to pointer to error text
2196 erroroffset ptr offset in pattern where error was detected
2197
2198Returns: pointer to compiled data block, or NULL on error,
2199 with errorptr and erroroffset set
2200*/
2201
2202pcre *
Guido van Rossum58132c61997-12-17 00:24:13 +00002203pcre_compile(const char *pattern, int options, const char **errorptr,
Guido van Rossum50700601997-12-08 17:15:20 +00002204 int *erroroffset, PyObject *dictionary)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002205{
2206real_pcre *re;
2207int spaces = 0;
2208int length = 3; /* For initial BRA plus length */
2209int runlength;
2210int c, size;
Guido van Rossum50700601997-12-08 17:15:20 +00002211int bracount = 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002212int brastack[200];
Guido van Rossum50700601997-12-08 17:15:20 +00002213int top_backref = 0;
Guido van Rossum58132c61997-12-17 00:24:13 +00002214unsigned int brastackptr = 0;
2215uschar *code;
2216const uschar *ptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002217
2218#ifdef DEBUG
2219uschar *code_base, *code_end;
2220#endif
2221
Guido van Rossum50700601997-12-08 17:15:20 +00002222/* We can't pass back an error message if errorptr is NULL; I guess the best we
2223can do is just return NULL. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002224
Guido van Rossum50700601997-12-08 17:15:20 +00002225if (errorptr == NULL) return NULL;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002226*errorptr = NULL;
Guido van Rossum50700601997-12-08 17:15:20 +00002227
2228/* However, we can give a message for this error */
2229
2230if (erroroffset == NULL)
2231 {
2232 *errorptr = ERR16;
2233 return NULL;
2234 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002235*erroroffset = 0;
2236
2237if ((options & ~PUBLIC_OPTIONS) != 0)
2238 {
Guido van Rossum50700601997-12-08 17:15:20 +00002239 *errorptr = ERR17;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002240 return NULL;
2241 }
2242
2243#ifdef DEBUG
2244printf("------------------------------------------------------------------\n");
2245printf("%s\n", pattern);
2246#endif
2247
2248/* The first thing to do is to make a pass over the pattern to compute the
2249amount of store required to hold the compiled code. This does not have to be
2250perfect as long as errors are overestimates. At the same time we can detect any
2251internal flag settings. Make an attempt to correct for any counted white space
2252if an "extended" flag setting appears late in the pattern. We can't be so
2253clever for #-comments. */
2254
Guido van Rossum58132c61997-12-17 00:24:13 +00002255ptr = (const uschar *)(pattern - 1);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002256while ((c = *(++ptr)) != 0)
2257 {
Guido van Rossum50700601997-12-08 17:15:20 +00002258 int min, max;
2259 int class_charcount;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002260
2261 if ((pcre_ctypes[c] & ctype_space) != 0)
2262 {
Guido van Rossum50700601997-12-08 17:15:20 +00002263 if ((options & PCRE_EXTENDED) != 0) continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002264 spaces++;
2265 }
2266
Guido van Rossum50700601997-12-08 17:15:20 +00002267 if (c == '#' && (options & PCRE_EXTENDED) != 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002268 {
2269 while ((c = *(++ptr)) != 0 && c != '\n');
2270 continue;
2271 }
2272
2273 switch(c)
2274 {
2275 /* A backslashed item may be an escaped "normal" character or a
2276 character type. For a "normal" character, put the pointers and
2277 character back so that tests for whitespace etc. in the input
2278 are done correctly. */
2279
2280 case '\\':
2281 {
Guido van Rossum58132c61997-12-17 00:24:13 +00002282 const uschar *save_ptr = ptr;
Guido van Rossum50700601997-12-08 17:15:20 +00002283 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002284 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2285 if (c >= 0)
2286 {
2287 ptr = save_ptr;
2288 c = '\\';
2289 goto NORMAL_CHAR;
2290 }
2291 }
2292 length++;
2293
2294 /* A back reference needs an additional char, plus either one or 5
Guido van Rossum50700601997-12-08 17:15:20 +00002295 bytes for a repeat. We also need to keep the value of the highest
2296 back reference. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002297
2298 if (c <= -ESC_REF)
2299 {
Guido van Rossum50700601997-12-08 17:15:20 +00002300 int refnum = -c - ESC_REF;
2301 if (refnum > top_backref) top_backref = refnum;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002302 length++; /* For single back reference */
Guido van Rossum50700601997-12-08 17:15:20 +00002303 if (ptr[1] == '{' && is_counted_repeat(ptr+2))
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002304 {
2305 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
2306 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2307 if ((min == 0 && (max == 1 || max == -1)) ||
2308 (min == 1 && max == -1))
2309 length++;
2310 else length += 5;
2311 if (ptr[1] == '?') ptr++;
2312 }
2313 }
2314 continue;
2315
2316 case '^':
2317 case '.':
2318 case '$':
2319 case '*': /* These repeats won't be after brackets; */
2320 case '+': /* those are handled separately */
2321 case '?':
2322 length++;
2323 continue;
2324
2325 /* This covers the cases of repeats after a single char, metachar, class,
2326 or back reference. */
2327
2328 case '{':
Guido van Rossum50700601997-12-08 17:15:20 +00002329 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002330 ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);
2331 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2332 if ((min == 0 && (max == 1 || max == -1)) ||
2333 (min == 1 && max == -1))
2334 length++;
2335 else
2336 {
2337 length--; /* Uncount the original char or metachar */
2338 if (min == 1) length++; else if (min > 0) length += 4;
2339 if (max > 0) length += 4; else length += 2;
2340 }
2341 if (ptr[1] == '?') ptr++;
2342 continue;
2343
2344 /* An alternation contains an offset to the next branch or ket. */
2345 case '|':
2346 length += 3;
2347 continue;
2348
Guido van Rossum50700601997-12-08 17:15:20 +00002349 /* A character class uses 33 characters. Don't worry about character types
2350 that aren't allowed in classes - they'll get picked up during the compile.
2351 A character class that contains only one character uses 2 or 3 bytes,
2352 depending on whether it is negated or not. Notice this where we can. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002353
2354 case '[':
Guido van Rossum50700601997-12-08 17:15:20 +00002355 class_charcount = 0;
2356 if (*(++ptr) == '^') ptr++;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002357 do
2358 {
Guido van Rossum50700601997-12-08 17:15:20 +00002359 if (*ptr == '\\')
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002360 {
Guido van Rossum50700601997-12-08 17:15:20 +00002361 int c = check_escape(&ptr, errorptr, bracount, options, TRUE);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002362 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
Guido van Rossum50700601997-12-08 17:15:20 +00002363 if (-c == ESC_b) class_charcount++; else class_charcount = 10;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002364 }
Guido van Rossum50700601997-12-08 17:15:20 +00002365 else class_charcount++;
2366 ptr++;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002367 }
2368 while (*ptr != 0 && *ptr != ']');
2369
Guido van Rossum50700601997-12-08 17:15:20 +00002370 /* Repeats for negated single chars are handled by the general code */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002371
Guido van Rossum50700601997-12-08 17:15:20 +00002372 if (class_charcount == 1) length += 3; else
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002373 {
Guido van Rossum50700601997-12-08 17:15:20 +00002374 length += 33;
2375 if (options & PCRE_LOCALE) length++; /* Add a byte for the localization flag */
2376
2377 /* A repeat needs either 1 or 5 bytes. */
2378
2379 if (ptr[1] == '{' && is_counted_repeat(ptr+2))
2380 {
2381 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
2382 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2383 if ((min == 0 && (max == 1 || max == -1)) ||
2384 (min == 1 && max == -1))
2385 length++;
2386 else length += 5;
2387 if (ptr[1] == '?') ptr++;
2388 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002389 }
2390 continue;
2391
2392 /* Brackets may be genuine groups or special things */
2393
2394 case '(':
2395
2396 /* Handle special forms of bracket, which all start (? */
2397
2398 if (ptr[1] == '?') switch (c = ptr[2])
2399 {
2400 /* Skip over comments entirely */
2401 case '#':
2402 ptr += 3;
2403 while (*ptr != 0 && *ptr != ')') ptr++;
2404 if (*ptr == 0)
2405 {
Guido van Rossum50700601997-12-08 17:15:20 +00002406 *errorptr = ERR18;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002407 goto PCRE_ERROR_RETURN;
2408 }
2409 continue;
2410
2411 /* Non-referencing groups and lookaheads just move the pointer on, and
Guido van Rossum50700601997-12-08 17:15:20 +00002412 then behave like a non-special bracket, except that they don't increment
2413 the count of extracting brackets. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002414
2415 case ':':
2416 case '=':
2417 case '!':
2418 ptr += 2;
2419 break;
2420
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002421 case ('P'):
2422 {
2423 int idlen;
2424 switch (*ptr++) {
2425 case ('<'):
2426 idlen = get_group_id(ptr++, '>', errorptr);
2427 if (*errorptr) goto PCRE_ERROR_RETURN;
2428 ptr += idlen+1;
2429 break;
2430 case ('='):
2431 idlen = get_group_id(ptr++, ')', errorptr);
2432 if (*errorptr) goto PCRE_ERROR_RETURN;
2433 ptr += idlen+1;
2434 length++;
2435 break;
2436 }
2437 }
2438 break;
2439
Guido van Rossum50700601997-12-08 17:15:20 +00002440 /* Ditto for the "once only" bracket, allowed only if the extra bit
2441 is set. */
2442
2443 case '>':
2444 if ((options & PCRE_EXTRA) != 0)
2445 {
2446 ptr += 2;
2447 break;
2448 }
2449 /* Else fall thourh */
2450
2451 /* Else loop setting valid options until ) is met. Anything else is an
2452 error. */
2453
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002454 default:
2455 ptr += 2;
2456 for (;; ptr++)
2457 {
2458 if ((c = *ptr) == 'i')
2459 {
2460 options |= PCRE_CASELESS;
2461 continue;
2462 }
Guido van Rossumbd49ac41997-12-10 23:05:53 +00002463 else if ((c = *ptr) == 'L')
Guido van Rossum50700601997-12-08 17:15:20 +00002464 {
2465 options |= PCRE_LOCALE;
2466 continue;
2467 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002468 else if ((c = *ptr) == 'm')
2469 {
2470 options |= PCRE_MULTILINE;
2471 continue;
2472 }
Guido van Rossum50700601997-12-08 17:15:20 +00002473 else if (c == 's')
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002474 {
2475 options |= PCRE_DOTALL;
2476 continue;
2477 }
2478 else if (c == 'x')
2479 {
2480 options |= PCRE_EXTENDED;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002481 length -= spaces; /* Already counted spaces */
2482 continue;
2483 }
2484 else if (c == ')') break;
2485
Guido van Rossum50700601997-12-08 17:15:20 +00002486 *errorptr = ERR12;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002487 goto PCRE_ERROR_RETURN;
2488 }
2489 continue; /* End of this bracket handling */
2490 }
2491
Guido van Rossum50700601997-12-08 17:15:20 +00002492 /* Extracting brackets must be counted so we can process escapes in a
2493 Perlish way. */
2494
2495 else bracount++;
2496
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002497 /* Non-special forms of bracket. Save length for computing whole length
2498 at end if there's a repeat that requires duplication of the group. */
2499
2500 if (brastackptr >= sizeof(brastack)/sizeof(int))
2501 {
Guido van Rossum50700601997-12-08 17:15:20 +00002502 *errorptr = ERR19;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002503 goto PCRE_ERROR_RETURN;
2504 }
2505
2506 brastack[brastackptr++] = length;
2507 length += 3;
2508 continue;
2509
2510 /* Handle ket. Look for subsequent max/min; for certain sets of values we
2511 have to replicate this bracket up to that many times. */
2512
2513 case ')':
2514 length += 3;
2515 {
2516 int min = 1;
2517 int max = 1;
2518 int duplength = length - brastack[--brastackptr];
2519
2520 /* Leave ptr at the final char; for read_repeat_counts this happens
2521 automatically; for the others we need an increment. */
2522
Guido van Rossum50700601997-12-08 17:15:20 +00002523 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002524 {
2525 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
2526 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2527 }
2528 else if (c == '*') { min = 0; max = -1; ptr++; }
2529 else if (c == '+') { max = -1; ptr++; }
2530 else if (c == '?') { min = 0; ptr++; }
2531
2532 /* If there is a minimum > 1 we have to replicate up to min-1 times; if
2533 there is a limited maximum we have to replicate up to max-1 times and
2534 allow for a BRAZERO item before each optional copy, as we also have to
2535 do before the first copy if the minimum is zero. */
2536
2537 if (min == 0) length++;
2538 else if (min > 1) length += (min - 1) * duplength;
2539 if (max > min) length += (max - min) * (duplength + 1);
2540 }
2541
2542 continue;
2543
2544 /* Non-special character. For a run of such characters the length required
2545 is the number of characters + 2, except that the maximum run length is 255.
2546 We won't get a skipped space or a non-data escape or the start of a #
2547 comment as the first character, so the length can't be zero. */
2548
2549 NORMAL_CHAR:
2550 default:
2551 length += 2;
2552 runlength = 0;
2553 do
2554 {
2555 if ((pcre_ctypes[c] & ctype_space) != 0)
2556 {
Guido van Rossum50700601997-12-08 17:15:20 +00002557 if ((options & PCRE_EXTENDED) != 0) continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002558 spaces++;
2559 }
2560
Guido van Rossum50700601997-12-08 17:15:20 +00002561 if (c == '#' && (options & PCRE_EXTENDED) != 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002562 {
2563 while ((c = *(++ptr)) != 0 && c != '\n');
2564 continue;
2565 }
2566
2567 /* Backslash may introduce a data char or a metacharacter; stop the
2568 string before the latter. */
2569
2570 if (c == '\\')
2571 {
Guido van Rossum58132c61997-12-17 00:24:13 +00002572 const uschar *saveptr = ptr;
Guido van Rossum50700601997-12-08 17:15:20 +00002573 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002574 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2575 if (c < 0) { ptr = saveptr; break; }
2576 }
2577
2578 /* Ordinary character or single-char escape */
2579
2580 runlength++;
2581 }
2582
2583 /* This "while" is the end of the "do" above. */
2584
2585 while (runlength < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
2586
2587 ptr--;
2588 length += runlength;
2589 continue;
2590 }
2591 }
2592
2593length += 4; /* For final KET and END */
2594
2595if (length > 65539)
2596 {
Guido van Rossum50700601997-12-08 17:15:20 +00002597 *errorptr = ERR20;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002598 return NULL;
2599 }
2600
2601/* Compute the size of data block needed and get it, either from malloc or
2602externally provided function. Put in the magic number and the options. */
2603
Guido van Rossum50700601997-12-08 17:15:20 +00002604size = length + offsetof(real_pcre, code);
2605re = (real_pcre *)(pcre_malloc)(size+50);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002606
2607if (re == NULL)
2608 {
Guido van Rossum50700601997-12-08 17:15:20 +00002609 *errorptr = ERR21;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002610 return NULL;
2611 }
2612
2613re->magic_number = MAGIC_NUMBER;
2614re->options = options;
2615
2616/* Set up a starting, non-extracting bracket, then compile the expression. On
2617error, *errorptr will be set non-NULL, so we don't need to look at the result
2618of the function here. */
2619
Guido van Rossum58132c61997-12-17 00:24:13 +00002620ptr = (const uschar *)pattern;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002621code = re->code;
2622*code = OP_BRA;
Guido van Rossum50700601997-12-08 17:15:20 +00002623bracount = 0;
2624(void)compile_regex(options, &bracount, &code, &ptr, errorptr, dictionary);
2625re->top_bracket = bracount;
2626re->top_backref = top_backref;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002627
2628/* If not reached end of pattern on success, there's an excess bracket. */
2629
Guido van Rossum50700601997-12-08 17:15:20 +00002630if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
2631
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002632/* Fill in the terminating state and check for disastrous overflow, but
2633if debugging, leave the test till after things are printed out. */
2634
2635*code++ = OP_END;
2636
Guido van Rossum50700601997-12-08 17:15:20 +00002637
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002638#ifndef DEBUG
Guido van Rossum50700601997-12-08 17:15:20 +00002639if (code - re->code > length) *errorptr = ERR23;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002640#endif
2641
2642/* Failed to compile */
2643
2644if (*errorptr != NULL)
2645 {
2646 (pcre_free)(re);
2647 PCRE_ERROR_RETURN:
Guido van Rossum58132c61997-12-17 00:24:13 +00002648 *erroroffset = ptr - (const uschar *)pattern;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002649 return NULL;
2650 }
2651
2652/* If the anchored option was not passed, set flag if we can determine that it
2653is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if
2654we can determine what the first character has to be, because that speeds up
2655unanchored matches no end. In the case of multiline matches, an alternative is
2656to set the PCRE_STARTLINE flag if all branches start with ^. */
2657
2658if ((options & PCRE_ANCHORED) == 0)
2659 {
2660 if (is_anchored(re->code, (options & PCRE_MULTILINE) != 0))
2661 re->options |= PCRE_ANCHORED;
2662 else
2663 {
2664 int c = find_firstchar(re->code);
2665 if (c >= 0)
2666 {
2667 re->first_char = c;
2668 re->options |= PCRE_FIRSTSET;
2669 }
2670 else if (is_startline(re->code))
2671 re->options |= PCRE_STARTLINE;
2672 }
2673 }
2674
2675/* Print out the compiled data for debugging */
2676
2677#ifdef DEBUG
2678
Guido van Rossum50700601997-12-08 17:15:20 +00002679printf("Length = %d top_bracket = %d top_backref=%d\n",
2680 length, re->top_bracket, re->top_backref);
2681
2682if (re->options != 0)
2683 {
2684 printf("%s%s%s%s%s%s%s\n",
2685 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
2686 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
2687 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
2688 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
2689 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
2690 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
2691 ((re->options & PCRE_EXTRA) != 0)? "extra " : "");
2692 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002693
2694if ((re->options & PCRE_FIRSTSET) != 0)
2695 {
2696 if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
2697 else printf("First char = \\x%02x\n", re->first_char);
2698 }
2699
2700code_end = code;
2701code_base = code = re->code;
2702
2703while (code < code_end)
2704 {
2705 int charlength;
2706
2707 printf("%3d ", code - code_base);
2708
2709 if (*code >= OP_BRA)
2710 {
2711 printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
2712 code += 2;
2713 }
2714
2715 else switch(*code)
2716 {
2717 case OP_CHARS:
2718 charlength = *(++code);
2719 printf("%3d ", charlength);
2720 while (charlength-- > 0)
2721 if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
2722 break;
2723
2724 case OP_KETRMAX:
2725 case OP_KETRMIN:
2726 case OP_ALT:
2727 case OP_KET:
2728 case OP_ASSERT:
2729 case OP_ASSERT_NOT:
Guido van Rossum50700601997-12-08 17:15:20 +00002730 case OP_ONCE:
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002731 printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2732 code += 2;
2733 break;
2734
2735 case OP_STAR:
2736 case OP_MINSTAR:
2737 case OP_PLUS:
2738 case OP_MINPLUS:
2739 case OP_QUERY:
2740 case OP_MINQUERY:
2741 case OP_TYPESTAR:
2742 case OP_TYPEMINSTAR:
2743 case OP_TYPEPLUS:
2744 case OP_TYPEMINPLUS:
2745 case OP_TYPEQUERY:
2746 case OP_TYPEMINQUERY:
2747 if (*code >= OP_TYPESTAR)
2748 printf(" %s", OP_names[code[1]]);
2749 else if (isprint(c = code[1])) printf(" %c", c);
2750 else printf(" \\x%02x", c);
2751 printf("%s", OP_names[*code++]);
2752 break;
2753
2754 case OP_EXACT:
2755 case OP_UPTO:
2756 case OP_MINUPTO:
2757 if (isprint(c = code[3])) printf(" %c{", c);
2758 else printf(" \\x%02x{", c);
2759 if (*code != OP_EXACT) printf(",");
2760 printf("%d}", (code[1] << 8) + code[2]);
2761 if (*code == OP_MINUPTO) printf("?");
2762 code += 3;
2763 break;
2764
2765 case OP_TYPEEXACT:
2766 case OP_TYPEUPTO:
2767 case OP_TYPEMINUPTO:
2768 printf(" %s{", OP_names[code[3]]);
2769 if (*code != OP_TYPEEXACT) printf(",");
2770 printf("%d}", (code[1] << 8) + code[2]);
2771 if (*code == OP_TYPEMINUPTO) printf("?");
2772 code += 3;
2773 break;
2774
Guido van Rossum50700601997-12-08 17:15:20 +00002775 case OP_NOT:
2776 if (isprint(c = *(++code))) printf(" [^%c]", c);
2777 else printf(" [^\\x%02x]", c);
2778 break;
2779
2780 case OP_NOTSTAR:
2781 case OP_NOTMINSTAR:
2782 case OP_NOTPLUS:
2783 case OP_NOTMINPLUS:
2784 case OP_NOTQUERY:
2785 case OP_NOTMINQUERY:
2786 if (isprint(c = code[1])) printf(" [^%c]", c);
2787 else printf(" [^\\x%02x]", c);
2788 printf("%s", OP_names[*code++]);
2789 break;
2790
2791 case OP_NOTEXACT:
2792 case OP_NOTUPTO:
2793 case OP_NOTMINUPTO:
2794 if (isprint(c = code[3])) printf(" [^%c]{", c);
2795 else printf(" [^\\x%02x]{", c);
2796 if (*code != OP_NOTEXACT) printf(",");
2797 printf("%d}", (code[1] << 8) + code[2]);
2798 if (*code == OP_NOTMINUPTO) printf("?");
2799 code += 3;
2800 break;
2801
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002802 case OP_REF:
2803 printf(" \\%d", *(++code));
2804 break;
2805
2806 case OP_CLASS:
Guido van Rossum50700601997-12-08 17:15:20 +00002807 case OP_CLASS_L:
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002808 {
2809 int i, min, max;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002810
Guido van Rossum50700601997-12-08 17:15:20 +00002811 if (*code==OP_CLASS_L)
2812 {
2813 code++;
2814 printf("Locflag = %i ", *code++);
2815 }
2816 else
2817 code++;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002818
Guido van Rossum50700601997-12-08 17:15:20 +00002819 printf(" [");
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002820
Guido van Rossum50700601997-12-08 17:15:20 +00002821 for (i = 0; i < 256; i++)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002822 {
Guido van Rossum50700601997-12-08 17:15:20 +00002823 if ((code[i/8] & (1 << (i&7))) != 0)
2824 {
2825 int j;
2826 for (j = i+1; j < 256; j++)
2827 if ((code[j/8] & (1 << (j&7))) == 0) break;
2828 if (i == '-' || i == ']') printf("\\");
2829 if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
2830 if (--j > i)
2831 {
2832 printf("-");
2833 if (j == '-' || j == ']') printf("\\");
2834 if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
2835 }
2836 i = j;
2837 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002838 }
2839 printf("]");
Guido van Rossum50700601997-12-08 17:15:20 +00002840 code += 32;
2841 /* code ++;*/
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002842
Guido van Rossum50700601997-12-08 17:15:20 +00002843 switch(*code)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002844 {
2845 case OP_CRSTAR:
2846 case OP_CRMINSTAR:
2847 case OP_CRPLUS:
2848 case OP_CRMINPLUS:
2849 case OP_CRQUERY:
2850 case OP_CRMINQUERY:
2851 printf("%s", OP_names[*code]);
2852 break;
2853
2854 case OP_CRRANGE:
2855 case OP_CRMINRANGE:
2856 min = (code[1] << 8) + code[2];
2857 max = (code[3] << 8) + code[4];
2858 if (max == 0) printf("{%d,}", min);
2859 else printf("{%d,%d}", min, max);
2860 if (*code == OP_CRMINRANGE) printf("?");
2861 code += 4;
2862 break;
2863
2864 default:
2865 code--;
2866 }
2867 }
2868 break;
2869
2870 /* Anything else is just a one-node item */
2871
2872 default:
2873 printf(" %s", OP_names[*code]);
2874 break;
2875 }
2876
2877 code++;
2878 printf("\n");
2879 }
2880printf("------------------------------------------------------------------\n");
2881
2882/* This check is done here in the debugging case so that the code that
2883was compiled can be seen. */
2884
2885if (code - re->code > length)
2886 {
Guido van Rossum50700601997-12-08 17:15:20 +00002887 printf("length=%i, code length=%i\n", length, code-re->code);
2888 *errorptr = ERR23;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002889 (pcre_free)(re);
2890 *erroroffset = ptr - (uschar *)pattern;
2891 return NULL;
2892 }
2893#endif
2894
2895return (pcre *)re;
2896}
2897
2898
2899
2900/*************************************************
2901* Match a character type *
2902*************************************************/
2903
2904/* Not used in all the places it might be as it's sometimes faster
2905to put the code inline.
2906
2907Arguments:
2908 type the character type
2909 c the character
Guido van Rossum50700601997-12-08 17:15:20 +00002910 dotall the dotall flag
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002911
2912Returns: TRUE if character is of the type
2913*/
2914
2915static BOOL
2916match_type(int type, int c, BOOL dotall)
2917{
2918
2919#ifdef DEBUG
2920if (isprint(c)) printf("matching subject %c against ", c);
2921 else printf("matching subject \\x%02x against ", c);
2922printf("%s\n", OP_names[type]);
2923#endif
2924
2925switch(type)
2926 {
2927 case OP_ANY: return dotall || c != '\n';
2928 case OP_NOT_DIGIT: return (pcre_ctypes[c] & ctype_digit) == 0;
2929 case OP_DIGIT: return (pcre_ctypes[c] & ctype_digit) != 0;
2930 case OP_NOT_WHITESPACE: return (pcre_ctypes[c] & ctype_space) == 0;
2931 case OP_WHITESPACE: return (pcre_ctypes[c] & ctype_space) != 0;
2932 case OP_NOT_WORDCHAR: return (pcre_ctypes[c] & ctype_word) == 0;
2933 case OP_WORDCHAR: return (pcre_ctypes[c] & ctype_word) != 0;
Guido van Rossum58132c61997-12-17 00:24:13 +00002934 case OP_NOT_WORDCHAR_L: return (c!='_' && !isalnum(c));
2935 case OP_WORDCHAR_L: return (c=='_' || isalnum(c));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002936 }
2937return FALSE;
2938}
2939
2940
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002941
2942/*************************************************
2943* Match a back-reference *
2944*************************************************/
2945
2946/* If a back reference hasn't been set, the match fails.
2947
2948Arguments:
2949 number reference number
2950 eptr points into the subject
2951 length length to be matched
2952 md points to match data block
2953
2954Returns: TRUE if matched
2955*/
2956
2957static BOOL
Guido van Rossum58132c61997-12-17 00:24:13 +00002958match_ref(int number, register const uschar *eptr, int length, match_data *md)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002959{
Guido van Rossum58132c61997-12-17 00:24:13 +00002960const uschar *p = md->start_subject + md->offset_vector[number];
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002961
2962#ifdef DEBUG
2963if (eptr >= md->end_subject)
2964 printf("matching subject <null>");
2965else
2966 {
2967 printf("matching subject ");
2968 pchars(eptr, length, TRUE, md);
2969 }
2970printf(" against backref ");
2971pchars(p, length, FALSE, md);
2972printf("\n");
2973#endif
2974
2975/* Always fail if not enough characters left */
2976
2977if (length > md->end_subject - p) return FALSE;
2978
Guido van Rossum58132c61997-12-17 00:24:13 +00002979/* Separate the caseless case for speed */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002980
2981if (md->caseless)
2982 { while (length-- > 0) if (pcre_lcc[*p++] != pcre_lcc[*eptr++]) return FALSE; }
2983else
2984 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
2985
2986return TRUE;
2987}
2988
2989static int free_stack(match_data *md)
2990{
2991/* Free any stack space that was allocated by the call to match(). */
2992if (md->off_num) free(md->off_num);
2993if (md->offset_top) free(md->offset_top);
2994if (md->r1) free(md->r1);
2995if (md->r2) free(md->r2);
2996if (md->eptr) free(md->eptr);
Guido van Rossumc3861071997-10-08 02:07:40 +00002997if (md->ecode) free(md->ecode);
2998return 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002999}
3000
3001static int grow_stack(match_data *md)
3002{
Guido van Rossum50700601997-12-08 17:15:20 +00003003 if (md->length != 0)
3004 {
3005 md->length = md->length + md->length/2;
3006 }
3007 else
3008 {
3009 int string_len = md->end_subject - md->start_subject + 1;
3010 if (string_len < 80) {md->length = string_len; }
3011 else {md->length = 80;}
3012 }
3013 PyMem_RESIZE(md->offset_top, int, md->length);
Guido van Rossum58132c61997-12-17 00:24:13 +00003014 PyMem_RESIZE(md->eptr, const uschar *, md->length);
3015 PyMem_RESIZE(md->ecode, const uschar *, md->length);
Guido van Rossum50700601997-12-08 17:15:20 +00003016 PyMem_RESIZE(md->off_num, int, md->length);
3017 PyMem_RESIZE(md->r1, int, md->length);
3018 PyMem_RESIZE(md->r2, int, md->length);
3019 if (md->offset_top == NULL || md->eptr == NULL || md->ecode == NULL ||
3020 md->off_num == NULL || md->r1 == NULL || md->r2 == NULL)
3021 {
3022 PyErr_SetString(PyExc_MemoryError, "Can't increase failure stack for re operation");
3023 longjmp(md->error_env, 1);
3024 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003025 return 0;
3026}
3027
Guido van Rossum50700601997-12-08 17:15:20 +00003028
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003029/*************************************************
3030* Match from current position *
3031*************************************************/
3032
3033/* On entry ecode points to the first opcode, and eptr to the first character.
3034
3035Arguments:
3036 eptr pointer in subject
3037 ecode position in code
3038 offset_top current top pointer
3039 md pointer to "static" info for the match
3040
3041Returns: TRUE if matched
3042*/
3043
3044static BOOL
Guido van Rossum58132c61997-12-17 00:24:13 +00003045match(register const uschar *eptr, register const uschar *ecode, int offset_top,
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003046 match_data *md)
3047{
3048 int save_stack_position = md->point;
3049match_loop:
3050
3051#define SUCCEED goto succeed
3052#define FAIL goto fail
3053
3054for (;;)
3055 {
3056 int min, max, ctype;
3057 register int i;
3058 register int c;
Guido van Rossum58132c61997-12-17 00:24:13 +00003059 BOOL minimize = FALSE;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003060
3061 /* Opening bracket. Check the alternative branches in turn, failing if none
3062 match. We have to set the start offset if required and there is space
3063 in the offset vector so that it is available for subsequent back references
3064 if the bracket matches. However, if the bracket fails, we must put back the
3065 previous value of both offsets in case they were set by a previous copy of
3066 the same bracket. Don't worry about setting the flag for the error case here;
3067 that is handled in the code for KET. */
3068
3069 if ((int)*ecode >= OP_BRA)
3070 {
3071 int number = (*ecode - OP_BRA) << 1;
Guido van Rossum58132c61997-12-17 00:24:13 +00003072 int save_offset1 = 0, save_offset2 = 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003073
Guido van Rossum57ba4f31997-12-02 20:40:28 +00003074#ifdef DEBUG
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003075 printf("start bracket %d\n", number/2);
Guido van Rossum57ba4f31997-12-02 20:40:28 +00003076#endif
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003077
3078 if (number > 0 && number < md->offset_end)
3079 {
3080 save_offset1 = md->offset_vector[number];
3081 save_offset2 = md->offset_vector[number+1];
3082 md->offset_vector[number] = eptr - md->start_subject;
3083
Guido van Rossum57ba4f31997-12-02 20:40:28 +00003084#ifdef DEBUG
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003085 printf("saving %d %d\n", save_offset1, save_offset2);
Guido van Rossum57ba4f31997-12-02 20:40:28 +00003086#endif
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003087 }
3088
3089 /* Recurse for all the alternatives. */
3090
3091 do
3092 {
3093 if (match(eptr, ecode+3, offset_top, md)) SUCCEED;
3094 ecode += (ecode[1] << 8) + ecode[2];
3095 }
3096 while (*ecode == OP_ALT);
3097
Guido van Rossum57ba4f31997-12-02 20:40:28 +00003098#ifdef DEBUG
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003099 printf("bracket %d failed\n", number/2);
Guido van Rossum57ba4f31997-12-02 20:40:28 +00003100#endif
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003101
3102 if (number > 0 && number < md->offset_end)
3103 {
3104 md->offset_vector[number] = save_offset1;
3105 md->offset_vector[number+1] = save_offset2;
3106 }
3107
3108 FAIL;
3109 }
3110
3111 /* Other types of node can be handled by a switch */
3112
3113 switch(*ecode)
3114 {
3115 case OP_END:
3116 md->end_match_ptr = eptr; /* Record where we ended */
3117 md->end_offset_top = offset_top; /* and how many extracts were taken */
3118 SUCCEED;
3119
Guido van Rossum50700601997-12-08 17:15:20 +00003120 /* The equivalent of Prolog's "cut" - if the rest doesn't match, the
3121 whole thing doesn't match, so we have to get out via a longjmp(). */
3122
3123 case OP_CUT:
3124 if (match(eptr, ecode+1, offset_top, md)) SUCCEED;
3125 longjmp(md->fail_env, 1);
3126
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003127 /* Assertion brackets. Check the alternative branches in turn - the
3128 matching won't pass the KET for an assertion. If any one branch matches,
3129 the assertion is true. */
3130
3131 case OP_ASSERT:
3132 do
3133 {
3134 if (match(eptr, ecode+3, offset_top, md)) break;
3135 ecode += (ecode[1] << 8) + ecode[2];
3136 }
3137 while (*ecode == OP_ALT);
3138 if (*ecode == OP_KET) FAIL;
3139
3140 /* Continue from after the assertion, updating the offsets high water
3141 mark, since extracts may have been taken during the assertion. */
3142
3143 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3144 ecode += 3;
3145 offset_top = md->end_offset_top;
3146 continue;
3147
3148 /* Negative assertion: all branches must fail to match */
3149
3150 case OP_ASSERT_NOT:
3151 do
3152 {
3153 if (match(eptr, ecode+3, offset_top, md)) FAIL;
3154 ecode += (ecode[1] << 8) + ecode[2];
3155 }
3156 while (*ecode == OP_ALT);
3157 ecode += 3;
3158 continue;
3159
Guido van Rossum50700601997-12-08 17:15:20 +00003160 /* "Once" brackets are like assertion brackets except that after a match,
3161 the point in the subject string is not moved back. Thus there can never be
3162 a move back into the brackets. Check the alternative branches in turn - the
3163 matching won't pass the KET for this kind of subpattern. If any one branch
3164 matches, we carry on, leaving the subject pointer. */
3165
3166 case OP_ONCE:
3167 do
3168 {
3169 if (match(eptr, ecode+3, offset_top, md)) break;
3170 ecode += (ecode[1] << 8) + ecode[2];
3171 }
3172 while (*ecode == OP_ALT);
3173 if (*ecode == OP_KET) return FALSE;
3174
3175 /* Continue as from after the assertion, updating the offsets high water
3176 mark, since extracts may have been taken. */
3177
3178 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3179 ecode += 3;
3180 offset_top = md->end_offset_top;
3181 eptr = md->end_match_ptr;
3182 continue;
3183
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003184 /* An alternation is the end of a branch; scan along to find the end of the
3185 bracketed group and go to there. */
3186
3187 case OP_ALT:
3188 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3189 break;
3190
3191 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
3192 that it may occur zero times. It may repeat infinitely, or not at all -
3193 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
3194 repeat limits are compiled as a number of copies, with the optional ones
3195 preceded by BRAZERO or BRAMINZERO. */
3196
3197 case OP_BRAZERO:
3198 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003199 const uschar *next = ecode+1;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003200 if (match(eptr, next, offset_top, md)) SUCCEED;
3201 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3202 ecode = next + 3;
3203 }
3204 break;
3205
3206 case OP_BRAMINZERO:
3207 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003208 const uschar *next = ecode+1;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003209 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3210 if (match(eptr, next+3, offset_top, md)) SUCCEED;
3211 ecode++;
3212 }
3213 break;;
3214
3215 /* End of a group, repeated or non-repeating. If we are at the end of
3216 an assertion "group", stop matching and SUCCEED, but record the
3217 current high water mark for use by positive assertions. */
3218
3219 case OP_KET:
3220 case OP_KETRMIN:
3221 case OP_KETRMAX:
3222 {
Guido van Rossum50700601997-12-08 17:15:20 +00003223 int number;
Guido van Rossum58132c61997-12-17 00:24:13 +00003224 const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003225
Guido van Rossum50700601997-12-08 17:15:20 +00003226 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT || *prev == OP_ONCE)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003227 {
Guido van Rossum50700601997-12-08 17:15:20 +00003228 md->end_match_ptr = eptr; /* For ONCE */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003229 md->end_offset_top = offset_top;
3230 SUCCEED;
3231 }
3232
3233 /* In all other cases we have to check the group number back at the
3234 start and if necessary complete handling an extraction by setting the
3235 final offset and bumping the high water mark. */
3236
3237 number = (*prev - OP_BRA) << 1;
3238
Guido van Rossum57ba4f31997-12-02 20:40:28 +00003239#ifdef DEBUG
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003240 printf("end bracket %d\n", number/2);
Guido van Rossum57ba4f31997-12-02 20:40:28 +00003241#endif
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003242
3243 if (number > 0)
3244 {
3245 if (number >= md->offset_end) md->offset_overflow = TRUE; else
3246 {
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003247 md->offset_vector[number+1] = eptr - md->start_subject;
3248 if (offset_top <= number) offset_top = number + 2;
3249 }
3250 }
3251
3252 /* For a non-repeating ket, just advance to the next node and continue at
3253 this level. */
3254
3255 if (*ecode == OP_KET)
3256 {
3257 ecode += 3;
3258 break;
3259 }
3260
3261 /* The repeating kets try the rest of the pattern or restart from the
3262 preceding bracket, in the appropriate order. */
3263
3264 if (*ecode == OP_KETRMIN)
3265 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003266 const uschar *ptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003267 if (match(eptr, ecode+3, offset_top, md)) goto succeed;
3268 /* Handle alternation inside the BRA...KET; push the additional
Guido van Rossum58132c61997-12-17 00:24:13 +00003269 alternatives onto the stack */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003270 ptr=prev;
3271 do {
3272 ptr += (ptr[1]<<8)+ ptr[2];
3273 if (*ptr==OP_ALT)
3274 {
Guido van Rossum50700601997-12-08 17:15:20 +00003275 if (md->length == md->point)
3276 {
3277 grow_stack(md);
3278 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003279 md->offset_top[md->point] = offset_top;
3280 md->eptr[md->point] = eptr;
3281 md->ecode[md->point] = ptr+3;
3282 md->r1[md->point] = 0;
3283 md->r2[md->point] = 0;
3284 md->off_num[md->point] = 0;
3285 md->point++;
3286 }
3287 } while (*ptr==OP_ALT);
3288 ecode=prev+3; goto match_loop;
3289 }
3290 else /* OP_KETRMAX */
3291 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003292 const uschar *ptr;
3293 /*int points_pushed=0;*/
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003294
3295 /* Push one failure point, that will resume matching at the code after
3296 the KETRMAX opcode. */
Guido van Rossum50700601997-12-08 17:15:20 +00003297 if (md->length == md->point)
3298 {
3299 grow_stack(md);
3300 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003301 md->offset_top[md->point] = offset_top;
3302 md->eptr[md->point] = eptr;
3303 md->ecode[md->point] = ecode+3;
3304 md->r1[md->point] = md->offset_vector[number];
3305 md->r2[md->point] = md->offset_vector[number+1];
3306 md->off_num[md->point] = number;
3307 md->point++;
3308
3309 md->offset_vector[number] = eptr - md->start_subject;
3310 /* Handle alternation inside the BRA...KET; push each of the
Guido van Rossum58132c61997-12-17 00:24:13 +00003311 additional alternatives onto the stack */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003312 ptr=prev;
3313 do {
3314 ptr += (ptr[1]<<8)+ ptr[2];
3315 if (*ptr==OP_ALT)
3316 {
Guido van Rossum50700601997-12-08 17:15:20 +00003317 if (md->length == md->point)
3318 if (md->length == md->point)
3319 {
3320 grow_stack(md);
3321 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003322 md->offset_top[md->point] = offset_top;
3323 md->eptr[md->point] = eptr;
3324 md->ecode[md->point] = ptr+3;
3325 md->r1[md->point] = 0;
3326 md->r2[md->point] = 0;
3327 md->off_num[md->point] = 0;
3328 md->point++;
Guido van Rossum58132c61997-12-17 00:24:13 +00003329 /*points_pushed++;*/
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003330 }
3331 } while (*ptr==OP_ALT);
3332 /* Jump to the first (or only) alternative and resume trying to match */
3333 ecode=prev+3; goto match_loop;
3334 }
3335 }
Guido van Rossum58132c61997-12-17 00:24:13 +00003336 break;
3337
Guido van Rossum50700601997-12-08 17:15:20 +00003338 /* Start of subject unless notbol, or after internal newline if multiline */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003339
3340 case OP_CIRC:
Guido van Rossum50700601997-12-08 17:15:20 +00003341 if (md->notbol && eptr == md->start_subject) FAIL;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003342 if (md->multiline)
3343 {
3344 if (eptr != md->start_subject && eptr[-1] != '\n') FAIL;
3345 ecode++;
3346 break;
3347 }
3348 /* ... else fall through */
3349
3350 /* Start of subject assertion */
3351
3352 case OP_SOD:
3353 if (eptr != md->start_subject) FAIL;
3354 ecode++;
3355 break;
3356
Guido van Rossum50700601997-12-08 17:15:20 +00003357 /* Assert before internal newline if multiline, or before
3358 a terminating newline unless endonly is set, else end of subject unless
3359 noteol is set. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003360
3361 case OP_DOLL:
Guido van Rossum50700601997-12-08 17:15:20 +00003362 if (md->noteol && eptr >= md->end_subject) FAIL;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003363 if (md->multiline)
3364 {
3365 if (eptr < md->end_subject && *eptr != '\n') FAIL;
3366 ecode++;
3367 break;
3368 }
Guido van Rossum50700601997-12-08 17:15:20 +00003369 else if (!md->endonly)
3370 {
3371 if (eptr < md->end_subject - 1 ||
3372 (eptr == md->end_subject - 1 && *eptr != '\n')) FAIL;
3373 ecode++;
3374 break;
3375 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003376 /* ... else fall through */
3377
3378 /* End of subject assertion */
3379
3380 case OP_EOD:
3381 if (eptr < md->end_subject) FAIL;
3382 ecode++;
3383 break;
3384
3385 /* Word boundary assertions */
3386
3387 case OP_NOT_WORD_BOUNDARY:
3388 case OP_WORD_BOUNDARY:
3389 {
3390 BOOL prev_is_word = (eptr != md->start_subject) &&
3391 ((pcre_ctypes[eptr[-1]] & ctype_word) != 0);
3392 BOOL cur_is_word = (eptr < md->end_subject) &&
3393 ((pcre_ctypes[*eptr] & ctype_word) != 0);
3394 if ((*ecode++ == OP_WORD_BOUNDARY)?
3395 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3396 FAIL;
3397 }
3398 break;
3399
Guido van Rossum50700601997-12-08 17:15:20 +00003400 case OP_NOT_WORD_BOUNDARY_L:
3401 case OP_WORD_BOUNDARY_L:
3402 {
3403 BOOL prev_is_word = (eptr != md->start_subject) &&
Guido van Rossum58132c61997-12-17 00:24:13 +00003404 (isalnum(eptr[-1]) || eptr[-1]=='_');
Guido van Rossum50700601997-12-08 17:15:20 +00003405 BOOL cur_is_word = (eptr < md->end_subject) &&
Guido van Rossum58132c61997-12-17 00:24:13 +00003406 (isalnum(*eptr) || *eptr=='_');
Guido van Rossum50700601997-12-08 17:15:20 +00003407 if ((*ecode++ == OP_WORD_BOUNDARY_L)?
3408 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3409 FAIL;
3410 }
3411 break;
3412
3413
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003414 /* Match a single character type; inline for speed */
3415
3416 case OP_ANY:
3417 if (!md->dotall && eptr < md->end_subject && *eptr == '\n') FAIL;
3418 if (eptr++ >= md->end_subject) FAIL;
3419 ecode++;
3420 break;
3421
3422 case OP_NOT_DIGIT:
3423 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) != 0)
3424 FAIL;
3425 ecode++;
3426 break;
3427
3428 case OP_DIGIT:
3429 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) == 0)
3430 FAIL;
3431 ecode++;
3432 break;
3433
3434 case OP_NOT_WHITESPACE:
3435 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) != 0)
3436 FAIL;
3437 ecode++;
3438 break;
3439
3440 case OP_WHITESPACE:
3441 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) == 0)
3442 FAIL;
3443 ecode++;
3444 break;
3445
3446 case OP_NOT_WORDCHAR:
3447 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) != 0)
3448 FAIL;
3449 ecode++;
3450 break;
3451
3452 case OP_WORDCHAR:
3453 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) == 0)
3454 FAIL;
3455 ecode++;
3456 break;
3457
Guido van Rossum50700601997-12-08 17:15:20 +00003458 case OP_NOT_WORDCHAR_L:
Guido van Rossum58132c61997-12-17 00:24:13 +00003459 if (eptr >= md->end_subject || (*eptr=='_' || isalnum(*eptr) ))
Guido van Rossum50700601997-12-08 17:15:20 +00003460 return FALSE;
3461 eptr++;
3462 ecode++;
3463 break;
3464
3465 case OP_WORDCHAR_L:
Guido van Rossum58132c61997-12-17 00:24:13 +00003466 if (eptr >= md->end_subject || (*eptr!='_' && !isalnum(*eptr) ))
Guido van Rossum50700601997-12-08 17:15:20 +00003467 return FALSE;
3468 eptr++;
3469 ecode++;
3470 break;
3471
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003472 /* Match a back reference, possibly repeatedly. Look past the end of the
3473 item to see if there is repeat information following. The code is similar
3474 to that for character classes, but repeated for efficiency. Then obey
3475 similar code to character type repeats - written out again for speed.
3476 However, if the referenced string is the empty string, always treat
3477 it as matched, any number of times (otherwise there could be infinite
3478 loops). */
3479
3480 case OP_REF:
3481 {
3482 int length;
3483 int number = ecode[1] << 1; /* Doubled reference number */
3484 ecode += 2; /* Advance past the item */
3485
3486 if (number >= offset_top || md->offset_vector[number] < 0)
3487 {
3488 md->errorcode = PCRE_ERROR_BADREF;
3489 FAIL;
3490 }
3491
3492 length = md->offset_vector[number+1] - md->offset_vector[number];
3493
3494 switch (*ecode)
3495 {
3496 case OP_CRSTAR:
3497 case OP_CRMINSTAR:
3498 case OP_CRPLUS:
3499 case OP_CRMINPLUS:
3500 case OP_CRQUERY:
3501 case OP_CRMINQUERY:
3502 c = *ecode++ - OP_CRSTAR;
3503 minimize = (c & 1) != 0;
3504 min = rep_min[c]; /* Pick up values from tables; */
3505 max = rep_max[c]; /* zero for max => infinity */
3506 if (max == 0) max = INT_MAX;
3507 break;
3508
3509 case OP_CRRANGE:
3510 case OP_CRMINRANGE:
3511 minimize = (*ecode == OP_CRMINRANGE);
3512 min = (ecode[1] << 8) + ecode[2];
3513 max = (ecode[3] << 8) + ecode[4];
3514 if (max == 0) max = INT_MAX;
3515 ecode += 5;
3516 break;
3517
3518 default: /* No repeat follows */
3519 if (!match_ref(number, eptr, length, md)) FAIL;
3520 eptr += length;
3521 continue; /* With the main loop */
3522 }
3523
3524 /* If the length of the reference is zero, just continue with the
3525 main loop. */
3526
3527 if (length == 0) continue;
3528
3529 /* First, ensure the minimum number of matches are present. We get back
3530 the length of the reference string explicitly rather than passing the
3531 address of eptr, so that eptr can be a register variable. */
3532
3533 for (i = 1; i <= min; i++)
3534 {
3535 if (!match_ref(number, eptr, length, md)) FAIL;
3536 eptr += length;
3537 }
3538
3539 /* If min = max, continue at the same level without recursion.
3540 They are not both allowed to be zero. */
3541
3542 if (min == max) continue;
3543
3544 /* If minimizing, keep trying and advancing the pointer */
3545
3546 if (minimize)
3547 {
3548 for (i = min;; i++)
3549 {
3550 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3551 if (i >= max || !match_ref(number, eptr, length, md))
3552 FAIL;
3553 eptr += length;
3554 }
3555 /* Control never gets here */
3556 }
3557
3558 /* If maximizing, find the longest string and work backwards */
3559
3560 else
3561 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003562 const uschar *pp = eptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003563 for (i = min; i < max; i++)
3564 {
3565 if (!match_ref(number, eptr, length, md)) break;
3566 eptr += length;
3567 }
3568 while (eptr >= pp)
3569 {
3570 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3571 eptr -= length;
3572 }
3573 FAIL;
3574 }
3575 }
3576 /* Control never gets here */
3577
3578 /* Match a character class, possibly repeatedly. Look past the end of the
3579 item to see if there is repeat information following. Then obey similar
Guido van Rossum50700601997-12-08 17:15:20 +00003580 code to character type repeats - written out again for speed. If caseless
3581 matching was set at runtime but not at compile time, we have to check both
3582 versions of a character. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003583
3584 case OP_CLASS:
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003585 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003586 const uschar *data = ecode + 1; /* Save for matching */
3587 ecode += 33; /* Advance past the item */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003588
3589 switch (*ecode)
3590 {
3591 case OP_CRSTAR:
3592 case OP_CRMINSTAR:
3593 case OP_CRPLUS:
3594 case OP_CRMINPLUS:
3595 case OP_CRQUERY:
3596 case OP_CRMINQUERY:
3597 c = *ecode++ - OP_CRSTAR;
3598 minimize = (c & 1) != 0;
3599 min = rep_min[c]; /* Pick up values from tables; */
3600 max = rep_max[c]; /* zero for max => infinity */
3601 if (max == 0) max = INT_MAX;
3602 break;
3603
3604 case OP_CRRANGE:
3605 case OP_CRMINRANGE:
3606 minimize = (*ecode == OP_CRMINRANGE);
3607 min = (ecode[1] << 8) + ecode[2];
3608 max = (ecode[3] << 8) + ecode[4];
3609 if (max == 0) max = INT_MAX;
3610 ecode += 5;
3611 break;
3612
3613 default: /* No repeat follows */
Guido van Rossum50700601997-12-08 17:15:20 +00003614 if (eptr >= md->end_subject) FAIL;
3615 c = *eptr++;
3616 if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */
3617 if (md->runtime_caseless)
3618 {
3619 c = pcre_fcc[c];
3620 if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */
3621 }
3622 FAIL;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003623 }
3624
3625 /* First, ensure the minimum number of matches are present. */
3626
3627 for (i = 1; i <= min; i++)
Guido van Rossum50700601997-12-08 17:15:20 +00003628 {
3629 if (eptr >= md->end_subject) FAIL;
3630 c = *eptr++;
3631 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3632 if (md->runtime_caseless)
3633 {
3634 c = pcre_fcc[c];
3635 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3636 }
3637 FAIL;
3638 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003639
3640 /* If max == min we can continue with the main loop without the
3641 need to recurse. */
3642
3643 if (min == max) continue;
3644
3645 /* If minimizing, keep testing the rest of the expression and advancing
3646 the pointer while it matches the class. */
3647
3648 if (minimize)
3649 {
3650 for (i = min;; i++)
3651 {
3652 if (match(eptr, ecode, offset_top, md)) SUCCEED;
Guido van Rossum50700601997-12-08 17:15:20 +00003653 if (i >= max || eptr >= md->end_subject) FAIL;
3654 c = *eptr++;
3655 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3656 if (md->runtime_caseless)
3657 {
3658 c = pcre_fcc[c];
3659 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3660 }
3661 FAIL;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003662 }
3663 /* Control never gets here */
3664 }
3665
3666 /* If maximizing, find the longest possible run, then work backwards. */
3667
3668 else
3669 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003670 const uschar *pp = eptr;
Guido van Rossum50700601997-12-08 17:15:20 +00003671 for (i = min; i < max; eptr++, i++)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003672 {
Guido van Rossum50700601997-12-08 17:15:20 +00003673 if (eptr >= md->end_subject) break;
3674 c = *eptr;
3675 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3676 if (md->runtime_caseless)
3677 {
3678 c = pcre_fcc[c];
3679 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3680 }
3681 break;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003682 }
Guido van Rossum50700601997-12-08 17:15:20 +00003683
3684 while (eptr >= pp)
3685 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
3686 FAIL;
3687 }
3688 }
3689 /* Control never gets here */
3690
3691 /* OP_CLASS_L opcode: handles localized character classes */
3692
3693 case OP_CLASS_L:
3694 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003695 const uschar *data = ecode + 1; /* Save for matching */
3696 const uschar locale_flag = *data;
Guido van Rossum50700601997-12-08 17:15:20 +00003697 ecode++; data++; /* The localization support adds an extra byte */
3698
3699 ecode += 33; /* Advance past the item */
3700
3701 switch (*ecode)
3702 {
3703 case OP_CRSTAR:
3704 case OP_CRMINSTAR:
3705 case OP_CRPLUS:
3706 case OP_CRMINPLUS:
3707 case OP_CRQUERY:
3708 case OP_CRMINQUERY:
3709 c = *ecode++ - OP_CRSTAR;
3710 minimize = (c & 1) != 0;
3711 min = rep_min[c]; /* Pick up values from tables; */
3712 max = rep_max[c]; /* zero for max => infinity */
3713 if (max == 0) max = INT_MAX;
3714 break;
3715
3716 case OP_CRRANGE:
3717 case OP_CRMINRANGE:
3718 minimize = (*ecode == OP_CRMINRANGE);
3719 min = (ecode[1] << 8) + ecode[2];
3720 max = (ecode[3] << 8) + ecode[4];
3721 if (max == 0) max = INT_MAX;
3722 ecode += 5;
3723 break;
3724
3725 default: /* No repeat follows */
3726 if (eptr >= md->end_subject) FAIL;
3727 c = *eptr++;
3728 if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */
Guido van Rossum58132c61997-12-17 00:24:13 +00003729 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3730 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003731#if 0
3732 if ( (locale_flag & 4) && isdigit(c) ) continue; /* Locale \d */
3733 if ( (locale_flag & 8) && !isdigit(c) ) continue; /* Locale \D */
3734 if ( (locale_flag & 16) && isspace(c) ) continue; /* Locale \s */
3735 if ( (locale_flag & 32) && !isspace(c) ) continue; /* Locale \S */
3736#endif
3737
3738 if (md->runtime_caseless)
3739 {
3740 c = pcre_fcc[c];
3741 if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */
3742
Guido van Rossum58132c61997-12-17 00:24:13 +00003743 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3744 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003745 }
3746 FAIL;
3747 }
3748
3749 /* First, ensure the minimum number of matches are present. */
3750
3751 for (i = 1; i <= min; i++)
3752 {
3753 if (eptr >= md->end_subject) FAIL;
3754 c = *eptr++;
3755 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum58132c61997-12-17 00:24:13 +00003756 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3757 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003758
3759 if (md->runtime_caseless)
3760 {
3761 c = pcre_fcc[c];
3762 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum58132c61997-12-17 00:24:13 +00003763 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3764 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003765 }
3766 FAIL;
3767 }
3768
3769 /* If max == min we can continue with the main loop without the
3770 need to recurse. */
3771
3772 if (min == max) continue;
3773
3774 /* If minimizing, keep testing the rest of the expression and advancing
3775 the pointer while it matches the class. */
3776
3777 if (minimize)
3778 {
3779 for (i = min;; i++)
3780 {
3781 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3782 if (i >= max || eptr >= md->end_subject) FAIL;
3783 c = *eptr++;
3784 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum58132c61997-12-17 00:24:13 +00003785 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3786 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003787
3788 if (md->runtime_caseless)
3789 {
3790 c = pcre_fcc[c];
3791 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum58132c61997-12-17 00:24:13 +00003792 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3793 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003794 }
3795 FAIL;
3796 }
3797 /* Control never gets here */
3798 }
3799
3800 /* If maximizing, find the longest possible run, then work backwards. */
3801
3802 else
3803 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003804 const uschar *pp = eptr;
Guido van Rossum50700601997-12-08 17:15:20 +00003805 for (i = min; i < max; eptr++, i++)
3806 {
3807 if (eptr >= md->end_subject) break;
3808 c = *eptr;
3809 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum58132c61997-12-17 00:24:13 +00003810 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3811 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003812 if (md->runtime_caseless)
3813 {
3814 c = pcre_fcc[c];
3815 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum58132c61997-12-17 00:24:13 +00003816 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3817 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003818 }
3819 break;
3820 }
3821
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003822 while (eptr >= pp)
3823 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
3824 FAIL;
3825 }
3826 }
3827 /* Control never gets here */
3828
3829 /* Match a run of characters */
3830
3831 case OP_CHARS:
3832 {
3833 register int length = ecode[1];
3834 ecode += 2;
3835
Guido van Rossum57ba4f31997-12-02 20:40:28 +00003836#ifdef DEBUG
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003837 if (eptr >= md->end_subject)
3838 printf("matching subject <null> against pattern ");
3839 else
3840 {
3841 printf("matching subject ");
3842 pchars(eptr, length, TRUE, md);
3843 printf(" against pattern ");
3844 }
3845 pchars(ecode, length, FALSE, md);
3846 printf("\n");
Guido van Rossum57ba4f31997-12-02 20:40:28 +00003847#endif
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003848
3849 if (length > md->end_subject - eptr) FAIL;
3850 if (md->caseless)
3851 {
3852 while (length-- > 0) if (pcre_lcc[*ecode++] != pcre_lcc[*eptr++]) FAIL;
3853 }
3854 else
3855 {
3856 while (length-- > 0) if (*ecode++ != *eptr++) FAIL;
3857 }
3858 }
3859 break;
3860
3861 /* Match a single character repeatedly; different opcodes share code. */
3862
3863 case OP_EXACT:
3864 min = max = (ecode[1] << 8) + ecode[2];
3865 ecode += 3;
3866 goto REPEATCHAR;
3867
3868 case OP_UPTO:
3869 case OP_MINUPTO:
3870 min = 0;
3871 max = (ecode[1] << 8) + ecode[2];
3872 minimize = *ecode == OP_MINUPTO;
3873 ecode += 3;
3874 goto REPEATCHAR;
3875
3876 case OP_STAR:
3877 case OP_MINSTAR:
3878 case OP_PLUS:
3879 case OP_MINPLUS:
3880 case OP_QUERY:
3881 case OP_MINQUERY:
3882 c = *ecode++ - OP_STAR;
3883 minimize = (c & 1) != 0;
3884 min = rep_min[c]; /* Pick up values from tables; */
3885 max = rep_max[c]; /* zero for max => infinity */
3886 if (max == 0) max = INT_MAX;
3887
3888 /* Common code for all repeated single-character matches. We can give
3889 up quickly if there are fewer than the minimum number of characters left in
3890 the subject. */
3891
3892 REPEATCHAR:
3893 if (min > md->end_subject - eptr) FAIL;
3894 c = *ecode++;
3895
3896 /* The code is duplicated for the caseless and caseful cases, for speed,
3897 since matching characters is likely to be quite common. First, ensure the
3898 minimum number of matches are present. If min = max, continue at the same
3899 level without recursing. Otherwise, if minimizing, keep trying the rest of
3900 the expression and advancing one matching character if failing, up to the
3901 maximum. Alternatively, if maximizing, find the maximum number of
3902 characters and work backwards. */
3903
Guido van Rossum57ba4f31997-12-02 20:40:28 +00003904#ifdef DEBUG
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003905 printf("matching %c{%d,%d} against subject %.*s\n", c, min, max,
3906 max, eptr);
Guido van Rossum57ba4f31997-12-02 20:40:28 +00003907#endif
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003908
3909 if (md->caseless)
3910 {
3911 c = pcre_lcc[c];
3912 for (i = 1; i <= min; i++) if (c != pcre_lcc[*eptr++]) FAIL;
3913 if (min == max) continue;
3914 if (minimize)
3915 {
3916 for (i = min;; i++)
3917 {
3918 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3919 if (i >= max || eptr >= md->end_subject || c != pcre_lcc[*eptr++])
3920 FAIL;
3921 }
3922 /* Control never gets here */
3923 }
3924 else
3925 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003926 const uschar *pp = eptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003927 for (i = min; i < max; i++)
3928 {
3929 if (eptr >= md->end_subject || c != pcre_lcc[*eptr]) break;
3930 eptr++;
3931 }
3932 while (eptr >= pp)
3933 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
3934 FAIL;
3935 }
Guido van Rossum50700601997-12-08 17:15:20 +00003936 /* Control never gets here */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003937 }
3938
3939 /* Caseful comparisons */
3940
3941 else
3942 {
3943 for (i = 1; i <= min; i++) if (c != *eptr++) FAIL;
3944 if (min == max) continue;
3945 if (minimize)
3946 {
3947 for (i = min;; i++)
3948 {
3949 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3950 if (i >= max || eptr >= md->end_subject || c != *eptr++) FAIL;
3951 }
3952 /* Control never gets here */
3953 }
3954 else
3955 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003956 const uschar *pp = eptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003957 for (i = min; i < max; i++)
3958 {
3959 if (eptr >= md->end_subject || c != *eptr) break;
3960 eptr++;
3961 }
3962 while (eptr >= pp)
3963 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
3964 FAIL;
3965 }
3966 }
3967 /* Control never gets here */
3968
Guido van Rossum50700601997-12-08 17:15:20 +00003969 /* Match a negated single character */
3970
3971 case OP_NOT:
3972 if (eptr > md->end_subject) FAIL;
3973 ecode++;
3974 if (md->caseless)
3975 {
3976 if (pcre_lcc[*ecode++] == pcre_lcc[*eptr++]) FAIL;
3977 }
3978 else
3979 {
3980 if (*ecode++ == *eptr++) FAIL;
3981 }
3982 break;
3983
3984 /* Match a negated single character repeatedly. This is almost a repeat of
3985 the code for a repeated single character, but I haven't found a nice way of
3986 commoning these up that doesn't require a test of the positive/negative
3987 option for each character match. Maybe that wouldn't add very much to the
3988 time taken, but character matching *is* what this is all about... */
3989
3990 case OP_NOTEXACT:
3991 min = max = (ecode[1] << 8) + ecode[2];
3992 ecode += 3;
3993 goto REPEATNOTCHAR;
3994
3995 case OP_NOTUPTO:
3996 case OP_NOTMINUPTO:
3997 min = 0;
3998 max = (ecode[1] << 8) + ecode[2];
3999 minimize = *ecode == OP_NOTMINUPTO;
4000 ecode += 3;
4001 goto REPEATNOTCHAR;
4002
4003 case OP_NOTSTAR:
4004 case OP_NOTMINSTAR:
4005 case OP_NOTPLUS:
4006 case OP_NOTMINPLUS:
4007 case OP_NOTQUERY:
4008 case OP_NOTMINQUERY:
4009 c = *ecode++ - OP_NOTSTAR;
4010 minimize = (c & 1) != 0;
4011 min = rep_min[c]; /* Pick up values from tables; */
4012 max = rep_max[c]; /* zero for max => infinity */
4013 if (max == 0) max = INT_MAX;
4014
4015 /* Common code for all repeated single-character matches. We can give
4016 up quickly if there are fewer than the minimum number of characters left in
4017 the subject. */
4018
4019 REPEATNOTCHAR:
4020 if (min > md->end_subject - eptr) FAIL;
4021 c = *ecode++;
4022
4023 /* The code is duplicated for the caseless and caseful cases, for speed,
4024 since matching characters is likely to be quite common. First, ensure the
4025 minimum number of matches are present. If min = max, continue at the same
4026 level without recursing. Otherwise, if minimizing, keep trying the rest of
4027 the expression and advancing one matching character if failing, up to the
4028 maximum. Alternatively, if maximizing, find the maximum number of
4029 characters and work backwards. */
4030
4031#ifdef DEBUG
4032 printf("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
4033 max, eptr);
4034#endif
4035
4036 if (md->caseless)
4037 {
4038 c = pcre_lcc[c];
4039 for (i = 1; i <= min; i++) if (c == pcre_lcc[*eptr++]) FAIL;
4040 if (min == max) continue;
4041 if (minimize)
4042 {
4043 for (i = min;; i++)
4044 {
4045 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4046 if (i >= max || eptr >= md->end_subject || c == pcre_lcc[*eptr++])
4047 FAIL;
4048 }
4049 /* Control never gets here */
4050 }
4051 else
4052 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004053 const uschar *pp = eptr;
Guido van Rossum50700601997-12-08 17:15:20 +00004054 for (i = min; i < max; i++)
4055 {
4056 if (eptr >= md->end_subject || c == pcre_lcc[*eptr]) break;
4057 eptr++;
4058 }
4059 while (eptr >= pp)
4060 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4061 FAIL;
4062 }
4063 /* Control never gets here */
4064 }
4065
4066 /* Caseful comparisons */
4067
4068 else
4069 {
4070 for (i = 1; i <= min; i++) if (c == *eptr++) FAIL;
4071 if (min == max) continue;
4072 if (minimize)
4073 {
4074 for (i = min;; i++)
4075 {
4076 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4077 if (i >= max || eptr >= md->end_subject || c == *eptr++) FAIL;
4078 }
4079 /* Control never gets here */
4080 }
4081 else
4082 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004083 const uschar *pp = eptr;
Guido van Rossum50700601997-12-08 17:15:20 +00004084 for (i = min; i < max; i++)
4085 {
4086 if (eptr >= md->end_subject || c == *eptr) break;
4087 eptr++;
4088 }
4089 while (eptr >= pp)
4090 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4091 FAIL;
4092 }
4093 }
4094 /* Control never gets here */
4095
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004096 /* Match a single character type repeatedly; several different opcodes
4097 share code. This is very similar to the code for single characters, but we
4098 repeat it in the interests of efficiency. */
4099
4100 case OP_TYPEEXACT:
4101 min = max = (ecode[1] << 8) + ecode[2];
4102 minimize = TRUE;
4103 ecode += 3;
4104 goto REPEATTYPE;
4105
4106 case OP_TYPEUPTO:
4107 case OP_TYPEMINUPTO:
4108 min = 0;
4109 max = (ecode[1] << 8) + ecode[2];
4110 minimize = *ecode == OP_TYPEMINUPTO;
4111 ecode += 3;
4112 goto REPEATTYPE;
4113
4114 case OP_TYPESTAR:
4115 case OP_TYPEMINSTAR:
4116 case OP_TYPEPLUS:
4117 case OP_TYPEMINPLUS:
4118 case OP_TYPEQUERY:
4119 case OP_TYPEMINQUERY:
4120 c = *ecode++ - OP_TYPESTAR;
4121 minimize = (c & 1) != 0;
4122 min = rep_min[c]; /* Pick up values from tables; */
4123 max = rep_max[c]; /* zero for max => infinity */
4124 if (max == 0) max = INT_MAX;
4125
4126 /* Common code for all repeated single character type matches */
4127
4128 REPEATTYPE:
4129 ctype = *ecode++; /* Code for the character type */
4130
4131 /* First, ensure the minimum number of matches are present. Use inline
4132 code for maximizing the speed, and do the type test once at the start
4133 (i.e. keep it out of the loop). Also test that there are at least the
4134 minimum number of characters before we start. */
4135
4136 if (min > md->end_subject - eptr) FAIL;
4137 if (min > 0) switch(ctype)
4138 {
4139 case OP_ANY:
4140 if (!md->dotall)
4141 { for (i = 1; i <= min; i++) if (*eptr++ == '\n') FAIL; }
4142 else eptr += min;
4143 break;
4144
4145 case OP_NOT_DIGIT:
4146 for (i = 1; i <= min; i++)
4147 if ((pcre_ctypes[*eptr++] & ctype_digit) != 0) FAIL;
4148 break;
4149
4150 case OP_DIGIT:
4151 for (i = 1; i <= min; i++)
4152 if ((pcre_ctypes[*eptr++] & ctype_digit) == 0) FAIL;
4153 break;
4154
4155 case OP_NOT_WHITESPACE:
4156 for (i = 1; i <= min; i++)
4157 if ((pcre_ctypes[*eptr++] & ctype_space) != 0) FAIL;
4158 break;
4159
4160 case OP_WHITESPACE:
4161 for (i = 1; i <= min; i++)
4162 if ((pcre_ctypes[*eptr++] & ctype_space) == 0) FAIL;
4163 break;
4164
4165 case OP_NOT_WORDCHAR:
4166 for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) != 0)
4167 FAIL;
4168 break;
4169
4170 case OP_WORDCHAR:
4171 for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) == 0)
4172 FAIL;
4173 break;
Guido van Rossum50700601997-12-08 17:15:20 +00004174
4175 case OP_NOT_WORDCHAR_L:
Guido van Rossum58132c61997-12-17 00:24:13 +00004176 for (i = 1; i <= min; i++, eptr++) if (*eptr=='_' || isalnum(*eptr))
Guido van Rossum50700601997-12-08 17:15:20 +00004177 return FALSE;
4178 break;
4179
4180 case OP_WORDCHAR_L:
Guido van Rossum58132c61997-12-17 00:24:13 +00004181 for (i = 1; i <= min; i++, eptr++) if (*eptr!='_' && !isalnum(*eptr))
Guido van Rossum50700601997-12-08 17:15:20 +00004182 return FALSE;
4183 break;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004184 }
4185
4186 /* If min = max, continue at the same level without recursing */
4187
4188 if (min == max) continue;
4189
4190 /* If minimizing, we have to test the rest of the pattern before each
4191 subsequent match, so inlining isn't much help; just use the function. */
4192
4193 if (minimize)
4194 {
4195 for (i = min;; i++)
4196 {
4197 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4198 if (i >= max || eptr >= md->end_subject ||
4199 !match_type(ctype, *eptr++, md->dotall))
4200 FAIL;
4201 }
4202 /* Control never gets here */
4203 }
4204
4205 /* If maximizing it is worth using inline code for speed, doing the type
4206 test once at the start (i.e. keep it out of the loop). */
4207
4208 else
4209 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004210 const uschar *pp = eptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004211 switch(ctype)
4212 {
4213 case OP_ANY:
4214 if (!md->dotall)
4215 {
4216 for (i = min; i < max; i++)
4217 {
4218 if (eptr >= md->end_subject || *eptr == '\n') break;
4219 eptr++;
4220 }
4221 }
4222 else
4223 {
4224 c = max - min;
4225 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
4226 eptr += c;
4227 }
4228 break;
4229
4230 case OP_NOT_DIGIT:
4231 for (i = min; i < max; i++)
4232 {
4233 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) != 0)
4234 break;
4235 eptr++;
4236 }
4237 break;
4238
4239 case OP_DIGIT:
4240 for (i = min; i < max; i++)
4241 {
4242 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) == 0)
4243 break;
4244 eptr++;
4245 }
4246 break;
4247
4248 case OP_NOT_WHITESPACE:
4249 for (i = min; i < max; i++)
4250 {
4251 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) != 0)
4252 break;
4253 eptr++;
4254 }
4255 break;
4256
4257 case OP_WHITESPACE:
4258 for (i = min; i < max; i++)
4259 {
4260 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) == 0)
4261 break;
4262 eptr++;
4263 }
4264 break;
4265
4266 case OP_NOT_WORDCHAR:
4267 for (i = min; i < max; i++)
4268 {
4269 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) != 0)
4270 break;
4271 eptr++;
4272 }
4273 break;
4274
4275 case OP_WORDCHAR:
4276 for (i = min; i < max; i++)
4277 {
Guido van Rossum50700601997-12-08 17:15:20 +00004278 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) == 0)
4279 break;
4280 eptr++;
4281 }
4282 break;
4283 case OP_NOT_WORDCHAR_L:
4284 for (i = min; i < max; i++)
4285 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004286 if (eptr >= md->end_subject || (*eptr=='_' || isalnum(*eptr) ) )
Guido van Rossum50700601997-12-08 17:15:20 +00004287 break;
4288 eptr++;
4289 }
4290 break;
4291
4292 case OP_WORDCHAR_L:
4293 for (i = min; i < max; i++)
4294 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004295 if (eptr >= md->end_subject || (*eptr!='_' && !isalnum(*eptr) ) )
Guido van Rossum50700601997-12-08 17:15:20 +00004296 break;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004297 eptr++;
4298 }
4299 break;
4300 }
4301
4302 while (eptr >= pp)
4303 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4304 FAIL;
4305 }
4306 /* Control never gets here */
4307
4308 /* There's been some horrible disaster. */
4309
4310 default:
Guido van Rossum57ba4f31997-12-02 20:40:28 +00004311#ifdef DEBUG
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004312 printf("Unknown opcode %d\n", *ecode);
Guido van Rossum57ba4f31997-12-02 20:40:28 +00004313#endif
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004314 md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
4315 FAIL;
4316 }
4317
4318 /* Do not stick any code in here without much thought; it is assumed
4319 that "continue" in the code above comes out to here to repeat the main
4320 loop. */
4321
4322 } /* End of main loop */
4323/* Control never reaches here */
4324
4325fail:
4326 if (md->point > save_stack_position)
4327 {
4328 /* If there are still points remaining on the stack, pop the next one off */
Guido van Rossumc3861071997-10-08 02:07:40 +00004329 int off_num;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004330
4331 md->point--;
4332 offset_top = md->offset_top[md->point];
4333 eptr = md->eptr[md->point];
4334 ecode = md->ecode[md->point];
4335 off_num = md->off_num[md->point];
4336 md->offset_vector[off_num] = md->r1[md->point];
4337 md->offset_vector[off_num+1] = md->r2[md->point];
4338 goto match_loop;
4339 }
4340 /* Failure, and nothing left on the stack, so end this function call */
4341
4342 /* Restore the top of the stack to where it was before this function
4343 call. This lets us use one stack for everything; recursive calls
4344 can push and pop information, and may increase the stack. When
4345 the call returns, the parent function can resume pushing and
4346 popping wherever it was. */
4347
4348 md->point = save_stack_position;
4349 return FALSE;
4350
4351succeed:
4352 return TRUE;
4353}
4354
4355
Guido van Rossum50700601997-12-08 17:15:20 +00004356
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004357/*************************************************
4358* Execute a Regular Expression *
4359*************************************************/
4360
4361/* This function applies a compiled re to a subject string and picks out
4362portions of the string if it matches. Two elements in the vector are set for
4363each substring: the offsets to the start and end of the substring.
4364
4365Arguments:
Guido van Rossum50700601997-12-08 17:15:20 +00004366 external_re points to the compiled expression
4367 external_extra points to "hints" from pcre_study() or is NULL
4368 subject points to the subject string
4369 length length of subject string (may contain binary zeros)
4370 options option bits
4371 offsets points to a vector of ints to be filled in with offsets
4372 offsetcount the number of elements in the vector
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004373
Guido van Rossum50700601997-12-08 17:15:20 +00004374Returns: > 0 => success; value is the number of elements filled in
4375 = 0 => success, but offsets is not big enough
4376 -1 => failed to match
4377 < -1 => some kind of unexpected problem
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004378*/
4379
4380int
Guido van Rossum50700601997-12-08 17:15:20 +00004381pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
4382 const char *subject, int length, int options, int *offsets, int offsetcount)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004383{
Guido van Rossum58132c61997-12-17 00:24:13 +00004384 /* The "volatile" directives are to make gcc -Wall stop complaining
4385 that these variables can be clobbered by the longjmp. Hopefully
4386 they won't cost too much performance. */
4387volatile int resetcount;
4388volatile int ocount = offsetcount;
4389volatile int first_char = -1;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004390match_data match_block;
Guido van Rossum58132c61997-12-17 00:24:13 +00004391volatile const uschar *start_bits = NULL;
4392const uschar *start_match = (uschar *)subject;
4393const uschar *end_subject;
4394const real_pcre *re = (const real_pcre *)external_re;
4395const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
4396volatile BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4397volatile BOOL startline = (re->options & PCRE_STARTLINE) != 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004398
4399if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4400
4401if (re == NULL || subject == NULL ||
4402 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4403if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
4404
Guido van Rossum58132c61997-12-17 00:24:13 +00004405match_block.start_subject = (const uschar *)subject;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004406match_block.end_subject = match_block.start_subject + length;
4407end_subject = match_block.end_subject;
4408
Guido van Rossum50700601997-12-08 17:15:20 +00004409match_block.caseless = ((re->options | options) & PCRE_CASELESS) != 0;
4410match_block.runtime_caseless = match_block.caseless &&
4411 (re->options & PCRE_CASELESS) == 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004412
Guido van Rossum50700601997-12-08 17:15:20 +00004413match_block.multiline = ((re->options | options) & PCRE_MULTILINE) != 0;
4414match_block.dotall = ((re->options | options) & PCRE_DOTALL) != 0;
4415match_block.endonly = ((re->options | options) & PCRE_DOLLAR_ENDONLY) != 0;
4416
4417match_block.notbol = (options & PCRE_NOTBOL) != 0;
4418match_block.noteol = (options & PCRE_NOTEOL) != 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004419
4420match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
4421
4422/* Set the stack state to empty */
4423 match_block.off_num = match_block.offset_top = NULL;
4424 match_block.r1 = match_block.r2 = NULL;
4425 match_block.eptr = match_block.ecode = NULL;
4426 match_block.point = match_block.length = 0;
4427
Guido van Rossum50700601997-12-08 17:15:20 +00004428/* If the expression has got more back references than the offsets supplied can
4429hold, we get a temporary bit of working store to use during the matching.
4430Otherwise, we can use the vector supplied, rounding down the size of it to a
4431multiple of 2. */
4432
4433ocount &= (-2);
4434if (re->top_backref > 0 && re->top_backref + 1 >= ocount/2)
4435 {
4436 ocount = re->top_backref * 2 + 2;
4437 match_block.offset_vector = (pcre_malloc)(ocount * sizeof(int));
4438 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
4439#ifdef DEBUG
4440 printf("Got memory to hold back references\n");
4441#endif
4442 }
4443else match_block.offset_vector = offsets;
4444
4445match_block.offset_end = ocount;
4446match_block.offset_overflow = FALSE;
4447
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004448/* Compute the minimum number of offsets that we need to reset each time. Doing
4449this makes a huge difference to execution time when there aren't many brackets
4450in the pattern. */
4451
4452resetcount = 2 + re->top_bracket * 2;
Guido van Rossum50700601997-12-08 17:15:20 +00004453if (resetcount > offsetcount) resetcount = ocount;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004454
4455/* If MULTILINE is set at exec time but was not set at compile time, and the
4456anchored flag is set, we must re-check because a setting provoked by ^ in the
4457pattern is not right in multi-line mode. Calling is_anchored() again here does
4458the right check, because multiline is now set. If it now yields FALSE, the
4459expression must have had ^ starting some of its branches. Check to see if
4460that is true for *all* branches, and if so, set the startline flag. */
4461
4462if (match_block. multiline && anchored && (re->options & PCRE_MULTILINE) == 0 &&
4463 !is_anchored(re->code, match_block.multiline))
4464 {
4465 anchored = FALSE;
4466 if (is_startline(re->code)) startline = TRUE;
4467 }
4468
4469/* Set up the first character to match, if available. The first_char value is
4470never set for an anchored regular expression, but the anchoring may be forced
4471at run time, so we have to test for anchoring. The first char may be unset for
4472an unanchored pattern, of course. If there's no first char and the pattern was
4473studied, the may be a bitmap of possible first characters. However, we can
4474use this only if the caseless state of the studying was correct. */
4475
4476if (!anchored)
4477 {
4478 if ((re->options & PCRE_FIRSTSET) != 0)
4479 {
4480 first_char = re->first_char;
4481 if (match_block.caseless) first_char = pcre_lcc[first_char];
4482 }
4483 else
4484 if (!startline && extra != NULL &&
4485 (extra->options & PCRE_STUDY_MAPPED) != 0 &&
4486 ((extra->options & PCRE_STUDY_CASELESS) != 0) == match_block.caseless)
4487 start_bits = extra->start_bits;
4488 }
4489
4490/* Loop for unanchored matches; for anchored regexps the loop runs just once. */
4491
4492do
4493 {
Guido van Rossum50700601997-12-08 17:15:20 +00004494 register int *iptr = match_block.offset_vector;
4495 register int *iend = iptr + resetcount;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004496
4497 /* Reset the maximum number of extractions we might see. */
4498
4499 while (iptr < iend) *iptr++ = -1;
4500
4501 /* Advance to a unique first char if possible */
4502
4503 if (first_char >= 0)
4504 {
4505 if (match_block.caseless)
4506 while (start_match < end_subject && pcre_lcc[*start_match] != first_char)
4507 start_match++;
4508 else
4509 while (start_match < end_subject && *start_match != first_char)
4510 start_match++;
4511 }
4512
4513 /* Or to just after \n for a multiline match if possible */
4514
4515 else if (startline)
4516 {
4517 if (start_match > match_block.start_subject)
4518 {
4519 while (start_match < end_subject && start_match[-1] != '\n')
4520 start_match++;
4521 }
4522 }
4523
4524 /* Or to a non-unique first char */
4525
4526 else if (start_bits != NULL)
4527 {
4528 while (start_match < end_subject)
4529 {
4530 register int c = *start_match;
Guido van Rossum50700601997-12-08 17:15:20 +00004531 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004532 }
4533 }
4534
Guido van Rossum57ba4f31997-12-02 20:40:28 +00004535#ifdef DEBUG
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004536 printf(">>>> Match against: ");
4537 pchars(start_match, end_subject - start_match, TRUE, &match_block);
4538 printf("\n");
Guido van Rossum57ba4f31997-12-02 20:40:28 +00004539#endif
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004540
4541 /* When a match occurs, substrings will be set for all internal extractions;
4542 we just need to set up the whole thing as substring 0 before returning. If
Guido van Rossum50700601997-12-08 17:15:20 +00004543 there were too many extractions, set the return code to zero. In the case
4544 where we had to get some local store to hold offsets for backreferences, copy
4545 those back references that we can. In this case there need not be overflow
4546 if certain parts of the pattern were not used.
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004547
Guido van Rossum50700601997-12-08 17:15:20 +00004548 Before starting the match, we have to set up a longjmp() target to enable
4549 the "cut" operation to fail a match completely without backtracking. */
4550
4551 /* To handle errors such as running out of memory for the failure
4552 stack, we need to save this location via setjmp(), so
4553 error-handling code can call longjmp() to jump out of deeply-nested code. */
4554 if (setjmp(match_block.error_env)==0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004555 {
Guido van Rossum50700601997-12-08 17:15:20 +00004556
4557 if (setjmp(match_block.fail_env) == 0 &&
4558 match(start_match, re->code, 2, &match_block))
4559 {
4560 int rc;
4561
4562 if (ocount != offsetcount)
4563 {
4564 if (offsetcount >= 4)
4565 {
4566 memcpy(offsets + 2, match_block.offset_vector + 2,
4567 (offsetcount - 2) * sizeof(int));
4568#ifdef DEBUG
4569 printf("Copied offsets; freeing temporary memory\n");
4570#endif
4571 }
4572 if (match_block.end_offset_top > offsetcount)
4573 match_block.offset_overflow = TRUE;
4574
4575#ifdef DEBUG
4576 printf("Freeing temporary memory\n");
4577#endif
4578
4579 (pcre_free)(match_block.offset_vector);
4580 }
4581
4582 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
4583
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004584 if (match_block.offset_end < 2) rc = 0; else
4585 {
4586 offsets[0] = start_match - match_block.start_subject;
4587 offsets[1] = match_block.end_match_ptr - match_block.start_subject;
4588 }
Guido van Rossum50700601997-12-08 17:15:20 +00004589
Guido van Rossum57ba4f31997-12-02 20:40:28 +00004590#ifdef DEBUG
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004591 printf(">>>> returning %d\n", rc);
Guido van Rossum57ba4f31997-12-02 20:40:28 +00004592#endif
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004593 free_stack(&match_block);
4594 return rc;
4595 }
Guido van Rossum50700601997-12-08 17:15:20 +00004596 } /* End of (if setjmp(match_block.error_env)...) */
4597 /* Return an error code; pcremodule.c will preserve the exception */
4598 if (PyErr_Occurred()) return PCRE_ERROR_NOMEMORY;
4599
4600 free_stack(&match_block);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004601 }
4602while (!anchored &&
4603 match_block.errorcode == PCRE_ERROR_NOMATCH &&
4604 start_match++ < end_subject);
4605
4606#ifdef DEBUG
4607printf(">>>> returning %d\n", match_block.errorcode);
4608#endif
Guido van Rossum50700601997-12-08 17:15:20 +00004609
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004610return match_block.errorcode;
4611}
4612
4613/* End of pcre.c */