blob: be9963dcd40cb9afd5a43a54c15eeb4e9079ae91 [file] [log] [blame]
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001
2/*************************************************
3* Perl-Compatible Regular Expressions *
4*************************************************/
5
6/* DO NOT EDIT THIS FILE! */
7
8/* This file is automatically written by the merge-files.py script
9included with the PCRE distribution for Python; it's produced from
10several C files, and code is removed in the process. If you want to
11modify the code or track down bugs, it will be much easier to work
12with the code in its original, multiple-file form. Don't edit this
13file by hand, or submit patches to it.
14
15The Python-specific PCRE distribution can be retrieved from
16 http://starship.skyport.net/crew/amk/regex/
17
18The unmodified original PCRE distribution doesn't have a fixed URL
19yet; write Philip Hazel <ph10@cam.ac.uk> for the latest version.
20
21Written by: Philip Hazel <ph10@cam.ac.uk>
22
23Extensively modified by the Python String-SIG: <string-sig@python.org>
24Send bug reports to: <string-sig@python.org>
25(They'll figure out if it's a bug in PCRE or in the Python-specific
26changes.)
27
28 Copyright (c) 1997 University of Cambridge
29
30-----------------------------------------------------------------------------
31Permission is granted to anyone to use this software for any purpose on any
32computer system, and to redistribute it freely, subject to the following
33restrictions:
34
351. This software is distributed in the hope that it will be useful,
36 but WITHOUT ANY WARRANTY; without even the implied warranty of
37 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
38
392. The origin of this software must not be misrepresented, either by
40 explicit claim or by omission.
41
423. Altered versions must be plainly marked as such, and must not be
43 misrepresented as being the original software.
44-----------------------------------------------------------------------------
45*/
46
47
48#define FOR_PYTHON
49#include "pcre-internal.h"
50#include "Python.h"
51#include "graminit.h"
52
53/*************************************************
54* Perl-Compatible Regular Expressions *
55*************************************************/
56
57/* This file is automatically written by the makechartables auxiliary
58program. If you edit it by hand, you might like to edit the Makefile to
59prevent its ever being regenerated. */
60
61/* This table is a lower casing table. */
62
63unsigned char pcre_lcc[] = {
64 0, 1, 2, 3, 4, 5, 6, 7,
65 8, 9, 10, 11, 12, 13, 14, 15,
66 16, 17, 18, 19, 20, 21, 22, 23,
67 24, 25, 26, 27, 28, 29, 30, 31,
68 32, 33, 34, 35, 36, 37, 38, 39,
69 40, 41, 42, 43, 44, 45, 46, 47,
70 48, 49, 50, 51, 52, 53, 54, 55,
71 56, 57, 58, 59, 60, 61, 62, 63,
72 64, 97, 98, 99,100,101,102,103,
73 104,105,106,107,108,109,110,111,
74 112,113,114,115,116,117,118,119,
75 120,121,122, 91, 92, 93, 94, 95,
76 96, 97, 98, 99,100,101,102,103,
77 104,105,106,107,108,109,110,111,
78 112,113,114,115,116,117,118,119,
79 120,121,122,123,124,125,126,127,
80 128,129,130,131,132,133,134,135,
81 136,137,138,139,140,141,142,143,
82 144,145,146,147,148,149,150,151,
83 152,153,154,155,156,157,158,159,
84 160,161,162,163,164,165,166,167,
85 168,169,170,171,172,173,174,175,
86 176,177,178,179,180,181,182,183,
87 184,185,186,187,188,189,190,191,
88 192,193,194,195,196,197,198,199,
89 200,201,202,203,204,205,206,207,
90 208,209,210,211,212,213,214,215,
91 216,217,218,219,220,221,222,223,
92 224,225,226,227,228,229,230,231,
93 232,233,234,235,236,237,238,239,
94 240,241,242,243,244,245,246,247,
95 248,249,250,251,252,253,254,255 };
96
97/* This table is an upper casing table. */
98
99unsigned char pcre_ucc[] = {
100 0, 1, 2, 3, 4, 5, 6, 7,
101 8, 9, 10, 11, 12, 13, 14, 15,
102 16, 17, 18, 19, 20, 21, 22, 23,
103 24, 25, 26, 27, 28, 29, 30, 31,
104 32, 33, 34, 35, 36, 37, 38, 39,
105 40, 41, 42, 43, 44, 45, 46, 47,
106 48, 49, 50, 51, 52, 53, 54, 55,
107 56, 57, 58, 59, 60, 61, 62, 63,
108 64, 65, 66, 67, 68, 69, 70, 71,
109 72, 73, 74, 75, 76, 77, 78, 79,
110 80, 81, 82, 83, 84, 85, 86, 87,
111 88, 89, 90, 91, 92, 93, 94, 95,
112 96, 65, 66, 67, 68, 69, 70, 71,
113 72, 73, 74, 75, 76, 77, 78, 79,
114 80, 81, 82, 83, 84, 85, 86, 87,
115 88, 89, 90,123,124,125,126,127,
116 128,129,130,131,132,133,134,135,
117 136,137,138,139,140,141,142,143,
118 144,145,146,147,148,149,150,151,
119 152,153,154,155,156,157,158,159,
120 160,161,162,163,164,165,166,167,
121 168,169,170,171,172,173,174,175,
122 176,177,178,179,180,181,182,183,
123 184,185,186,187,188,189,190,191,
124 192,193,194,195,196,197,198,199,
125 200,201,202,203,204,205,206,207,
126 208,209,210,211,212,213,214,215,
127 216,217,218,219,220,221,222,223,
128 224,225,226,227,228,229,230,231,
129 232,233,234,235,236,237,238,239,
130 240,241,242,243,244,245,246,247,
131 248,249,250,251,252,253,254,255 };
132
133/* This table identifies various classes of character by individual bits:
134 1 white space character
135 2 decimal digit
136 4 hexadecimal digit
137 8 alphanumeric or '_'
138 16 octal digit
139 128 regular expression metacharacter or binary zero
140*/
141
142unsigned char pcre_ctypes[] = {
143 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
144 0x00,0x01,0x01,0x01,0x01,0x01,0x00,0x00, /* 8- 15 */
145 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
146 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
147 0x01,0x00,0x00,0x00,0x80,0x00,0x00,0x00, /* - ' */
148 0x80,0x80,0x80,0x80,0x00,0x00,0x80,0x00, /* ( - / */
149 0x1e,0x1e,0x1e,0x1e,0x1e,0x1e,0x1e,0x1e, /* 0 - 7 */
150 0x0e,0x0e,0x00,0x00,0x00,0x00,0x00,0x80, /* 8 - ? */
151 0x00,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x08, /* @ - G */
152 0x08,0x08,0x08,0x08,0x08,0x08,0x08,0x08, /* H - O */
153 0x08,0x08,0x08,0x08,0x08,0x08,0x08,0x08, /* P - W */
154 0x08,0x08,0x08,0x80,0x00,0x00,0x80,0x08, /* X - _ */
155 0x00,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x08, /* ` - g */
156 0x08,0x08,0x08,0x08,0x08,0x08,0x08,0x08, /* h - o */
157 0x08,0x08,0x08,0x08,0x08,0x08,0x08,0x08, /* p - w */
158 0x08,0x08,0x08,0x80,0x80,0x00,0x00,0x00, /* x -127 */
159 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
160 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
161 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
162 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
163 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
164 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
165 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
166 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
167 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
168 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
169 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
170 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
171 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
172 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
173 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
174 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
175
176/* End of pcre-chartables.c */
177/*************************************************
178* Perl-Compatible Regular Expressions *
179*************************************************/
180
181/*
182This is a library of functions to support regular expressions whose syntax
183and semantics are as close as possible to those of the Perl 5 language.
184
185Written by: Philip Hazel <ph10@cam.ac.uk>
186
187 Copyright (c) 1997 University of Cambridge
188
189-----------------------------------------------------------------------------
190Permission is granted to anyone to use this software for any purpose on any
191computer system, and to redistribute it freely, subject to the following
192restrictions:
193
1941. This software is distributed in the hope that it will be useful,
195 but WITHOUT ANY WARRANTY; without even the implied warranty of
196 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
197
1982. The origin of this software must not be misrepresented, either by
199 explicit claim or by omission.
200
2013. Altered versions must be plainly marked as such, and must not be
202 misrepresented as being the original software.
203-----------------------------------------------------------------------------
204
205See the file Tech.Notes for some information on the internals.
206*/
207
208/* This module contains the actual definition of global variables that are
209shared between the different modules. In fact, these are limited to the
210indirections for memory management functions. */
211
212/* Include the internals header, which itself includes Standard C headers plus
213the external pcre header. */
214
215
216/* Store get and free functions. */
217
218void *(*pcre_malloc)(size_t) = malloc;
219void (*pcre_free)(void *) = free;
220
221/* End of pcre-globals.c */
222/*************************************************
223* Perl-Compatible Regular Expressions *
224*************************************************/
225
226/*
227This is a library of functions to support regular expressions whose syntax
228and semantics are as close as possible to those of the Perl 5 language. See
229the file Tech.Notes for some information on the internals.
230
231Written by: Philip Hazel <ph10@cam.ac.uk>
232
233 Copyright (c) 1997 University of Cambridge
234
235-----------------------------------------------------------------------------
236Permission is granted to anyone to use this software for any purpose on any
237computer system, and to redistribute it freely, subject to the following
238restrictions:
239
2401. This software is distributed in the hope that it will be useful,
241 but WITHOUT ANY WARRANTY; without even the implied warranty of
242 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
243
2442. The origin of this software must not be misrepresented, either by
245 explicit claim or by omission.
246
2473. Altered versions must be plainly marked as such, and must not be
248 misrepresented as being the original software.
249-----------------------------------------------------------------------------
250*/
251
252
253/* Include the internals header, which itself includes Standard C headers plus
254the external pcre header. */
255
256
257
258/* Character types for class type bits */
259
260static char class_types[] = { ctype_digit, ctype_space, ctype_word };
261
262
263
264/*************************************************
265* Set a range of bits in the map *
266*************************************************/
267
268/* This function is called for character types.
269
270Arguments:
271 start_bits points to the bit map
272 type a character type bit
273 include TRUE to include the type;
274 FALSE to include all but the type
275
276Returns: nothing
277*/
278
279static void
280set_type_bits(uschar *start_bits, int type, BOOL include)
281{
282int i;
283for (i = 0; i < 256; i++)
284 if (((pcre_ctypes[i] & type) != 0) == include) start_bits[i/8] |= (1<<(i%8));
285}
286
287
288
289/*************************************************
290* Set one bit in the map *
291*************************************************/
292
293/* This function is called to set a bit in the map for a given character,
294or both cases of a letter if caseless. It could be replaced by a macro if
295better performance is wanted.
296
297Arguments:
298 start_bits points to 32-byte table
299 c the character
300 caseless TRUE if caseless
301
302Returns: nothing
303*/
304
305static void
306set_bit(uschar *start_bits, int c, BOOL caseless)
307{
308if (caseless)
309 {
310 int d = pcre_ucc[c];
311 start_bits[d/8] |= (1<<(d%8));
312 c = pcre_lcc[c];
313 }
314start_bits[c/8] |= (1<<(c%8));
315}
316
317
318
319/*************************************************
320* Create bitmap of starting chars *
321*************************************************/
322
323/* This function scans a compiled unanchored expression and attempts to build a
324bitmap of the set of initial characters. If it can't, it returns FALSE. As time
325goes by, we may be able to get more clever at doing this.
326
327Arguments:
328 code points to an expression
329 start_bits points to a 32-byte table, initialized to 0
330 caseless TRUE if caseless
331
332Returns: TRUE if table built, FALSE otherwise
333*/
334
335static BOOL
336set_start_bits(uschar *code, uschar *start_bits, BOOL caseless)
337{
338do
339 {
340 uschar *tcode = code + 3;
341 BOOL try_next = TRUE;
342
343 while (try_next)
344 {
345 try_next = FALSE;
346
347 if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
348 {
349 if (!set_start_bits(tcode, start_bits, caseless)) return FALSE;
350 }
351
352 else switch(*tcode)
353 {
354 default:
355 return FALSE;
356
357 /* BRAZERO does the bracket, but carries on. */
358
359 case OP_BRAZERO:
360 case OP_BRAMINZERO:
361 if (!set_start_bits(++tcode, start_bits, caseless)) return FALSE;
362 do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
363 tcode += 3;
364 try_next = TRUE;
365 break;
366
367 /* Single-char * or ? sets the bit and tries the next item */
368
369 case OP_STAR:
370 case OP_MINSTAR:
371 case OP_QUERY:
372 case OP_MINQUERY:
373 set_bit(start_bits, tcode[1], caseless);
374 tcode += 2;
375 try_next = TRUE;
376 break;
377
378 /* Single-char upto sets the bit and tries the next */
379
380 case OP_UPTO:
381 case OP_MINUPTO:
382 set_bit(start_bits, tcode[3], caseless);
383 tcode += 4;
384 try_next = TRUE;
385 break;
386
387 /* At least one single char sets the bit and stops */
388
389 case OP_EXACT: /* Fall through */
390 tcode++;
391
392 case OP_CHARS: /* Fall through */
393 tcode++;
394
395 case OP_PLUS:
396 case OP_MINPLUS:
397 set_bit(start_bits, tcode[1], caseless);
398 break;
399
400 /* Single character type sets the bits and stops */
401
402 case OP_NOT_DIGIT:
403 set_type_bits(start_bits, ctype_digit, FALSE);
404 break;
405
406 case OP_DIGIT:
407 set_type_bits(start_bits, ctype_digit, TRUE);
408 break;
409
410 case OP_NOT_WHITESPACE:
411 set_type_bits(start_bits, ctype_space, FALSE);
412 break;
413
414 case OP_WHITESPACE:
415 set_type_bits(start_bits, ctype_space, TRUE);
416 break;
417
418 case OP_NOT_WORDCHAR:
419 set_type_bits(start_bits, ctype_word, FALSE);
420 break;
421
422 case OP_WORDCHAR:
423 set_type_bits(start_bits, ctype_word, TRUE);
424 break;
425
426 /* One or more character type fudges the pointer and restarts, knowing
427 it will hit a single character type and stop there. */
428
429 case OP_TYPEPLUS:
430 case OP_TYPEMINPLUS:
431 tcode++;
432 try_next = TRUE;
433 break;
434
435 case OP_TYPEEXACT:
436 tcode += 3;
437 try_next = TRUE;
438 break;
439
440 /* Zero or more repeats of character types set the bits and then
441 try again. */
442
443 case OP_TYPEUPTO:
444 case OP_TYPEMINUPTO:
445 tcode += 2; /* Fall through */
446
447 case OP_TYPESTAR:
448 case OP_TYPEMINSTAR:
449 case OP_TYPEQUERY:
450 case OP_TYPEMINQUERY:
451 switch(tcode[1])
452 {
453 case OP_NOT_DIGIT:
454 set_type_bits(start_bits, ctype_digit, FALSE);
455 break;
456
457 case OP_DIGIT:
458 set_type_bits(start_bits, ctype_digit, TRUE);
459 break;
460
461 case OP_NOT_WHITESPACE:
462 set_type_bits(start_bits, ctype_space, FALSE);
463 break;
464
465 case OP_WHITESPACE:
466 set_type_bits(start_bits, ctype_space, TRUE);
467 break;
468
469 case OP_NOT_WORDCHAR:
470 set_type_bits(start_bits, ctype_word, FALSE);
471 break;
472
473 case OP_WORDCHAR:
474 set_type_bits(start_bits, ctype_word, TRUE);
475 break;
476 }
477
478 tcode += 2;
479 try_next = TRUE;
480 break;
481
482 /* Character class: set the bits and either carry on or not,
483 according to the repeat count. */
484
485 case OP_CLASS:
486 case OP_NEGCLASS:
487 {
488 uschar *base = tcode;
489 uschar *data, *end;
490 uschar *bitmap = start_bits;
491 uschar local[32];
492 int flags = base[1];
493 int i;
494
495 tcode += 4 + 2 * tcode[2] + tcode[3]; /* Advance past the item */
496 switch (*tcode)
497 {
498 case OP_CRSTAR:
499 case OP_CRMINSTAR:
500 case OP_CRQUERY:
501 case OP_CRMINQUERY:
502 tcode++;
503 try_next = TRUE;
504 break;
505
506 case OP_CRRANGE:
507 case OP_CRMINRANGE:
508 if (((tcode[1] << 8) + tcode[2]) == 0)
509 {
510 tcode += 5;
511 try_next = TRUE;
512 }
513 break;
514 }
515
516 /* For a negated class, we have to build a separate table of all
517 the bits in the class, and then turn all other bits on in the main
518 table. Otherwise there are problems with things like [^\da]. */
519
520 if (*base == OP_NEGCLASS)
521 {
522 memset(local, 0, 32);
523 bitmap = local;
524 }
525
526 /* Character types */
527
528 for (i = 0; flags != 0; i++)
529 {
530 if ((flags & 1) != 0)
531 set_type_bits(bitmap, class_types[i/2], (i & 1) == 0);
532 flags >>= 1;
533 }
534
535 /* Ranges and individual characters */
536
537 data = base + 4;
538 end = data + base[2] * 2;
539 while (data < end)
540 {
541 for (i = *data; i <= data[1]; i++) set_bit(bitmap, i, caseless);
542 data += 2;
543 }
544
545 end += base[3];
546 while (data < end) set_bit(bitmap, *data++, caseless);
547
548 /* If a negated class, transfer data from local map to the main one */
549
550 if (bitmap != start_bits)
551 for (i = 0; i < 32; i++) start_bits[i] |= ~local[i];
552 }
553 break; /* End of class handling */
554
555 } /* End of switch */
556 } /* End of try_next loop */
557
558 code += (code[1] << 8) + code[2]; /* Advance to next branch */
559 }
560while (*code == OP_ALT);
561return TRUE;
562}
563
564
565
566/*************************************************
567* Study a compiled expression *
568*************************************************/
569
570/* This function is handed a compiled expression that it must study to produce
571information that will speed up the matching. It returns a pcre_extra block
572which then gets handed back to pcre_exec().
573
574Arguments:
575 re points to the compiled expression
576 options contains option bits
577 errorptr points to where to place error messages;
578 set NULL unless error
579
580Returns: pointer to a pcre_extra block,
581 NULL on error or if no optimization possible
582*/
583
584pcre_extra *
585pcre_study(pcre *external_re, int options, char **errorptr)
586{
587BOOL caseless;
588uschar start_bits[32];
589real_pcre_extra *extra;
590real_pcre *re = (real_pcre *)external_re;
591
592*errorptr = NULL;
593
594if (re == NULL || re->magic_number != MAGIC_NUMBER)
595 {
596 *errorptr = "argument is not a compiled regular expression";
597 return NULL;
598 }
599
600if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
601 {
602 *errorptr = "unknown or incorrect option bit(s) set";
603 return NULL;
604 }
605
606/* For an anchored pattern, or an unchored pattern that has a first char, or a
607multiline pattern that matches only at "line starts", no further processing at
608present. */
609
610if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
611 return NULL;
612
613/* Determine the caseless state from the compiled pattern and the current
614options. */
615
616caseless = ((re->options | options) & PCRE_CASELESS) != 0;
617
618/* See if we can find a fixed set of initial characters for the pattern. */
619
620memset(start_bits, 0, 32);
621if (!set_start_bits(re->code, start_bits, caseless)) return NULL;
622
623/* Get an "extra" block and put the information therein. */
624
625extra = (real_pcre_extra *)(pcre_malloc)(sizeof(real_pcre_extra));
626
627if (extra == NULL)
628 {
629 *errorptr = "failed to get memory";
630 return NULL;
631 }
632extra->options = PCRE_STUDY_MAPPED | (caseless? PCRE_STUDY_CASELESS : 0);
633memcpy(extra->start_bits, start_bits, 32);
634
635return (pcre_extra *)extra;
636}
637
638/* End of pcre-study.c */
639/*************************************************
640* Perl-Compatible Regular Expressions *
641*************************************************/
642
643/*
644This is a library of functions to support regular expressions whose syntax
645and semantics are as close as possible to those of the Perl 5 language. See
646the file Tech.Notes for some information on the internals.
647
648Written by: Philip Hazel <ph10@cam.ac.uk>
649
650 Copyright (c) 1997 University of Cambridge
651
652-----------------------------------------------------------------------------
653Permission is granted to anyone to use this software for any purpose on any
654computer system, and to redistribute it freely, subject to the following
655restrictions:
656
6571. This software is distributed in the hope that it will be useful,
658 but WITHOUT ANY WARRANTY; without even the implied warranty of
659 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
660
6612. The origin of this software must not be misrepresented, either by
662 explicit claim or by omission.
663
6643. Altered versions must be plainly marked as such, and must not be
665 misrepresented as being the original software.
666-----------------------------------------------------------------------------
667*/
668
669
670/* Define DEBUG to get debugging output on stdout. */
671
672/* #define DEBUG */
673
674
675/* Include the internals header, which itself includes Standard C headers plus
676the external pcre header. */
677
678
679#ifndef Py_eval_input
680/* For Python 1.4, graminit.h has to be explicitly included */
681#define Py_eval_input eval_input
682#endif
683
684/* Min and max values for the common repeats; for the maxima, 0 => infinity */
685
686static char rep_min[] = { 0, 0, 1, 1, 0, 0 };
687static char rep_max[] = { 0, 0, 0, 0, 1, 1 };
688
689/* Text forms of OP_ values and things, for debugging */
690
691#ifdef DEBUG
692static char *OP_names[] = { "End", "\\A", "\\B", "\\b", "\\D", "\\d",
693 "\\S", "\\s", "\\W", "\\w", "\\Z", "^", "$", "Any", "chars",
694 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
695 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
696 "*", "*?", "+", "+?", "?", "??", "{", "{",
697 "class", "negclass", "Ref",
698 "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",
699 "Brazero", "Braminzero", "Bra"
700};
701
702static char *class_names[] = { "\\d", "\\D", "\\s", "\\S", "\\w", "\\W" };
703#endif
704
705/* Table of character type operators that correspond to the bits in the
706character class flags, starting at the least significant end. */
707
708static char class_ops[] = {
709 OP_DIGIT, OP_NOT_DIGIT,
710 OP_WHITESPACE, OP_NOT_WHITESPACE,
711 OP_WORDCHAR, OP_NOT_WORDCHAR };
712
713/* Table for handling escaped characters in the range '0'-'z'. Positive returns
714are simple data values; negative values are for special things like \d and so
715on. Zero means further processing is needed (for things like \x), or the escape
716is invalid. */
717
718/* PYTHON: Python doesn't support \e, but does support \v */
719
720static short int escapes[] = {
721 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
722 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
723 '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
724 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
725 0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
726 0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
727 '`', 7, -ESC_b, 0, -ESC_d, 0, '\f', 0, /* ` - g */
728 0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
729 0, 0, '\r', -ESC_s, '\t', 0, '\v', -ESC_w, /* p - w */
730 0, 0, 0 /* x - z */
731};
732
733
734/* Definition to allow mutual recursion */
735
736static BOOL compile_regexp(BOOL, int *, uschar **, uschar **,
737 char **, PyObject *);
738
739/* Structure for passing "static" information around between the functions
740doing the matching, so that they are thread-safe. */
741
742typedef struct match_data {
743 int errorcode; /* As it says */
744 int *offset_vector; /* Offset vector */
745 int offset_end; /* One past the end */
746 BOOL offset_overflow; /* Set if too many extractions */
747 BOOL caseless; /* Case-independent flag */
748 BOOL multiline; /* Multiline flag */
749 uschar *start_subject; /* Start of the subject string */
750 uschar *end_subject; /* End of the subject string */
751
752 uschar *end_match_ptr; /* Subject position at end match */
753 int end_offset_top; /* Highwater mark at end of match */
754 BOOL dotall; /* Dotall flag */
755 int length; /* Length of the allocated stacks */
756 int point; /* Point to add next item pushed onto stacks */
757 /* Pointers to the 6 stacks */
758 int *off_num, *offset_top, *r1, *r2;
759 uschar **eptr, **ecode;
760} match_data;
761
762
763
764/*************************************************
765* Return version string *
766*************************************************/
767
768char *
769pcre_version(void)
770{
771return PCRE_VERSION;
772}
773
774
775
776
777/*************************************************
778* Return info about a compiled pattern *
779*************************************************/
780
781/* This function picks potentially useful data out of the private
782structure.
783
784Arguments:
785 external_re points to compiled code
786 optptr where to pass back the options
787 first_char where to pass back the first character,
788 or -1 if multiline and all branches start ^,
789 or -2 otherwise
790
791Returns: number of identifying extraction brackets
792 or negative values on error
793*/
794
795int
796pcre_info(pcre *external_re, int *optptr, int *first_char)
797{
798real_pcre *re = (real_pcre *)external_re;
799if (re == NULL) return PCRE_ERROR_NULL;
800if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
801if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);
802if (first_char != NULL)
803 *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
804 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
805return re->top_bracket;
806}
807
808
809
810
811#ifdef DEBUG
812/*************************************************
813* Debugging function to print chars *
814*************************************************/
815
816/* Print a sequence of chars in printable format, stopping at the end of the
817subject if the requested.
818
819Arguments:
820 p points to characters
821 length number to print
822 is_subject TRUE if printing from within md->start_subject
823 md pointer to matching data block, if is_subject is TRUE
824
825Returns: nothing
826*/
827
828static pchars(uschar *p, int length, BOOL is_subject, match_data *md)
829{
830int c;
831if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
832while (length-- > 0)
833 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
834}
835#endif
836
837
838
839
840/*************************************************
841* Check subpattern for empty operand *
842*************************************************/
843
844/* This function checks a bracketed subpattern to see if any of the paths
845through it could match an empty string. This is used to diagnose an error if
846such a subpattern is followed by a quantifier with an unlimited upper bound.
847
848Argument:
849 code points to the opening bracket
850
851Returns: TRUE or FALSE
852*/
853
854static BOOL
855could_be_empty(uschar *code)
856{
857do {
858 uschar *cc = code + 3;
859
860 /* Scan along the opcodes for this branch; as soon as we find something
861 that matches a non-empty string, break out and advance to test the next
862 branch. If we get to the end of the branch, return TRUE for the whole
863 sub-expression. */
864
865 for (;;)
866 {
867 /* Test an embedded subpattern; if it could not be empty, break the
868 loop. Otherwise carry on in the branch. */
869
870 if ((int)(*cc) >= OP_BRA)
871 {
872 if (!could_be_empty(cc)) break;
873 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
874 cc += 3;
875 }
876
877 else switch (*cc)
878 {
879 /* Reached end of a branch: the subpattern may match the empty string */
880
881 case OP_ALT:
882 case OP_KET:
883 case OP_KETRMAX:
884 case OP_KETRMIN:
885 return TRUE;
886
887 /* Skip over assertive subpatterns */
888
889 case OP_ASSERT:
890 case OP_ASSERT_NOT:
891 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
892 cc += 3;
893 break;
894
895 /* Skip over things that don't match chars */
896
897 case OP_SOD:
898 case OP_EOD:
899 case OP_CIRC:
900 case OP_DOLL:
901 case OP_BRAZERO:
902 case OP_BRAMINZERO:
903 case OP_NOT_WORD_BOUNDARY:
904 case OP_WORD_BOUNDARY:
905 cc++;
906 break;
907
908 /* Skip over simple repeats with zero lower bound */
909
910 case OP_STAR:
911 case OP_MINSTAR:
912 case OP_QUERY:
913 case OP_MINQUERY:
914 case OP_TYPESTAR:
915 case OP_TYPEMINSTAR:
916 case OP_TYPEQUERY:
917 case OP_TYPEMINQUERY:
918 cc += 2;
919 break;
920
921 /* Skip over UPTOs (lower bound is zero) */
922
923 case OP_UPTO:
924 case OP_MINUPTO:
925 case OP_TYPEUPTO:
926 case OP_TYPEMINUPTO:
927 cc += 4;
928 break;
929
930 /* Check a class or a back reference for a zero minimum */
931
932 case OP_CLASS:
933 case OP_NEGCLASS:
934 case OP_REF:
935 cc += (*cc == OP_REF)? 2 : 4 + 2 * cc[2] + cc[3];
936
937 switch (*cc)
938 {
939 case OP_CRSTAR:
940 case OP_CRMINSTAR:
941 case OP_CRQUERY:
942 case OP_CRMINQUERY:
943 cc++;
944 break;
945
946 case OP_CRRANGE:
947 case OP_CRMINRANGE:
948 if ((cc[1] << 8) + cc[2] != 0) goto NEXT_BRANCH;
949 cc += 3;
950 break;
951
952 default:
953 goto NEXT_BRANCH;
954 }
955 break;
956
957 /* Anything else matches at least one character */
958
959 default:
960 goto NEXT_BRANCH;
961 }
962 }
963
964 NEXT_BRANCH:
965 code += (code[1] << 8) + code[2];
966 }
967while (*code == OP_ALT);
968
969/* No branches match the empty string */
970
971return FALSE;
972}
973
974
975/* Determine the length of a group ID in an expression like
976 (?P<foo_123>...)
977Arguments:
978 ptr pattern position pointer (say that 3 times fast)
979 finalchar the character that will mark the end of the ID
980 errorptr points to the pointer to the error message
981*/
982
983static int
984get_group_id(uschar *ptr, char finalchar, char **errorptr)
985{
986 uschar *start = ptr;
987
988 /* If the first character is not in \w, or is in \w but is a digit,
989 report an error */
990 if (!(pcre_ctypes[*ptr] & ctype_word) ||
991 (pcre_ctypes[*ptr++] & ctype_digit))
992 {
993 *errorptr = "(?P identifier must start with a letter or underscore";
994 return 0;
995 }
996
997 /* Increment ptr until we either hit a null byte, the desired
998 final character, or a non-word character */
999 for(; (*ptr != 0) && (*ptr != finalchar) &&
1000 (pcre_ctypes[*ptr] & ctype_word); ptr++)
1001 {
Guido van Rossumc3861071997-10-08 02:07:40 +00001002 /* Empty loop body */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001003 }
1004 if (*ptr==finalchar)
1005 return ptr-start;
1006 if (*ptr==0)
1007 {
1008 *errorptr = "unterminated (?P identifier";
1009 return 0;
1010 }
1011 *errorptr = "illegal character in (?P identifier";
1012 return 0;
1013}
1014
1015/*************************************************
1016* Handle escapes *
1017*************************************************/
1018
1019/* This function is called when a \ has been encountered. It either returns a
1020positive value for a simple escape such as \n, or a negative value which
1021encodes one of the more complicated things such as \d. On entry, ptr is
1022pointing at the \. On exit, it is on the final character of the escape
1023sequence.
1024
1025Arguments:
1026 ptrptr points to the pattern position pointer
1027 errorptr points to the pointer to the error message
1028
1029Returns: zero or positive => a data character
1030 negative => a special escape sequence
1031 on error, errorptr is set
1032*/
1033
1034static int
1035check_escape(uschar **ptrptr, char **errorptr)
1036{
1037uschar *ptr = *ptrptr;
1038int c = *(++ptr) & 255; /* Ensure > 0 on signed-char systems */
1039int i;
1040
1041if (c == 0) *errorptr = "\\ at end of pattern";
1042
1043/* Digits or letters may have special meaning; all others are literals. */
1044
1045else if (c < '0' || c > 'z') {}
1046
1047/* Do an initial lookup in a table. A non-zero result is something that can be
1048returned immediately. Otherwise further processing may be required. */
1049
1050else if ((i = escapes[c - '0']) != 0) c = i;
1051
1052/* Escapes that need further processing, or are illegal. */
1053
1054else switch (c)
1055 {
1056 case '0':
1057 c = 0;
1058 while(i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_odigit) != 0 )
1059 c = c * 8 + *(++ptr) - '0';
1060 break;
1061
1062 case '1': case '2': case '3': case '4': case '5':
1063 case '6': case '7': case '8': case '9':
1064 {
1065 /* PYTHON: Try to compute an octal value for a character */
1066 for(c=0, i=0; c!=-1 && ptr[i]!=0 && i<3; i++)
1067 {
1068 if (( pcre_ctypes[ ptr[i] ] & ctype_odigit) != 0)
1069 c = c * 8 + ptr[i]-'0';
1070 else
1071 c = -1; /* Non-octal character */
1072 }
1073 /* Aha! There were 3 octal digits, so it must be a character */
1074 if (c != -1 && i == 3)
1075 {
1076 ptr += i-1;
1077 break;
1078 }
1079 c = ptr[0]; /* Restore the first character after the \ */
1080 c -= '0'; i = 1;
1081 while (i<2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0)
1082 {
1083 c = c * 10 + ptr[1] - '0';
1084 ptr++; i++;
1085 }
1086 if (c > 255 - ESC_REF) *errorptr = "back reference too big";
1087 c = -(ESC_REF + c);
1088 }
1089 break;
1090
1091 case 'x':
1092 {
Guido van Rossumc3861071997-10-08 02:07:40 +00001093 int length;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001094 char *string;
Guido van Rossumc3861071997-10-08 02:07:40 +00001095 PyObject *result;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001096
1097 i=1;
1098 while (ptr[i]!=0 &&
1099 ( pcre_ctypes[ptr[i]] & ctype_xdigit) != 0)
1100 i++;
1101 if (i==1)
1102 {
1103 *errorptr="\\x must be followed by hex digits";
1104 break;
1105 }
1106 length=i-1;
1107 string=malloc(length+4+1);
1108 if (string==NULL)
1109 {
1110 *errorptr="can't allocate memory for \\x string";
1111 break;
1112 }
1113 /* Create a string containing "\x<hexdigits>", which will be
1114 passed to eval() */
1115 string[0]=string[length+3]='"';
1116 string[1]='\\';
1117 string[length+4]='\0';
1118 memcpy(string+2, ptr, length+1);
1119 ptr += length;
Guido van Rossumc3861071997-10-08 02:07:40 +00001120 result=PyRun_String((char *)string, Py_eval_input,
1121 PyEval_GetGlobals(), PyEval_GetLocals());
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001122 free(string);
1123 /* The evaluation raised an exception */
Guido van Rossumc3861071997-10-08 02:07:40 +00001124 if (result==NULL)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001125 {
1126 *errorptr="exception occurred during evaluation of \\x";
1127 break;
1128 }
Guido van Rossumc3861071997-10-08 02:07:40 +00001129 if (PyString_Size(result)!=1)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001130 {
Guido van Rossumc3861071997-10-08 02:07:40 +00001131 Py_DECREF(result);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001132 *errorptr="\\x string is not one byte in length";
1133 break;
1134 }
Guido van Rossumc3861071997-10-08 02:07:40 +00001135 c=*(unsigned char *)PyString_AsString(result);
1136 Py_DECREF(result);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001137 break;
1138 }
1139 break;
1140
1141
1142 case 'l':
1143 case 'L':
1144 case 'u':
1145 case 'U':
1146 case 'Q':
1147 case 'E':
1148 *errorptr = "the Perl escapes \\u, \\U, \\l, \\L, \\Q, \\E are not valid";
1149 break;
1150
1151 default:
1152 /* In Python, an unrecognized escape will simply return the character
1153 after the backslash, so do nothing */
1154 break;
1155 }
1156
1157*ptrptr = ptr;
1158return c;
1159}
1160
1161
1162
1163/*************************************************
1164* Read repeat counts *
1165*************************************************/
1166
1167/* Read an item of the form {n,m} and return the values.
1168
1169Arguments:
1170 p pointer to first char after '{'
1171 minp pointer to int for min
1172 maxp pointer to int for max
1173 returned as -1 if no max
1174 errorptr points to pointer to error message
1175
1176Returns: pointer to '}' on success;
1177 current ptr on error, with errorptr set
1178*/
1179
1180static uschar *
1181read_repeat_counts(uschar *p, int *minp, int *maxp, char **errorptr)
1182{
1183int min = 0;
1184int max = -1;
1185
1186if ((pcre_ctypes[*p] & ctype_digit) == 0)
1187 {
1188 *errorptr = "number expected after {";
1189 return p;
1190 }
1191
1192while ((pcre_ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
1193
1194if (*p == '}') max = min; else
1195 {
1196 if (*p++ != ',')
1197 {
1198 *errorptr = "comma expected";
1199 return p-1;
1200 }
1201 if (*p != '}')
1202 {
1203 max = 0;
1204 while((pcre_ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
1205 if (*p != '}')
1206 {
1207 *errorptr = "} expected";
1208 return p;
1209 }
1210 if (max < min)
1211 {
1212 *errorptr = "numbers out of order";
1213 return p;
1214 }
1215 }
1216 }
1217
1218/* Do paranoid checks, then fill in the required variables, and pass back the
1219pointer to the terminating '}'. */
1220
1221if (max == 0) *errorptr = "zero maximum not allowed";
1222else if (min > 65535 || max > 65535) *errorptr = "number too big";
1223else
1224 {
1225 *minp = min;
1226 *maxp = max;
1227 }
1228return p;
1229}
1230
1231
1232
1233/*************************************************
1234* Compile one branch *
1235*************************************************/
1236
1237/* Scan the pattern, compiling it into the code vector.
1238
1239Arguments:
1240 extended TRUE if the PCRE_EXTENDED option was set
1241 brackets points to 2-element bracket vector
1242 code points to the pointer to the current code point
1243 ptrptr points to the current pattern pointer
1244 errorptr points to pointer to error message
1245
1246Returns: TRUE on success
1247 FALSE, with *errorptr set on error
1248*/
1249
1250static BOOL
1251compile_branch(BOOL extended, int *brackets, uschar **codeptr,
1252 uschar **ptrptr, char **errorptr, PyObject *dictionary)
1253{
1254int repeat_type, op_type;
1255int repeat_min, repeat_max;
1256int bravalue, length;
1257register int c;
1258register uschar *code = *codeptr;
1259uschar *ptr = *ptrptr;
1260uschar *previous = NULL;
1261uschar *oldptr;
1262
1263/* Switch on next character until the end of the branch */
1264
1265for (;; ptr++)
1266 {
1267 c = *ptr;
1268 if (extended)
1269 {
1270 if ((pcre_ctypes[c] & ctype_space) != 0) continue;
1271 if (c == '#')
1272 {
1273 while ((c = *(++ptr)) != 0 && c != '\n');
1274 continue;
1275 }
1276 }
1277
1278 switch(c)
1279 {
1280 /* The branch terminates at end of string, |, or ). */
1281
1282 case 0:
1283 case '|':
1284 case ')':
1285 *codeptr = code;
1286 *ptrptr = ptr;
1287 return TRUE;
1288
1289 /* Handle single-character metacharacters */
1290
1291 case '^':
1292 previous = NULL;
1293 *code++ = OP_CIRC;
1294 break;
1295
1296 case '$':
1297 previous = NULL;
1298 *code++ = OP_DOLL;
1299 break;
1300
1301 case '.':
1302 previous = code;
1303 *code++ = OP_ANY;
1304 break;
1305
1306 /* Character classes. We do quite a bit of munging around here. There are
1307 always four initial bytes: the op_code, a flags byte for things like \d, a
1308 count of pairs and a count of single characters. The pairs then follow, and
1309 finally the single characters. */
1310
1311 case '[':
1312 {
1313 int rangecount = 0;
1314 int flags = 0;
1315 int singles_count = 0;
1316 char singles[256];
1317
1318 previous = code;
1319
1320 /* If the first character is '^', set the negation flag */
1321
1322 if ((c = *(++ptr)) == '^') { *code = OP_NEGCLASS; c = *(++ptr); }
1323 else *code = OP_CLASS;
1324 code += 4;
1325
1326 /* Process characters until ] is reached. By writing this as a "do" it
1327 means that an initial ] is taken as a data character. */
1328
1329 do
1330 {
1331 if (c == 0)
1332 {
1333 *errorptr = "] missing";
1334 goto FAILED;
1335 }
1336
1337 /*** Perl treats '-' here as a data character, so PCRE had better
1338 do the same ... cut out this diagnosis.
1339
1340 if (c == '-')
1341 {
1342 *errorptr = "unexpected '-' in character class";
1343 goto FAILED;
1344 }
1345 ... ***/
1346
1347 /* Backslash may introduce a single character, or it may introduce one
1348 of the specials, which just set a flag. Escaped items are checked for
1349 validity in the pre-compiling pass. The sequence \b is a special case.
1350 Inside a class (and only there) it is treated as backslash. Elsewhere
1351 it marks a word boundary. */
1352
1353 if (c == '\\')
1354 {
1355 uschar *save_ptr = ptr+1;
1356 c = check_escape(&ptr, errorptr);
1357 if (c < 0)
1358 {
1359 switch (-c)
1360 {
1361 case ESC_d: flags |= CLASS_DIGITS; continue;
1362 case ESC_D: flags |= CLASS_NOT_DIGITS; continue;
1363 case ESC_s: flags |= CLASS_WHITESPACE; continue;
1364 case ESC_S: flags |= CLASS_NOT_WHITESPACE; continue;
1365 case ESC_w: flags |= CLASS_WORD; continue;
1366 case ESC_W: flags |= CLASS_NOT_WORD; continue;
1367 default:
1368 ptr = save_ptr;
1369 c = *ptr;
1370 break;
1371
1372 case ESC_b: c = '\b'; /* Treat as single character */
1373 break;
1374 }
1375 }
1376 /* Fall through if single character */
1377 }
1378
1379 /* A single character may be followed by '-' to form a range. However,
1380 Perl does not permit ']' to be the end of the range. A '-' character
1381 here is treated as a literal. */
1382
1383 if (ptr[1] == '-' && ptr[2] != ']')
1384 {
1385 int d;
1386 ptr += 2;
1387 d = *ptr;
1388
1389 if (d == 0)
1390 {
1391 *errorptr = "incomplete range";
1392 goto FAILED;
1393 }
1394
1395 /* The second part of a range can be a single-character escape, but
1396 not any of the other escapes. */
1397
1398 if (d == '\\')
1399 {
1400 d = check_escape(&ptr, errorptr);
1401 if (d < 0)
1402 {
1403 if (d == -ESC_b) d = '\b'; else
1404 {
1405 *errorptr = "invalid range";
1406 goto FAILED;
1407 }
1408 }
1409 }
1410
1411 if (d < c)
1412 {
1413 *errorptr = "range out of order";
1414 goto FAILED;
1415 }
1416
1417 if (rangecount >= 255)
1418 {
1419 *errorptr = "too many ranges inside []";
1420 goto FAILED;
1421 }
1422
1423 rangecount++;
1424 *code++ = c;
1425 *code++ = d;
1426 continue;
1427 }
1428
1429 /* Handle a lone single character: save it up for outputting at the
1430 end. Be paranoid and check that the buffer isn't going to overflow. */
1431
1432 if (singles_count >= 255)
1433 {
1434 *errorptr = "too many characters inside []";
1435 goto FAILED;
1436 }
1437 singles[singles_count++] = c;
1438 }
1439
1440 /* Loop until ']' reached; the check for end of string happens inside the
1441 loop. This "while" is the end of the "do" above. */
1442
1443 while ((c = *(++ptr)) != ']');
1444
1445 /* Copy saved single characters, which follow the ranges in the output. */
1446
1447 c = 0;
1448 while (c < singles_count) *code++ = singles[c++];
1449
1450 /* Finally fill in the flags and counts of ranges and single characters,
1451 and advance the pointer past the ]. */
1452
1453 previous[1] = flags;
1454 previous[2] = rangecount;
1455 previous[3] = singles_count;
1456 }
1457 break;
1458
1459 /* Various kinds of repeat */
1460
1461 case '{':
1462 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
1463 if (*errorptr != NULL) goto FAILED;
1464 goto REPEAT;
1465
1466 case '*':
1467 repeat_min = 0;
1468 repeat_max = -1;
1469 goto REPEAT;
1470
1471 case '+':
1472 repeat_min = 1;
1473 repeat_max = -1;
1474 goto REPEAT;
1475
1476 case '?':
1477 repeat_min = 0;
1478 repeat_max = 1;
1479
1480 REPEAT:
1481 if (previous == NULL)
1482 {
1483 *errorptr = "nothing to repeat";
1484 goto FAILED;
1485 }
1486
1487 /* If the next character is '?' this is a minimizing repeat. Advance to the
1488 next character. */
1489
1490 if (ptr[1] == '?') { repeat_type = 1; ptr++; } else repeat_type = 0;
1491
1492 /* If previous was a string of characters, chop off the last one and use it
1493 as the subject of the repeat. If there was only one character, we can
1494 abolish the previous item altogether. */
1495
1496 if (*previous == OP_CHARS)
1497 {
1498 int len = previous[1];
1499 if (len == 1)
1500 {
1501 c = previous[2];
1502 code = previous;
1503 }
1504 else
1505 {
1506 c = previous[len+1];
1507 previous[1]--;
1508 code--;
1509 }
1510 op_type = 0; /* Use single-char op codes */
1511 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
1512 }
1513
1514 /* If previous was a character type match (\d or similar), abolish it and
1515 create a suitable repeat item. The code is shared with single-character
1516 repeats by adding a suitable offset into repeat_type. */
1517
1518 if ((int)*previous < OP_EOD || *previous == OP_ANY)
1519 {
1520 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
1521 c = *previous;
1522 code = previous;
1523
1524 OUTPUT_SINGLE_REPEAT:
1525 repeat_type += op_type; /* Combine both values for many cases */
1526
1527 /* A minimum of zero is handled either as the special case * or ?, or as
1528 an UPTO, with the maximum given. */
1529
1530 if (repeat_min == 0)
1531 {
1532 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
1533 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
1534 else
1535 {
1536 *code++ = OP_UPTO + repeat_type;
1537 *code++ = repeat_max >> 8;
1538 *code++ = (repeat_max & 255);
1539 }
1540 }
1541
1542 /* The case {1,} is handled as the special case + */
1543
1544 else if (repeat_min == 1 && repeat_max == -1)
1545 *code++ = OP_PLUS + repeat_type;
1546
1547 /* The case {n,n} is just an EXACT, while the general case {n,m} is
1548 handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
1549
1550 else
1551 {
1552 if (repeat_min != 1)
1553 {
1554 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
1555 *code++ = repeat_min >> 8;
1556 *code++ = (repeat_min & 255);
1557 }
1558
1559 /* If the mininum is 1 and the previous item was a character string,
1560 we either have to put back the item that got cancelled if the string
1561 length was 1, or add the character back onto the end of a longer
1562 string. For a character type nothing need be done; it will just get put
1563 back naturally. */
1564
1565 else if (*previous == OP_CHARS)
1566 {
1567 if (code == previous) code += 2; else previous[1]++;
1568 }
1569
1570 /* Insert an UPTO if the max is greater than the min. */
1571
1572 if (repeat_max != repeat_min)
1573 {
1574 *code++ = c;
1575 repeat_max -= repeat_min;
1576 *code++ = OP_UPTO + repeat_type;
1577 *code++ = repeat_max >> 8;
1578 *code++ = (repeat_max & 255);
1579 }
1580 }
1581
1582 /* The character or character type itself comes last in all cases. */
1583
1584 *code++ = c;
1585 }
1586
1587 /* If previous was a character class or a back reference, we put the repeat
1588 stuff after it. */
1589
1590 else if (*previous == OP_CLASS || *previous == OP_NEGCLASS ||
1591 *previous == OP_REF)
1592 {
1593 if (repeat_min == 0 && repeat_max == -1)
1594 *code++ = OP_CRSTAR + repeat_type;
1595 else if (repeat_min == 1 && repeat_max == -1)
1596 *code++ = OP_CRPLUS + repeat_type;
1597 else if (repeat_min == 0 && repeat_max == 1)
1598 *code++ = OP_CRQUERY + repeat_type;
1599 else
1600 {
1601 *code++ = OP_CRRANGE + repeat_type;
1602 *code++ = repeat_min >> 8;
1603 *code++ = repeat_min & 255;
1604 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
1605 *code++ = repeat_max >> 8;
1606 *code++ = repeat_max & 255;
1607 }
1608 }
1609
1610 /* If previous was a bracket group, we may have to replicate it in certain
1611 cases. If the maximum repeat count is unlimited, check that the bracket
1612 group cannot match the empty string, and diagnose an error if it can. */
1613
1614 else if ((int)*previous >= OP_BRA)
1615 {
1616 int i;
1617 int length = code - previous;
1618
1619 if (repeat_max == -1 && could_be_empty(previous))
1620 {
1621 *errorptr = "operand of unlimited repeat could match the empty string";
1622 goto FAILED;
1623 }
1624
1625 /* If the minimum is greater than zero, and the maximum is unlimited or
1626 equal to the minimum, the first copy remains where it is, and is
1627 replicated up to the minimum number of times. This case includes the +
1628 repeat, but of course no replication is needed in that case. */
1629
1630 if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))
1631 {
1632 for (i = 1; i < repeat_min; i++)
1633 {
1634 memcpy(code, previous, length);
1635 code += length;
1636 }
1637 }
1638
1639 /* If the minimum is zero, stick BRAZERO in front of the first copy.
1640 Then, if there is a fixed upper limit, replicated up to that many times,
1641 sticking BRAZERO in front of all the optional ones. */
1642
1643 else
1644 {
1645 if (repeat_min == 0)
1646 {
1647 memmove(previous+1, previous, length);
1648 code++;
1649 *previous++ = OP_BRAZERO + repeat_type;
1650 }
1651
1652 for (i = 1; i < repeat_min; i++)
1653 {
1654 memcpy(code, previous, length);
1655 code += length;
1656 }
1657
1658 for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)
1659 {
1660 *code++ = OP_BRAZERO + repeat_type;
1661 memcpy(code, previous, length);
1662 code += length;
1663 }
1664 }
1665
1666 /* If the maximum is unlimited, set a repeater in the final copy. */
1667
1668 if (repeat_max == -1) code[-3] = OP_KETRMAX + repeat_type;
1669 }
1670
1671 /* Else there's some kind of shambles */
1672
1673 else
1674 {
1675 *errorptr = "internal error 1 (unexpected repeat)";
1676 goto FAILED;
1677 }
1678
1679 /* In all case we no longer have a previous item. */
1680
1681 previous = NULL;
1682 break;
1683
1684
1685 /* Start of nested bracket sub-expression, or comment or lookahead.
1686 First deal with special things that can come after a bracket; all are
1687 introduced by ?, and the appearance of any of them means that this is not a
1688 referencing group. They were checked for validity in the first pass over
1689 the string, so we don't have to check for syntax errors here. */
1690
1691 case '(':
1692 previous = code; /* Only real brackets can be repeated */
1693 if (*(++ptr) == '?')
1694 {
1695 bravalue = OP_BRA;
1696
1697 switch (*(++ptr))
1698 {
1699 case '#':
1700 case 'i':
1701 case 'm':
1702 case 's':
1703 case 'x':
1704 ptr++;
1705 while (*ptr != ')') ptr++;
1706 previous = NULL;
1707 continue;
1708
1709 case ':': /* Non-extracting bracket */
1710 ptr++;
1711 break;
1712
1713 case '=': /* Assertions can't be repeated */
1714 bravalue = OP_ASSERT;
1715 ptr++;
1716 previous = NULL;
1717 break;
1718
1719 case '!':
1720 bravalue = OP_ASSERT_NOT;
1721 ptr++;
1722 previous = NULL;
1723 break;
1724
1725 case ('P'):
1726 ptr++;
1727 if (*ptr=='<')
1728 {
1729 /* (?P<groupname>...) */
1730 int idlen;
1731 PyObject *string, *intobj;
1732
1733 ptr++;
1734 idlen = get_group_id(ptr, '>', errorptr);
1735 if (*errorptr) {
1736 goto FAILED;
1737 }
Guido van Rossum57ba4f31997-12-02 20:40:28 +00001738 string = PyString_FromStringAndSize((char*)ptr, idlen);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001739 intobj = PyInt_FromLong( brackets[0] );
1740 if (intobj == NULL || string==NULL)
1741 {
1742 Py_XDECREF(string);
1743 Py_XDECREF(intobj);
1744 *errorptr = "exception raised";
1745 goto FAILED;
1746 }
1747 PyDict_SetItem(dictionary, string, intobj);
1748 Py_DECREF(string); Py_DECREF(intobj);
1749 ptr += idlen+1; /* Point to rest of expression */
1750 goto do_grouping_bracket;
1751 }
1752 if (*ptr=='=')
1753 {
1754 /* (?P=groupname) */
1755 int idlen, refnum;
1756 PyObject *string, *intobj;
1757
1758 ptr++;
1759 idlen = get_group_id(ptr, ')', errorptr);
1760 if (*errorptr) {
1761 goto FAILED;
1762 }
Guido van Rossum57ba4f31997-12-02 20:40:28 +00001763 string = PyString_FromStringAndSize((char*)ptr, idlen);
Guido van Rossumc3861071997-10-08 02:07:40 +00001764 if (string==NULL) {
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001765 Py_XDECREF(string);
1766 *errorptr = "exception raised";
1767 goto FAILED;
1768 }
1769 intobj = PyDict_GetItem(dictionary, string);
1770 if (intobj==NULL) {
Guido van Rossumc3861071997-10-08 02:07:40 +00001771 Py_DECREF(string);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001772 *errorptr = "?P= group identifier isn't defined";
1773 goto FAILED;
1774 }
1775
1776 refnum = PyInt_AsLong(intobj);
Guido van Rossumc3861071997-10-08 02:07:40 +00001777 Py_DECREF(string); Py_DECREF(intobj);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001778 *code++ = OP_REF;
1779 *code++ = refnum;
1780 /* The continue will cause the top-level for() loop to
1781 be resumed, so ptr will be immediately incremented.
1782 Therefore, the following line adds just idlen, not
1783 idlen+1 */
1784 ptr += idlen;
1785 continue;
1786 }
1787 /* The character after ?P is neither < nor =, so
1788 report an error. Add more Python-extensions here. */
1789 *errorptr="unknown after (?P";
1790 goto FAILED;
1791 break;
1792 default:
1793 *errorptr = "unknown after (?";
1794 goto FAILED;
1795 }
1796 }
1797
1798 /* Else we have a referencing group */
1799
1800 else
1801 {
1802 do_grouping_bracket:
1803 if (brackets[0] > EXTRACT_MAX)
1804 {
1805 *errorptr = "too many extraction brackets";
1806 goto FAILED;
1807 }
1808 brackets[1] = brackets[0];
1809 bravalue = OP_BRA + brackets[0]++;
1810 }
1811
1812 /* Process nested bracketed re; at end pointer is on the bracket. We copy
1813 code into a non-register variable in order to be able to pass its address
1814 because some compilers complain otherwise. */
1815
1816 *code = bravalue;
1817 {
1818 uschar *mcode = code;
1819 if (!compile_regexp(extended, brackets, &mcode, &ptr, errorptr, dictionary))
1820 goto FAILED;
1821 code = mcode;
1822 }
1823
1824 if (*ptr != ')')
1825 {
1826 *errorptr = "missing )";
1827 goto FAILED;
1828 }
1829 break;
1830
1831 /* Check \ for being a real metacharacter; if not, fall through and handle
1832 it as a data character at the start of a string. Escape items are checked
1833 for validity in the pre-compiling pass. */
1834
1835 case '\\':
1836 oldptr = ptr;
1837 c = check_escape(&ptr, errorptr);
1838
1839 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1840 are arranged to be the negation of the corresponding OP_values. For the
1841 back references, the values are ESC_REF plus the reference number. Only
1842 back references and those types that consume a character may be repeated.
1843 We can test for values between ESC_b and ESC_Z for the latter; this may
1844 have to change if any new ones are ever created. */
1845
1846 if (c < 0)
1847 {
1848 if (-c >= ESC_REF)
1849 {
1850 int refnum = -c -ESC_REF;
1851 if (brackets[1] < refnum ) {
1852 *errorptr = "backreference to non-existent group";
1853 goto FAILED;
1854 }
1855 previous = code;
1856 *code++ = OP_REF;
1857 *code++ = refnum;
1858 }
1859 else
1860 {
1861 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
1862 *code++ = -c;
1863 }
1864 continue;
1865 }
1866
1867 /* Reset and fall through */
1868
1869 ptr = oldptr;
1870 c = '\\';
1871
1872 /* Handle a run of data characters until a metacharacter is encountered.
1873 The first character is guaranteed not to be whitespace or # when the
1874 extended flag is set. */
1875
1876 default:
1877 previous = code;
1878 *code = OP_CHARS;
1879 code += 2;
1880 length = 0;
1881
1882 do
1883 {
1884 if (extended)
1885 {
1886 if ((pcre_ctypes[c] & ctype_space) != 0) continue;
1887 if (c == '#')
1888 {
1889 while ((c = *(++ptr)) != 0 && c != '\n');
1890 if (c == 0) break;
1891 continue;
1892 }
1893 }
1894
1895 /* Backslash may introduce a data char or a metacharacter. Escaped items
1896 are checked for validity in the pre-compiling pass. Stop the string
1897 before a metaitem. */
1898
1899 if (c == '\\')
1900 {
1901 oldptr = ptr;
1902 c = check_escape(&ptr, errorptr);
1903 if (c < 0) { ptr = oldptr; break; }
1904 }
1905
1906 /* Ordinary character or single-char escape */
1907
1908 *code++ = c;
1909 length++;
1910 }
1911
1912 /* This "while" is the end of the "do" above. */
1913
1914 while (length < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
1915
1916 /* Compute the length and set it in the data vector, and advance to
1917 the next state. */
1918
1919 previous[1] = length;
1920 ptr--;
1921 break;
1922 }
1923 } /* end of big loop */
1924
1925/* Control never reaches here by falling through, only by a goto for all the
1926error states. Pass back the position in the pattern so that it can be displayed
1927to the user for diagnosing the error. */
1928
1929FAILED:
1930*ptrptr = ptr;
1931return FALSE;
1932}
1933
1934
1935
1936
1937/*************************************************
1938* Compile sequence of alternatives *
1939*************************************************/
1940
1941/* On entry, ptr is pointing past the bracket character, but on return
1942it points to the closing bracket, or vertical bar, or end of string.
1943The code variable is pointing at the byte into which the BRA operator has been
1944stored.
1945
1946Argument:
1947 extended TRUE if PCRE_EXTENDED was set
1948 brackets -> 2-element vector containing next and top bracket numbers
1949 codeptr -> the address of the current code pointer
1950 ptrptr -> the address of the current pattern pointer
1951 errorptr -> pointer to error message
1952
1953Returns: TRUE on success
1954*/
1955
1956static BOOL
1957compile_regexp(BOOL extended, int *brackets, uschar **codeptr,
1958 uschar **ptrptr, char **errorptr, PyObject *dictionary)
1959{
1960uschar *ptr = *ptrptr;
1961uschar *code = *codeptr;
1962uschar *start_bracket = code;
1963
1964for (;;)
1965 {
1966 int length;
1967 uschar *last_branch = code;
1968
1969 code += 3;
1970 if (!compile_branch(extended, brackets, &code, &ptr, errorptr, dictionary))
1971 {
1972 *ptrptr = ptr;
1973 return FALSE;
1974 }
1975
1976 /* Fill in the length of the last branch */
1977
1978 length = code - last_branch;
1979 last_branch[1] = length >> 8;
1980 last_branch[2] = length & 255;
1981
1982 /* Reached end of expression, either ')' or end of pattern. Insert a
1983 terminating ket and the length of the whole bracketed item, and return,
1984 leaving the pointer at the terminating char. */
1985
1986 if (*ptr != '|')
1987 {
1988 length = code - start_bracket;
1989 *code++ = OP_KET;
1990 *code++ = length >> 8;
1991 *code++ = length & 255;
1992 *codeptr = code;
1993 *ptrptr = ptr;
1994 return TRUE;
1995 }
1996
1997 /* Another branch follows; insert an "or" node and advance the pointer. */
1998
1999 *code = OP_ALT;
2000 ptr++;
2001 }
2002/* Control never reaches here */
2003}
2004
2005
2006
2007/*************************************************
2008* Check for anchored expression *
2009*************************************************/
2010
2011/* Try to find out if this is an anchored regular expression. Consider each
2012alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
2013all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
2014it's anchored. However, if this is a multiline pattern, then only OP_SOD
2015counts, since OP_CIRC can match in the middle.
2016
2017A branch is also implicitly anchored if it starts with .* because that will try
2018the rest of the pattern at all possible matching points, so there is no point
2019trying them again.
2020
2021Argument: points to start of expression (the bracket)
2022Returns: TRUE or FALSE
2023*/
2024
2025static BOOL
2026is_anchored(register uschar *code, BOOL multiline)
2027{
2028do {
2029 int op = (int)code[3];
2030 if (op >= OP_BRA || op == OP_ASSERT)
2031 { if (!is_anchored(code+3, multiline)) return FALSE; }
2032 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
2033 { if (code[4] != OP_ANY) return FALSE; }
2034 else if (op != OP_SOD && (multiline || op != OP_CIRC)) return FALSE;
2035 code += (code[1] << 8) + code[2];
2036 }
2037while (*code == OP_ALT);
2038return TRUE;
2039}
2040
2041
2042
2043/*************************************************
2044* Check for start with \n line expression *
2045*************************************************/
2046
2047/* This is called for multiline expressions to try to find out if every branch
2048starts with ^ so that "first char" processing can be done to speed things up.
2049
2050Argument: points to start of expression (the bracket)
2051Returns: TRUE or FALSE
2052*/
2053
2054static BOOL
2055is_startline(uschar *code)
2056{
2057do {
2058 if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
2059 { if (!is_startline(code+3)) return FALSE; }
2060 else if (code[3] != OP_CIRC) return FALSE;
2061 code += (code[1] << 8) + code[2];
2062 }
2063while (*code == OP_ALT);
2064return TRUE;
2065}
2066
2067
2068
2069/*************************************************
2070* Check for fixed first char *
2071*************************************************/
2072
2073/* Try to find out if there is a fixed first character. This is called for
2074unanchored expressions, as it speeds up their processing quite considerably.
2075Consider each alternative branch. If they all start with the same char, or with
2076a bracket all of whose alternatives start with the same char (recurse ad lib),
2077then we return that char, otherwise -1.
2078
2079Argument: points to start of expression (the bracket)
2080Returns: -1 or the fixed first char
2081*/
2082
2083static int
2084find_firstchar(uschar *code)
2085{
2086register int c = -1;
2087do
2088 {
2089 register int charoffset = 4;
2090
2091 if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
2092 {
2093 register int d;
2094 if ((d = find_firstchar(code+3)) < 0) return -1;
2095 if (c < 0) c = d; else if (c != d) return -1;
2096 }
2097
2098 else switch(code[3])
2099 {
2100 default:
2101 return -1;
2102
2103 case OP_EXACT: /* Fall through */
2104 charoffset++;
2105
2106 case OP_CHARS: /* Fall through */
2107 charoffset++;
2108
2109 case OP_PLUS:
2110 case OP_MINPLUS:
2111 if (c < 0) c = code[charoffset]; else if (c != code[charoffset]) return -1;
2112 break;
2113 }
2114 code += (code[1] << 8) + code[2];
2115 }
2116while (*code == OP_ALT);
2117return c;
2118}
2119
2120
2121
2122/*************************************************
2123* Compile a Regular Expression *
2124*************************************************/
2125
2126/* This function takes a string and returns a pointer to a block of store
2127holding a compiled version of the expression.
2128
2129Arguments:
2130 pattern the regular expression
2131 options various option bits
2132 errorptr pointer to pointer to error text
2133 erroroffset ptr offset in pattern where error was detected
2134
2135Returns: pointer to compiled data block, or NULL on error,
2136 with errorptr and erroroffset set
2137*/
2138
2139pcre *
2140pcre_compile(char *pattern, int options, char **errorptr, int
2141 *erroroffset, PyObject *dictionary)
2142{
2143real_pcre *re;
2144int spaces = 0;
2145int length = 3; /* For initial BRA plus length */
2146int runlength;
2147int c, size;
2148int brackets[2];
2149int brastack[200];
2150int brastackptr = 0;
2151BOOL extended = (options & PCRE_EXTENDED) != 0;
2152uschar *code, *ptr;
2153
2154#ifdef DEBUG
2155uschar *code_base, *code_end;
2156#endif
2157
2158/* Miscellaneous initialization; the copy the error pointers into static
2159variables so all functions can access them. */
2160
2161brackets[0] = 1; /* Next bracket number */
2162brackets[1] = 0; /* Highest used bracket number */
2163
2164*errorptr = NULL;
2165*erroroffset = 0;
2166
2167if ((options & ~PUBLIC_OPTIONS) != 0)
2168 {
2169 *errorptr = "unknown option bit(s) set";
2170 return NULL;
2171 }
2172
2173#ifdef DEBUG
2174printf("------------------------------------------------------------------\n");
2175printf("%s\n", pattern);
2176#endif
2177
2178/* The first thing to do is to make a pass over the pattern to compute the
2179amount of store required to hold the compiled code. This does not have to be
2180perfect as long as errors are overestimates. At the same time we can detect any
2181internal flag settings. Make an attempt to correct for any counted white space
2182if an "extended" flag setting appears late in the pattern. We can't be so
2183clever for #-comments. */
2184
2185ptr = (uschar *)(pattern - 1);
2186while ((c = *(++ptr)) != 0)
2187 {
2188 int min, max;
2189
2190 if ((pcre_ctypes[c] & ctype_space) != 0)
2191 {
2192 if (extended) continue;
2193 spaces++;
2194 }
2195
2196 if (extended && c == '#')
2197 {
2198 while ((c = *(++ptr)) != 0 && c != '\n');
2199 continue;
2200 }
2201
2202 switch(c)
2203 {
2204 /* A backslashed item may be an escaped "normal" character or a
2205 character type. For a "normal" character, put the pointers and
2206 character back so that tests for whitespace etc. in the input
2207 are done correctly. */
2208
2209 case '\\':
2210 {
2211 uschar *save_ptr = ptr;
2212 c = check_escape(&ptr, errorptr);
2213 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2214 if (c >= 0)
2215 {
2216 ptr = save_ptr;
2217 c = '\\';
2218 goto NORMAL_CHAR;
2219 }
2220 }
2221 length++;
2222
2223 /* A back reference needs an additional char, plus either one or 5
2224 bytes for a repeat. */
2225
2226 if (c <= -ESC_REF)
2227 {
2228 length++; /* For single back reference */
2229 if (ptr[1] == '{')
2230 {
2231 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
2232 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2233 if ((min == 0 && (max == 1 || max == -1)) ||
2234 (min == 1 && max == -1))
2235 length++;
2236 else length += 5;
2237 if (ptr[1] == '?') ptr++;
2238 }
2239 }
2240 continue;
2241
2242 case '^':
2243 case '.':
2244 case '$':
2245 case '*': /* These repeats won't be after brackets; */
2246 case '+': /* those are handled separately */
2247 case '?':
2248 length++;
2249 continue;
2250
2251 /* This covers the cases of repeats after a single char, metachar, class,
2252 or back reference. */
2253
2254 case '{':
2255 ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);
2256 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2257 if ((min == 0 && (max == 1 || max == -1)) ||
2258 (min == 1 && max == -1))
2259 length++;
2260 else
2261 {
2262 length--; /* Uncount the original char or metachar */
2263 if (min == 1) length++; else if (min > 0) length += 4;
2264 if (max > 0) length += 4; else length += 2;
2265 }
2266 if (ptr[1] == '?') ptr++;
2267 continue;
2268
2269 /* An alternation contains an offset to the next branch or ket. */
2270 case '|':
2271 length += 3;
2272 continue;
2273
2274 /* A character class uses 4 characters plus the characters in it. Don't
2275 worry about character types that aren't allowed in classes - they'll get
2276 picked up during the compile. */
2277
2278 case '[':
2279 length += 4;
2280 if (ptr[1] == '^') ptr++;
2281 do
2282 {
2283 if (*(++ptr) == '\\')
2284 {
2285 (void)check_escape(&ptr, errorptr);
2286 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2287 }
2288 length++;
2289 }
2290 while (*ptr != 0 && *ptr != ']');
2291
2292 /* A repeat needs either 1 or 5 bytes. */
2293
2294 if (ptr[1] == '{')
2295 {
2296 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
2297 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2298 if ((min == 0 && (max == 1 || max == -1)) ||
2299 (min == 1 && max == -1))
2300 length++;
2301 else length += 5;
2302 if (ptr[1] == '?') ptr++;
2303 }
2304 continue;
2305
2306 /* Brackets may be genuine groups or special things */
2307
2308 case '(':
2309
2310 /* Handle special forms of bracket, which all start (? */
2311
2312 if (ptr[1] == '?') switch (c = ptr[2])
2313 {
2314 /* Skip over comments entirely */
2315 case '#':
2316 ptr += 3;
2317 while (*ptr != 0 && *ptr != ')') ptr++;
2318 if (*ptr == 0)
2319 {
2320 *errorptr = "missing ) after comment";
2321 goto PCRE_ERROR_RETURN;
2322 }
2323 continue;
2324
2325 /* Non-referencing groups and lookaheads just move the pointer on, and
2326 then behave like a non-special bracket. */
2327
2328 case ':':
2329 case '=':
2330 case '!':
2331 ptr += 2;
2332 break;
2333
2334 /* Else loop setting valid options until ) is met. Anything else is an
2335 error. */
2336
2337 case ('P'):
2338 {
2339 int idlen;
2340 switch (*ptr++) {
2341 case ('<'):
2342 idlen = get_group_id(ptr++, '>', errorptr);
2343 if (*errorptr) goto PCRE_ERROR_RETURN;
2344 ptr += idlen+1;
2345 break;
2346 case ('='):
2347 idlen = get_group_id(ptr++, ')', errorptr);
2348 if (*errorptr) goto PCRE_ERROR_RETURN;
2349 ptr += idlen+1;
2350 length++;
2351 break;
2352 }
2353 }
2354 break;
2355
2356 default:
2357 ptr += 2;
2358 for (;; ptr++)
2359 {
2360 if ((c = *ptr) == 'i')
2361 {
2362 options |= PCRE_CASELESS;
2363 continue;
2364 }
2365 else if ((c = *ptr) == 'm')
2366 {
2367 options |= PCRE_MULTILINE;
2368 continue;
2369 }
2370 else if ((c = *ptr) == 's')
2371 {
2372 options |= PCRE_DOTALL;
2373 continue;
2374 }
2375 else if (c == 'x')
2376 {
2377 options |= PCRE_EXTENDED;
2378 extended = TRUE;
2379 length -= spaces; /* Already counted spaces */
2380 continue;
2381 }
2382 else if (c == ')') break;
2383
2384 *errorptr = "undefined after (?";
2385 goto PCRE_ERROR_RETURN;
2386 }
2387 continue; /* End of this bracket handling */
2388 }
2389
2390 /* Non-special forms of bracket. Save length for computing whole length
2391 at end if there's a repeat that requires duplication of the group. */
2392
2393 if (brastackptr >= sizeof(brastack)/sizeof(int))
2394 {
2395 *errorptr = "too many brackets";
2396 goto PCRE_ERROR_RETURN;
2397 }
2398
2399 brastack[brastackptr++] = length;
2400 length += 3;
2401 continue;
2402
2403 /* Handle ket. Look for subsequent max/min; for certain sets of values we
2404 have to replicate this bracket up to that many times. */
2405
2406 case ')':
2407 length += 3;
2408 {
2409 int min = 1;
2410 int max = 1;
2411 int duplength = length - brastack[--brastackptr];
2412
2413 /* Leave ptr at the final char; for read_repeat_counts this happens
2414 automatically; for the others we need an increment. */
2415
2416 if ((c = ptr[1]) == '{')
2417 {
2418 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
2419 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2420 }
2421 else if (c == '*') { min = 0; max = -1; ptr++; }
2422 else if (c == '+') { max = -1; ptr++; }
2423 else if (c == '?') { min = 0; ptr++; }
2424
2425 /* If there is a minimum > 1 we have to replicate up to min-1 times; if
2426 there is a limited maximum we have to replicate up to max-1 times and
2427 allow for a BRAZERO item before each optional copy, as we also have to
2428 do before the first copy if the minimum is zero. */
2429
2430 if (min == 0) length++;
2431 else if (min > 1) length += (min - 1) * duplength;
2432 if (max > min) length += (max - min) * (duplength + 1);
2433 }
2434
2435 continue;
2436
2437 /* Non-special character. For a run of such characters the length required
2438 is the number of characters + 2, except that the maximum run length is 255.
2439 We won't get a skipped space or a non-data escape or the start of a #
2440 comment as the first character, so the length can't be zero. */
2441
2442 NORMAL_CHAR:
2443 default:
2444 length += 2;
2445 runlength = 0;
2446 do
2447 {
2448 if ((pcre_ctypes[c] & ctype_space) != 0)
2449 {
2450 if (extended) continue;
2451 spaces++;
2452 }
2453
2454 if (extended && c == '#')
2455 {
2456 while ((c = *(++ptr)) != 0 && c != '\n');
2457 continue;
2458 }
2459
2460 /* Backslash may introduce a data char or a metacharacter; stop the
2461 string before the latter. */
2462
2463 if (c == '\\')
2464 {
2465 uschar *saveptr = ptr;
2466 c = check_escape(&ptr, errorptr);
2467 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2468 if (c < 0) { ptr = saveptr; break; }
2469 }
2470
2471 /* Ordinary character or single-char escape */
2472
2473 runlength++;
2474 }
2475
2476 /* This "while" is the end of the "do" above. */
2477
2478 while (runlength < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
2479
2480 ptr--;
2481 length += runlength;
2482 continue;
2483 }
2484 }
2485
2486length += 4; /* For final KET and END */
2487
2488if (length > 65539)
2489 {
2490 *errorptr = "regular expression too large";
2491 return NULL;
2492 }
2493
2494/* Compute the size of data block needed and get it, either from malloc or
2495externally provided function. Put in the magic number and the options. */
2496
2497size = length + sizeof(real_pcre) - sizeof(re->code);
2498re = (real_pcre *)(pcre_malloc)(size);
2499
2500if (re == NULL)
2501 {
2502 *errorptr = "failed to get memory";
2503 return NULL;
2504 }
2505
2506re->magic_number = MAGIC_NUMBER;
2507re->options = options;
2508
2509/* Set up a starting, non-extracting bracket, then compile the expression. On
2510error, *errorptr will be set non-NULL, so we don't need to look at the result
2511of the function here. */
2512
2513ptr = (uschar *)pattern;
2514code = re->code;
2515*code = OP_BRA;
2516(void)compile_regexp(extended, brackets, &code, &ptr, errorptr, dictionary);
2517re->top_bracket = brackets[1];
2518
2519/* If not reached end of pattern on success, there's an excess bracket. */
2520
2521if (*errorptr == NULL && *ptr != 0) *errorptr = "unmatched brackets";
2522/* Fill in the terminating state and check for disastrous overflow, but
2523if debugging, leave the test till after things are printed out. */
2524
2525*code++ = OP_END;
2526
2527#ifndef DEBUG
2528if (code - re->code > length) *errorptr = "internal error: code overflow";
2529#endif
2530
2531/* Failed to compile */
2532
2533if (*errorptr != NULL)
2534 {
2535 (pcre_free)(re);
2536 PCRE_ERROR_RETURN:
2537 *erroroffset = ptr - (uschar *)pattern;
2538 return NULL;
2539 }
2540
2541/* If the anchored option was not passed, set flag if we can determine that it
2542is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if
2543we can determine what the first character has to be, because that speeds up
2544unanchored matches no end. In the case of multiline matches, an alternative is
2545to set the PCRE_STARTLINE flag if all branches start with ^. */
2546
2547if ((options & PCRE_ANCHORED) == 0)
2548 {
2549 if (is_anchored(re->code, (options & PCRE_MULTILINE) != 0))
2550 re->options |= PCRE_ANCHORED;
2551 else
2552 {
2553 int c = find_firstchar(re->code);
2554 if (c >= 0)
2555 {
2556 re->first_char = c;
2557 re->options |= PCRE_FIRSTSET;
2558 }
2559 else if (is_startline(re->code))
2560 re->options |= PCRE_STARTLINE;
2561 }
2562 }
2563
2564/* Print out the compiled data for debugging */
2565
2566#ifdef DEBUG
2567
2568printf("Length = %d top_bracket = %d%s%s%s%s\n",
2569 length, re->top_bracket,
2570 ((re->options & PCRE_ANCHORED) != 0)? " anchored" : "",
2571 ((re->options & PCRE_CASELESS) != 0)? " caseless" : "",
2572 extended? " extended" : "",
2573 ((re->options & PCRE_MULTILINE) != 0)? " multiline" : "");
2574
2575if ((re->options & PCRE_FIRSTSET) != 0)
2576 {
2577 if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
2578 else printf("First char = \\x%02x\n", re->first_char);
2579 }
2580
2581code_end = code;
2582code_base = code = re->code;
2583
2584while (code < code_end)
2585 {
2586 int charlength;
2587
2588 printf("%3d ", code - code_base);
2589
2590 if (*code >= OP_BRA)
2591 {
2592 printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
2593 code += 2;
2594 }
2595
2596 else switch(*code)
2597 {
2598 case OP_CHARS:
2599 charlength = *(++code);
2600 printf("%3d ", charlength);
2601 while (charlength-- > 0)
2602 if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
2603 break;
2604
2605 case OP_KETRMAX:
2606 case OP_KETRMIN:
2607 case OP_ALT:
2608 case OP_KET:
2609 case OP_ASSERT:
2610 case OP_ASSERT_NOT:
2611 printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2612 code += 2;
2613 break;
2614
2615 case OP_STAR:
2616 case OP_MINSTAR:
2617 case OP_PLUS:
2618 case OP_MINPLUS:
2619 case OP_QUERY:
2620 case OP_MINQUERY:
2621 case OP_TYPESTAR:
2622 case OP_TYPEMINSTAR:
2623 case OP_TYPEPLUS:
2624 case OP_TYPEMINPLUS:
2625 case OP_TYPEQUERY:
2626 case OP_TYPEMINQUERY:
2627 if (*code >= OP_TYPESTAR)
2628 printf(" %s", OP_names[code[1]]);
2629 else if (isprint(c = code[1])) printf(" %c", c);
2630 else printf(" \\x%02x", c);
2631 printf("%s", OP_names[*code++]);
2632 break;
2633
2634 case OP_EXACT:
2635 case OP_UPTO:
2636 case OP_MINUPTO:
2637 if (isprint(c = code[3])) printf(" %c{", c);
2638 else printf(" \\x%02x{", c);
2639 if (*code != OP_EXACT) printf(",");
2640 printf("%d}", (code[1] << 8) + code[2]);
2641 if (*code == OP_MINUPTO) printf("?");
2642 code += 3;
2643 break;
2644
2645 case OP_TYPEEXACT:
2646 case OP_TYPEUPTO:
2647 case OP_TYPEMINUPTO:
2648 printf(" %s{", OP_names[code[3]]);
2649 if (*code != OP_TYPEEXACT) printf(",");
2650 printf("%d}", (code[1] << 8) + code[2]);
2651 if (*code == OP_TYPEMINUPTO) printf("?");
2652 code += 3;
2653 break;
2654
2655 case OP_REF:
2656 printf(" \\%d", *(++code));
2657 break;
2658
2659 case OP_CLASS:
2660 case OP_NEGCLASS:
2661 {
2662 int i, min, max;
2663 int flags = code[1];
2664 int rangecount = code[2];
2665 int charcount = code[3];
2666
2667 printf(" [%s", (*code == OP_CLASS)? "" : "^");
2668 code += 3;
2669
2670 for (i = 0; i < 8; i++)
2671 if ((flags & (1 << i)) != 0) printf("%s", class_names[i]);
2672
2673 for (i = 0; i < rangecount; i++)
2674 {
2675 if (isprint(*(++code))) printf("%c-", *code); else printf("\\x%02x-", *code);
2676 if (isprint(*(++code))) printf("%c", *code); else printf("\\x%02x", *code);
2677 }
2678
2679 for (i = 0; i < charcount; i++)
2680 {
2681 if (!isprint(*(++code))) printf("\\x%02x", *code);
2682 else if (strchr("-\\]", *code) != NULL) printf("\\%c", *code);
2683 else printf("%c", *code);
2684 }
2685 printf("]");
2686
2687 switch(*(++code))
2688 {
2689 case OP_CRSTAR:
2690 case OP_CRMINSTAR:
2691 case OP_CRPLUS:
2692 case OP_CRMINPLUS:
2693 case OP_CRQUERY:
2694 case OP_CRMINQUERY:
2695 printf("%s", OP_names[*code]);
2696 break;
2697
2698 case OP_CRRANGE:
2699 case OP_CRMINRANGE:
2700 min = (code[1] << 8) + code[2];
2701 max = (code[3] << 8) + code[4];
2702 if (max == 0) printf("{%d,}", min);
2703 else printf("{%d,%d}", min, max);
2704 if (*code == OP_CRMINRANGE) printf("?");
2705 code += 4;
2706 break;
2707
2708 default:
2709 code--;
2710 }
2711 }
2712 break;
2713
2714 /* Anything else is just a one-node item */
2715
2716 default:
2717 printf(" %s", OP_names[*code]);
2718 break;
2719 }
2720
2721 code++;
2722 printf("\n");
2723 }
2724printf("------------------------------------------------------------------\n");
2725
2726/* This check is done here in the debugging case so that the code that
2727was compiled can be seen. */
2728
2729if (code - re->code > length)
2730 {
2731 *errorptr = "internal error: code overflow";
2732 (pcre_free)(re);
2733 *erroroffset = ptr - (uschar *)pattern;
2734 return NULL;
2735 }
2736#endif
2737
2738return (pcre *)re;
2739}
2740
2741
2742
2743/*************************************************
2744* Match a character type *
2745*************************************************/
2746
2747/* Not used in all the places it might be as it's sometimes faster
2748to put the code inline.
2749
2750Arguments:
2751 type the character type
2752 c the character
2753 multiline the multiline flag
2754
2755Returns: TRUE if character is of the type
2756*/
2757
2758static BOOL
2759match_type(int type, int c, BOOL dotall)
2760{
2761
2762#ifdef DEBUG
2763if (isprint(c)) printf("matching subject %c against ", c);
2764 else printf("matching subject \\x%02x against ", c);
2765printf("%s\n", OP_names[type]);
2766#endif
2767
2768switch(type)
2769 {
2770 case OP_ANY: return dotall || c != '\n';
2771 case OP_NOT_DIGIT: return (pcre_ctypes[c] & ctype_digit) == 0;
2772 case OP_DIGIT: return (pcre_ctypes[c] & ctype_digit) != 0;
2773 case OP_NOT_WHITESPACE: return (pcre_ctypes[c] & ctype_space) == 0;
2774 case OP_WHITESPACE: return (pcre_ctypes[c] & ctype_space) != 0;
2775 case OP_NOT_WORDCHAR: return (pcre_ctypes[c] & ctype_word) == 0;
2776 case OP_WORDCHAR: return (pcre_ctypes[c] & ctype_word) != 0;
2777 }
2778return FALSE;
2779}
2780
2781
2782/*************************************************
2783* Match a character class *
2784*************************************************/
2785
2786/* Return "result" if char is in the class and "!result" otherwise.
2787
2788Arguments:
2789 data points to the class item
2790 c the subject character
2791 result value to return if in class
2792 md matching "static" data
2793
2794Returns: result or !result
2795*/
2796
2797static BOOL
2798match_class(register uschar *data, register int c, BOOL result, match_data *md)
2799{
2800int flags = data[1];
2801int i;
2802uschar *base = data;
2803uschar *end;
2804
2805#ifdef DEBUG
2806 {
2807 uschar *d = base + 3;
2808
2809 if (isprint(c))
2810 printf("match %c against [%s", c, result? "" : "^");
2811 else
2812 printf("match \\x%02x against [%s", c, result? "" : "^");
2813
2814 for (i = 0; i < 8; i++)
2815 if ((flags & (1 << i)) != 0) printf("%s", class_names[i]);
2816
2817 for (i = 0; i < data[2]; i++)
2818 {
2819 if (isprint(*(++d))) printf("%c-", *d); else printf("\\x%02x-", *d);
2820 if (isprint(*(++d))) printf("%c", *d); else printf("\\x%02x", *d);
2821 }
2822
2823 for (i = 0; i < data[3]; i++)
2824 {
2825 if (!isprint(*(++d))) printf("\\x%02x", *d);
2826 else if (strchr("-\\]", *d) != NULL) printf("\\%c", *d);
2827 else printf("%c", *d);
2828 }
2829 printf("]\n");
2830 }
2831#endif
2832
2833/* Test for any character types */
2834
2835for (i = 0; flags != 0; i++)
2836 {
2837 if ((flags & 1) != 0 && match_type(class_ops[i], c, md->dotall))
2838 return result;
2839 flags >>= 1;
2840 }
2841
2842/* Advance pointer to the specific chars and do the caseless or caseful testing
2843of the ranges and individual characters as necessary. */
2844
2845data += 4;
2846end = data + base[2] * 2;
2847
2848/* Caseless character ranges are slightly tricky, because of cases like [W-c].
2849What we do is to uppercase the subject char if it is beyond the end of the
2850range, or lowercase it if it is before the start of the range and try again if
2851a caseful comparison has failed. This works because upper case letters come
2852before lower case in ASCII code. It would not work in EBCDIC, for example,
2853where they are the other way round, but then ranges like [W-c] would be illegal
2854in EBCDIC. */
2855
2856if (md->caseless)
2857 {
2858 while (data < end)
2859 {
2860 register int d;
2861 if (c >= (int)*data && c <= (int)data[1]) return result;
2862 d = (c < (int)*data)? pcre_lcc[c] : pcre_ucc[c];
2863 if (d >= (int)*data && d <= (int)data[1]) return result;
2864 data += 2;
2865 }
2866 end += base[3];
2867 c = pcre_lcc[c];
2868 while (data < end) if (c == pcre_lcc[*data++]) return result;
2869 }
2870
2871/* Caseful is easy */
2872
2873else
2874 {
2875 while (data < end)
2876 {
2877 if (c >= (int)*data && c <= (int)data[1]) return result;
2878 data += 2;
2879 }
2880 end += base[3];
2881 while (data < end) if (c == *data++) return result;
2882 }
2883
2884/* Character is not in the class */
2885
2886return !result;
2887}
2888
2889
2890
2891/*************************************************
2892* Match a back-reference *
2893*************************************************/
2894
2895/* If a back reference hasn't been set, the match fails.
2896
2897Arguments:
2898 number reference number
2899 eptr points into the subject
2900 length length to be matched
2901 md points to match data block
2902
2903Returns: TRUE if matched
2904*/
2905
2906static BOOL
2907match_ref(int number, register uschar *eptr, int length, match_data *md)
2908{
2909uschar *p = md->start_subject + md->offset_vector[number];
2910
2911#ifdef DEBUG
2912if (eptr >= md->end_subject)
2913 printf("matching subject <null>");
2914else
2915 {
2916 printf("matching subject ");
2917 pchars(eptr, length, TRUE, md);
2918 }
2919printf(" against backref ");
2920pchars(p, length, FALSE, md);
2921printf("\n");
2922#endif
2923
2924/* Always fail if not enough characters left */
2925
2926if (length > md->end_subject - p) return FALSE;
2927
2928/* Separate the caselesss case for speed */
2929
2930if (md->caseless)
2931 { while (length-- > 0) if (pcre_lcc[*p++] != pcre_lcc[*eptr++]) return FALSE; }
2932else
2933 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
2934
2935return TRUE;
2936}
2937
2938static int free_stack(match_data *md)
2939{
2940/* Free any stack space that was allocated by the call to match(). */
2941if (md->off_num) free(md->off_num);
2942if (md->offset_top) free(md->offset_top);
2943if (md->r1) free(md->r1);
2944if (md->r2) free(md->r2);
2945if (md->eptr) free(md->eptr);
Guido van Rossumc3861071997-10-08 02:07:40 +00002946if (md->ecode) free(md->ecode);
2947return 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002948}
2949
2950static int grow_stack(match_data *md)
2951{
2952 md->length = md->length ? md->length+md->length/2 : 200;
2953 md->offset_top=realloc(md->offset_top, md->length*sizeof(int));
2954 md->eptr=realloc(md->eptr, md->length*sizeof(void *));
2955 md->ecode=realloc(md->ecode, md->length*sizeof(void *));
2956 md->off_num=realloc(md->off_num, md->length*sizeof(int));
2957 md->r1=realloc(md->r1, md->length*sizeof(int));
2958 md->r2=realloc(md->r2, md->length*sizeof(int));
2959 return 0;
2960}
2961
2962/*************************************************
2963* Match from current position *
2964*************************************************/
2965
2966/* On entry ecode points to the first opcode, and eptr to the first character.
2967
2968Arguments:
2969 eptr pointer in subject
2970 ecode position in code
2971 offset_top current top pointer
2972 md pointer to "static" info for the match
2973
2974Returns: TRUE if matched
2975*/
2976
2977static BOOL
2978match(register uschar *eptr, register uschar *ecode, int offset_top,
2979 match_data *md)
2980{
2981 int save_stack_position = md->point;
2982match_loop:
2983
2984#define SUCCEED goto succeed
2985#define FAIL goto fail
2986
2987for (;;)
2988 {
2989 int min, max, ctype;
2990 register int i;
2991 register int c;
Guido van Rossumc3861071997-10-08 02:07:40 +00002992 BOOL minimize = 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002993
2994 /* Opening bracket. Check the alternative branches in turn, failing if none
2995 match. We have to set the start offset if required and there is space
2996 in the offset vector so that it is available for subsequent back references
2997 if the bracket matches. However, if the bracket fails, we must put back the
2998 previous value of both offsets in case they were set by a previous copy of
2999 the same bracket. Don't worry about setting the flag for the error case here;
3000 that is handled in the code for KET. */
3001
3002 if ((int)*ecode >= OP_BRA)
3003 {
3004 int number = (*ecode - OP_BRA) << 1;
Guido van Rossumc3861071997-10-08 02:07:40 +00003005 int save_offset1 = 0, save_offset2 = 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003006
Guido van Rossum57ba4f31997-12-02 20:40:28 +00003007#ifdef DEBUG
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003008 printf("start bracket %d\n", number/2);
Guido van Rossum57ba4f31997-12-02 20:40:28 +00003009#endif
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003010
3011 if (number > 0 && number < md->offset_end)
3012 {
3013 save_offset1 = md->offset_vector[number];
3014 save_offset2 = md->offset_vector[number+1];
3015 md->offset_vector[number] = eptr - md->start_subject;
3016
Guido van Rossum57ba4f31997-12-02 20:40:28 +00003017#ifdef DEBUG
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003018 printf("saving %d %d\n", save_offset1, save_offset2);
Guido van Rossum57ba4f31997-12-02 20:40:28 +00003019#endif
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003020 }
3021
3022 /* Recurse for all the alternatives. */
3023
3024 do
3025 {
3026 if (match(eptr, ecode+3, offset_top, md)) SUCCEED;
3027 ecode += (ecode[1] << 8) + ecode[2];
3028 }
3029 while (*ecode == OP_ALT);
3030
Guido van Rossum57ba4f31997-12-02 20:40:28 +00003031#ifdef DEBUG
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003032 printf("bracket %d failed\n", number/2);
Guido van Rossum57ba4f31997-12-02 20:40:28 +00003033#endif
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003034
3035 if (number > 0 && number < md->offset_end)
3036 {
3037 md->offset_vector[number] = save_offset1;
3038 md->offset_vector[number+1] = save_offset2;
3039 }
3040
3041 FAIL;
3042 }
3043
3044 /* Other types of node can be handled by a switch */
3045
3046 switch(*ecode)
3047 {
3048 case OP_END:
3049 md->end_match_ptr = eptr; /* Record where we ended */
3050 md->end_offset_top = offset_top; /* and how many extracts were taken */
3051 SUCCEED;
3052
3053 /* Assertion brackets. Check the alternative branches in turn - the
3054 matching won't pass the KET for an assertion. If any one branch matches,
3055 the assertion is true. */
3056
3057 case OP_ASSERT:
3058 do
3059 {
3060 if (match(eptr, ecode+3, offset_top, md)) break;
3061 ecode += (ecode[1] << 8) + ecode[2];
3062 }
3063 while (*ecode == OP_ALT);
3064 if (*ecode == OP_KET) FAIL;
3065
3066 /* Continue from after the assertion, updating the offsets high water
3067 mark, since extracts may have been taken during the assertion. */
3068
3069 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3070 ecode += 3;
3071 offset_top = md->end_offset_top;
3072 continue;
3073
3074 /* Negative assertion: all branches must fail to match */
3075
3076 case OP_ASSERT_NOT:
3077 do
3078 {
3079 if (match(eptr, ecode+3, offset_top, md)) FAIL;
3080 ecode += (ecode[1] << 8) + ecode[2];
3081 }
3082 while (*ecode == OP_ALT);
3083 ecode += 3;
3084 continue;
3085
3086 /* An alternation is the end of a branch; scan along to find the end of the
3087 bracketed group and go to there. */
3088
3089 case OP_ALT:
3090 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3091 break;
3092
3093 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
3094 that it may occur zero times. It may repeat infinitely, or not at all -
3095 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
3096 repeat limits are compiled as a number of copies, with the optional ones
3097 preceded by BRAZERO or BRAMINZERO. */
3098
3099 case OP_BRAZERO:
3100 {
3101 uschar *next = ecode+1;
3102 if (match(eptr, next, offset_top, md)) SUCCEED;
3103 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3104 ecode = next + 3;
3105 }
3106 break;
3107
3108 case OP_BRAMINZERO:
3109 {
3110 uschar *next = ecode+1;
3111 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3112 if (match(eptr, next+3, offset_top, md)) SUCCEED;
3113 ecode++;
3114 }
3115 break;;
3116
3117 /* End of a group, repeated or non-repeating. If we are at the end of
3118 an assertion "group", stop matching and SUCCEED, but record the
3119 current high water mark for use by positive assertions. */
3120
3121 case OP_KET:
3122 case OP_KETRMIN:
3123 case OP_KETRMAX:
3124 {
3125 int number, start, end;
3126 uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
3127
3128 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT)
3129 {
3130 md->end_offset_top = offset_top;
3131 SUCCEED;
3132 }
3133
3134 /* In all other cases we have to check the group number back at the
3135 start and if necessary complete handling an extraction by setting the
3136 final offset and bumping the high water mark. */
3137
3138 number = (*prev - OP_BRA) << 1;
3139
Guido van Rossum57ba4f31997-12-02 20:40:28 +00003140#ifdef DEBUG
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003141 printf("end bracket %d\n", number/2);
Guido van Rossum57ba4f31997-12-02 20:40:28 +00003142#endif
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003143
3144 if (number > 0)
3145 {
3146 if (number >= md->offset_end) md->offset_overflow = TRUE; else
3147 {
3148 start=md->offset_vector[number] ; end =md->offset_vector[number+1];
3149 md->offset_vector[number+1] = eptr - md->start_subject;
3150 if (offset_top <= number) offset_top = number + 2;
3151 }
3152 }
3153
3154 /* For a non-repeating ket, just advance to the next node and continue at
3155 this level. */
3156
3157 if (*ecode == OP_KET)
3158 {
3159 ecode += 3;
3160 break;
3161 }
3162
3163 /* The repeating kets try the rest of the pattern or restart from the
3164 preceding bracket, in the appropriate order. */
3165
3166 if (*ecode == OP_KETRMIN)
3167 {
3168 uschar *ptr;
3169 if (match(eptr, ecode+3, offset_top, md)) goto succeed;
3170 /* Handle alternation inside the BRA...KET; push the additional
3171 alternatives onto the stack
3172 XXX this tries the alternatives backwards! */
3173 ptr=prev;
3174 do {
3175 ptr += (ptr[1]<<8)+ ptr[2];
3176 if (*ptr==OP_ALT)
3177 {
3178 if (md->length == md->point) grow_stack(md);
3179 md->offset_top[md->point] = offset_top;
3180 md->eptr[md->point] = eptr;
3181 md->ecode[md->point] = ptr+3;
3182 md->r1[md->point] = 0;
3183 md->r2[md->point] = 0;
3184 md->off_num[md->point] = 0;
3185 md->point++;
3186 }
3187 } while (*ptr==OP_ALT);
3188 ecode=prev+3; goto match_loop;
3189 }
3190 else /* OP_KETRMAX */
3191 {
3192 uschar *ptr;
3193 int points_pushed=0;
3194
3195 /* Push one failure point, that will resume matching at the code after
3196 the KETRMAX opcode. */
3197 if (md->length == md->point) grow_stack(md);
3198 md->offset_top[md->point] = offset_top;
3199 md->eptr[md->point] = eptr;
3200 md->ecode[md->point] = ecode+3;
3201 md->r1[md->point] = md->offset_vector[number];
3202 md->r2[md->point] = md->offset_vector[number+1];
3203 md->off_num[md->point] = number;
3204 md->point++;
3205
3206 md->offset_vector[number] = eptr - md->start_subject;
3207 /* Handle alternation inside the BRA...KET; push each of the
3208 additional alternatives onto the stack
3209 XXX this tries the alternatives backwards! */
3210 ptr=prev;
3211 do {
3212 ptr += (ptr[1]<<8)+ ptr[2];
3213 if (*ptr==OP_ALT)
3214 {
3215 if (md->length == md->point) grow_stack(md);
3216 md->offset_top[md->point] = offset_top;
3217 md->eptr[md->point] = eptr;
3218 md->ecode[md->point] = ptr+3;
3219 md->r1[md->point] = 0;
3220 md->r2[md->point] = 0;
3221 md->off_num[md->point] = 0;
3222 md->point++;
3223 points_pushed++;
3224 }
3225 } while (*ptr==OP_ALT);
3226 /* Jump to the first (or only) alternative and resume trying to match */
3227 ecode=prev+3; goto match_loop;
3228 }
3229 }
3230 FAIL;
3231
3232 /* Start of subject, or after internal newline if multiline */
3233
3234 case OP_CIRC:
3235 if (md->multiline)
3236 {
3237 if (eptr != md->start_subject && eptr[-1] != '\n') FAIL;
3238 ecode++;
3239 break;
3240 }
3241 /* ... else fall through */
3242
3243 /* Start of subject assertion */
3244
3245 case OP_SOD:
3246 if (eptr != md->start_subject) FAIL;
3247 ecode++;
3248 break;
3249
3250 /* End of subject, or before internal newline if multiline */
3251
3252 case OP_DOLL:
3253 if (md->multiline)
3254 {
3255 if (eptr < md->end_subject && *eptr != '\n') FAIL;
3256 ecode++;
3257 break;
3258 }
3259 /* ... else fall through */
3260
3261 /* End of subject assertion */
3262
3263 case OP_EOD:
3264 if (eptr < md->end_subject) FAIL;
3265 ecode++;
3266 break;
3267
3268 /* Word boundary assertions */
3269
3270 case OP_NOT_WORD_BOUNDARY:
3271 case OP_WORD_BOUNDARY:
3272 {
3273 BOOL prev_is_word = (eptr != md->start_subject) &&
3274 ((pcre_ctypes[eptr[-1]] & ctype_word) != 0);
3275 BOOL cur_is_word = (eptr < md->end_subject) &&
3276 ((pcre_ctypes[*eptr] & ctype_word) != 0);
3277 if ((*ecode++ == OP_WORD_BOUNDARY)?
3278 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3279 FAIL;
3280 }
3281 break;
3282
3283 /* Match a single character type; inline for speed */
3284
3285 case OP_ANY:
3286 if (!md->dotall && eptr < md->end_subject && *eptr == '\n') FAIL;
3287 if (eptr++ >= md->end_subject) FAIL;
3288 ecode++;
3289 break;
3290
3291 case OP_NOT_DIGIT:
3292 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) != 0)
3293 FAIL;
3294 ecode++;
3295 break;
3296
3297 case OP_DIGIT:
3298 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) == 0)
3299 FAIL;
3300 ecode++;
3301 break;
3302
3303 case OP_NOT_WHITESPACE:
3304 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) != 0)
3305 FAIL;
3306 ecode++;
3307 break;
3308
3309 case OP_WHITESPACE:
3310 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) == 0)
3311 FAIL;
3312 ecode++;
3313 break;
3314
3315 case OP_NOT_WORDCHAR:
3316 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) != 0)
3317 FAIL;
3318 ecode++;
3319 break;
3320
3321 case OP_WORDCHAR:
3322 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) == 0)
3323 FAIL;
3324 ecode++;
3325 break;
3326
3327 /* Match a back reference, possibly repeatedly. Look past the end of the
3328 item to see if there is repeat information following. The code is similar
3329 to that for character classes, but repeated for efficiency. Then obey
3330 similar code to character type repeats - written out again for speed.
3331 However, if the referenced string is the empty string, always treat
3332 it as matched, any number of times (otherwise there could be infinite
3333 loops). */
3334
3335 case OP_REF:
3336 {
3337 int length;
3338 int number = ecode[1] << 1; /* Doubled reference number */
3339 ecode += 2; /* Advance past the item */
3340
3341 if (number >= offset_top || md->offset_vector[number] < 0)
3342 {
3343 md->errorcode = PCRE_ERROR_BADREF;
3344 FAIL;
3345 }
3346
3347 length = md->offset_vector[number+1] - md->offset_vector[number];
3348
3349 switch (*ecode)
3350 {
3351 case OP_CRSTAR:
3352 case OP_CRMINSTAR:
3353 case OP_CRPLUS:
3354 case OP_CRMINPLUS:
3355 case OP_CRQUERY:
3356 case OP_CRMINQUERY:
3357 c = *ecode++ - OP_CRSTAR;
3358 minimize = (c & 1) != 0;
3359 min = rep_min[c]; /* Pick up values from tables; */
3360 max = rep_max[c]; /* zero for max => infinity */
3361 if (max == 0) max = INT_MAX;
3362 break;
3363
3364 case OP_CRRANGE:
3365 case OP_CRMINRANGE:
3366 minimize = (*ecode == OP_CRMINRANGE);
3367 min = (ecode[1] << 8) + ecode[2];
3368 max = (ecode[3] << 8) + ecode[4];
3369 if (max == 0) max = INT_MAX;
3370 ecode += 5;
3371 break;
3372
3373 default: /* No repeat follows */
3374 if (!match_ref(number, eptr, length, md)) FAIL;
3375 eptr += length;
3376 continue; /* With the main loop */
3377 }
3378
3379 /* If the length of the reference is zero, just continue with the
3380 main loop. */
3381
3382 if (length == 0) continue;
3383
3384 /* First, ensure the minimum number of matches are present. We get back
3385 the length of the reference string explicitly rather than passing the
3386 address of eptr, so that eptr can be a register variable. */
3387
3388 for (i = 1; i <= min; i++)
3389 {
3390 if (!match_ref(number, eptr, length, md)) FAIL;
3391 eptr += length;
3392 }
3393
3394 /* If min = max, continue at the same level without recursion.
3395 They are not both allowed to be zero. */
3396
3397 if (min == max) continue;
3398
3399 /* If minimizing, keep trying and advancing the pointer */
3400
3401 if (minimize)
3402 {
3403 for (i = min;; i++)
3404 {
3405 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3406 if (i >= max || !match_ref(number, eptr, length, md))
3407 FAIL;
3408 eptr += length;
3409 }
3410 /* Control never gets here */
3411 }
3412
3413 /* If maximizing, find the longest string and work backwards */
3414
3415 else
3416 {
3417 uschar *pp = eptr;
3418 for (i = min; i < max; i++)
3419 {
3420 if (!match_ref(number, eptr, length, md)) break;
3421 eptr += length;
3422 }
3423 while (eptr >= pp)
3424 {
3425 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3426 eptr -= length;
3427 }
3428 FAIL;
3429 }
3430 }
3431 /* Control never gets here */
3432
3433 /* Match a character class, possibly repeatedly. Look past the end of the
3434 item to see if there is repeat information following. Then obey similar
3435 code to character type repeats - written out again for speed. */
3436
3437 case OP_CLASS:
3438 case OP_NEGCLASS:
3439 {
3440 BOOL result = *ecode == OP_CLASS;
3441 uschar *data = ecode; /* Save for matching */
3442
3443 ecode += 4 + 2 * ecode[2] + ecode[3]; /* Advance past the item */
3444
3445 switch (*ecode)
3446 {
3447 case OP_CRSTAR:
3448 case OP_CRMINSTAR:
3449 case OP_CRPLUS:
3450 case OP_CRMINPLUS:
3451 case OP_CRQUERY:
3452 case OP_CRMINQUERY:
3453 c = *ecode++ - OP_CRSTAR;
3454 minimize = (c & 1) != 0;
3455 min = rep_min[c]; /* Pick up values from tables; */
3456 max = rep_max[c]; /* zero for max => infinity */
3457 if (max == 0) max = INT_MAX;
3458 break;
3459
3460 case OP_CRRANGE:
3461 case OP_CRMINRANGE:
3462 minimize = (*ecode == OP_CRMINRANGE);
3463 min = (ecode[1] << 8) + ecode[2];
3464 max = (ecode[3] << 8) + ecode[4];
3465 if (max == 0) max = INT_MAX;
3466 ecode += 5;
3467 break;
3468
3469 default: /* No repeat follows */
3470 if (eptr >= md->end_subject || !match_class(data, *eptr++, result, md))
3471 FAIL;
3472 continue; /* With the main loop */
3473 }
3474
3475 /* First, ensure the minimum number of matches are present. */
3476
3477 for (i = 1; i <= min; i++)
3478 if (eptr >= md->end_subject || !match_class(data, *eptr++, result, md))
3479 FAIL;
3480
3481 /* If max == min we can continue with the main loop without the
3482 need to recurse. */
3483
3484 if (min == max) continue;
3485
3486 /* If minimizing, keep testing the rest of the expression and advancing
3487 the pointer while it matches the class. */
3488
3489 if (minimize)
3490 {
3491 for (i = min;; i++)
3492 {
3493 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3494 if (i >= max || eptr >= md->end_subject ||
3495 !match_class(data, *eptr++, result, md)) FAIL;
3496 }
3497 /* Control never gets here */
3498 }
3499
3500 /* If maximizing, find the longest possible run, then work backwards. */
3501
3502 else
3503 {
3504 uschar *pp = eptr;
3505 for (i = min; i < max; i++)
3506 {
3507 if (eptr >= md->end_subject || !match_class(data, *eptr, result, md))
3508 break;
3509 eptr++;
3510 }
3511 while (eptr >= pp)
3512 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
3513 FAIL;
3514 }
3515 }
3516 /* Control never gets here */
3517
3518 /* Match a run of characters */
3519
3520 case OP_CHARS:
3521 {
3522 register int length = ecode[1];
3523 ecode += 2;
3524
Guido van Rossum57ba4f31997-12-02 20:40:28 +00003525#ifdef DEBUG
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003526 if (eptr >= md->end_subject)
3527 printf("matching subject <null> against pattern ");
3528 else
3529 {
3530 printf("matching subject ");
3531 pchars(eptr, length, TRUE, md);
3532 printf(" against pattern ");
3533 }
3534 pchars(ecode, length, FALSE, md);
3535 printf("\n");
Guido van Rossum57ba4f31997-12-02 20:40:28 +00003536#endif
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003537
3538 if (length > md->end_subject - eptr) FAIL;
3539 if (md->caseless)
3540 {
3541 while (length-- > 0) if (pcre_lcc[*ecode++] != pcre_lcc[*eptr++]) FAIL;
3542 }
3543 else
3544 {
3545 while (length-- > 0) if (*ecode++ != *eptr++) FAIL;
3546 }
3547 }
3548 break;
3549
3550 /* Match a single character repeatedly; different opcodes share code. */
3551
3552 case OP_EXACT:
3553 min = max = (ecode[1] << 8) + ecode[2];
3554 ecode += 3;
3555 goto REPEATCHAR;
3556
3557 case OP_UPTO:
3558 case OP_MINUPTO:
3559 min = 0;
3560 max = (ecode[1] << 8) + ecode[2];
3561 minimize = *ecode == OP_MINUPTO;
3562 ecode += 3;
3563 goto REPEATCHAR;
3564
3565 case OP_STAR:
3566 case OP_MINSTAR:
3567 case OP_PLUS:
3568 case OP_MINPLUS:
3569 case OP_QUERY:
3570 case OP_MINQUERY:
3571 c = *ecode++ - OP_STAR;
3572 minimize = (c & 1) != 0;
3573 min = rep_min[c]; /* Pick up values from tables; */
3574 max = rep_max[c]; /* zero for max => infinity */
3575 if (max == 0) max = INT_MAX;
3576
3577 /* Common code for all repeated single-character matches. We can give
3578 up quickly if there are fewer than the minimum number of characters left in
3579 the subject. */
3580
3581 REPEATCHAR:
3582 if (min > md->end_subject - eptr) FAIL;
3583 c = *ecode++;
3584
3585 /* The code is duplicated for the caseless and caseful cases, for speed,
3586 since matching characters is likely to be quite common. First, ensure the
3587 minimum number of matches are present. If min = max, continue at the same
3588 level without recursing. Otherwise, if minimizing, keep trying the rest of
3589 the expression and advancing one matching character if failing, up to the
3590 maximum. Alternatively, if maximizing, find the maximum number of
3591 characters and work backwards. */
3592
Guido van Rossum57ba4f31997-12-02 20:40:28 +00003593#ifdef DEBUG
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003594 printf("matching %c{%d,%d} against subject %.*s\n", c, min, max,
3595 max, eptr);
Guido van Rossum57ba4f31997-12-02 20:40:28 +00003596#endif
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003597
3598 if (md->caseless)
3599 {
3600 c = pcre_lcc[c];
3601 for (i = 1; i <= min; i++) if (c != pcre_lcc[*eptr++]) FAIL;
3602 if (min == max) continue;
3603 if (minimize)
3604 {
3605 for (i = min;; i++)
3606 {
3607 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3608 if (i >= max || eptr >= md->end_subject || c != pcre_lcc[*eptr++])
3609 FAIL;
3610 }
3611 /* Control never gets here */
3612 }
3613 else
3614 {
3615 uschar *pp = eptr;
3616 for (i = min; i < max; i++)
3617 {
3618 if (eptr >= md->end_subject || c != pcre_lcc[*eptr]) break;
3619 eptr++;
3620 }
3621 while (eptr >= pp)
3622 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
3623 FAIL;
3624 }
3625 }
3626
3627 /* Caseful comparisons */
3628
3629 else
3630 {
3631 for (i = 1; i <= min; i++) if (c != *eptr++) FAIL;
3632 if (min == max) continue;
3633 if (minimize)
3634 {
3635 for (i = min;; i++)
3636 {
3637 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3638 if (i >= max || eptr >= md->end_subject || c != *eptr++) FAIL;
3639 }
3640 /* Control never gets here */
3641 }
3642 else
3643 {
3644 uschar *pp = eptr;
3645 for (i = min; i < max; i++)
3646 {
3647 if (eptr >= md->end_subject || c != *eptr) break;
3648 eptr++;
3649 }
3650 while (eptr >= pp)
3651 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
3652 FAIL;
3653 }
3654 }
3655 /* Control never gets here */
3656
3657 /* Match a single character type repeatedly; several different opcodes
3658 share code. This is very similar to the code for single characters, but we
3659 repeat it in the interests of efficiency. */
3660
3661 case OP_TYPEEXACT:
3662 min = max = (ecode[1] << 8) + ecode[2];
3663 minimize = TRUE;
3664 ecode += 3;
3665 goto REPEATTYPE;
3666
3667 case OP_TYPEUPTO:
3668 case OP_TYPEMINUPTO:
3669 min = 0;
3670 max = (ecode[1] << 8) + ecode[2];
3671 minimize = *ecode == OP_TYPEMINUPTO;
3672 ecode += 3;
3673 goto REPEATTYPE;
3674
3675 case OP_TYPESTAR:
3676 case OP_TYPEMINSTAR:
3677 case OP_TYPEPLUS:
3678 case OP_TYPEMINPLUS:
3679 case OP_TYPEQUERY:
3680 case OP_TYPEMINQUERY:
3681 c = *ecode++ - OP_TYPESTAR;
3682 minimize = (c & 1) != 0;
3683 min = rep_min[c]; /* Pick up values from tables; */
3684 max = rep_max[c]; /* zero for max => infinity */
3685 if (max == 0) max = INT_MAX;
3686
3687 /* Common code for all repeated single character type matches */
3688
3689 REPEATTYPE:
3690 ctype = *ecode++; /* Code for the character type */
3691
3692 /* First, ensure the minimum number of matches are present. Use inline
3693 code for maximizing the speed, and do the type test once at the start
3694 (i.e. keep it out of the loop). Also test that there are at least the
3695 minimum number of characters before we start. */
3696
3697 if (min > md->end_subject - eptr) FAIL;
3698 if (min > 0) switch(ctype)
3699 {
3700 case OP_ANY:
3701 if (!md->dotall)
3702 { for (i = 1; i <= min; i++) if (*eptr++ == '\n') FAIL; }
3703 else eptr += min;
3704 break;
3705
3706 case OP_NOT_DIGIT:
3707 for (i = 1; i <= min; i++)
3708 if ((pcre_ctypes[*eptr++] & ctype_digit) != 0) FAIL;
3709 break;
3710
3711 case OP_DIGIT:
3712 for (i = 1; i <= min; i++)
3713 if ((pcre_ctypes[*eptr++] & ctype_digit) == 0) FAIL;
3714 break;
3715
3716 case OP_NOT_WHITESPACE:
3717 for (i = 1; i <= min; i++)
3718 if ((pcre_ctypes[*eptr++] & ctype_space) != 0) FAIL;
3719 break;
3720
3721 case OP_WHITESPACE:
3722 for (i = 1; i <= min; i++)
3723 if ((pcre_ctypes[*eptr++] & ctype_space) == 0) FAIL;
3724 break;
3725
3726 case OP_NOT_WORDCHAR:
3727 for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) != 0)
3728 FAIL;
3729 break;
3730
3731 case OP_WORDCHAR:
3732 for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) == 0)
3733 FAIL;
3734 break;
3735 }
3736
3737 /* If min = max, continue at the same level without recursing */
3738
3739 if (min == max) continue;
3740
3741 /* If minimizing, we have to test the rest of the pattern before each
3742 subsequent match, so inlining isn't much help; just use the function. */
3743
3744 if (minimize)
3745 {
3746 for (i = min;; i++)
3747 {
3748 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3749 if (i >= max || eptr >= md->end_subject ||
3750 !match_type(ctype, *eptr++, md->dotall))
3751 FAIL;
3752 }
3753 /* Control never gets here */
3754 }
3755
3756 /* If maximizing it is worth using inline code for speed, doing the type
3757 test once at the start (i.e. keep it out of the loop). */
3758
3759 else
3760 {
3761 uschar *pp = eptr;
3762 switch(ctype)
3763 {
3764 case OP_ANY:
3765 if (!md->dotall)
3766 {
3767 for (i = min; i < max; i++)
3768 {
3769 if (eptr >= md->end_subject || *eptr == '\n') break;
3770 eptr++;
3771 }
3772 }
3773 else
3774 {
3775 c = max - min;
3776 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
3777 eptr += c;
3778 }
3779 break;
3780
3781 case OP_NOT_DIGIT:
3782 for (i = min; i < max; i++)
3783 {
3784 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) != 0)
3785 break;
3786 eptr++;
3787 }
3788 break;
3789
3790 case OP_DIGIT:
3791 for (i = min; i < max; i++)
3792 {
3793 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) == 0)
3794 break;
3795 eptr++;
3796 }
3797 break;
3798
3799 case OP_NOT_WHITESPACE:
3800 for (i = min; i < max; i++)
3801 {
3802 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) != 0)
3803 break;
3804 eptr++;
3805 }
3806 break;
3807
3808 case OP_WHITESPACE:
3809 for (i = min; i < max; i++)
3810 {
3811 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) == 0)
3812 break;
3813 eptr++;
3814 }
3815 break;
3816
3817 case OP_NOT_WORDCHAR:
3818 for (i = min; i < max; i++)
3819 {
3820 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) != 0)
3821 break;
3822 eptr++;
3823 }
3824 break;
3825
3826 case OP_WORDCHAR:
3827 for (i = min; i < max; i++)
3828 {
3829 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) == 0)
3830 break;
3831 eptr++;
3832 }
3833 break;
3834 }
3835
3836 while (eptr >= pp)
3837 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
3838 FAIL;
3839 }
3840 /* Control never gets here */
3841
3842 /* There's been some horrible disaster. */
3843
3844 default:
Guido van Rossum57ba4f31997-12-02 20:40:28 +00003845#ifdef DEBUG
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003846 printf("Unknown opcode %d\n", *ecode);
Guido van Rossum57ba4f31997-12-02 20:40:28 +00003847#endif
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003848 md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
3849 FAIL;
3850 }
3851
3852 /* Do not stick any code in here without much thought; it is assumed
3853 that "continue" in the code above comes out to here to repeat the main
3854 loop. */
3855
3856 } /* End of main loop */
3857/* Control never reaches here */
3858
3859fail:
3860 if (md->point > save_stack_position)
3861 {
3862 /* If there are still points remaining on the stack, pop the next one off */
Guido van Rossumc3861071997-10-08 02:07:40 +00003863 int off_num;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003864
3865 md->point--;
3866 offset_top = md->offset_top[md->point];
3867 eptr = md->eptr[md->point];
3868 ecode = md->ecode[md->point];
3869 off_num = md->off_num[md->point];
3870 md->offset_vector[off_num] = md->r1[md->point];
3871 md->offset_vector[off_num+1] = md->r2[md->point];
3872 goto match_loop;
3873 }
3874 /* Failure, and nothing left on the stack, so end this function call */
3875
3876 /* Restore the top of the stack to where it was before this function
3877 call. This lets us use one stack for everything; recursive calls
3878 can push and pop information, and may increase the stack. When
3879 the call returns, the parent function can resume pushing and
3880 popping wherever it was. */
3881
3882 md->point = save_stack_position;
3883 return FALSE;
3884
3885succeed:
3886 return TRUE;
3887}
3888
3889
3890/*************************************************
3891* Execute a Regular Expression *
3892*************************************************/
3893
3894/* This function applies a compiled re to a subject string and picks out
3895portions of the string if it matches. Two elements in the vector are set for
3896each substring: the offsets to the start and end of the substring.
3897
3898Arguments:
3899 re points to the compiled expression
3900 extra points to "hints" from pcre_study() or is NULL
3901 subject points to the subject string
3902 length length of subject string (may contain binary zeros)
3903 options option bits
3904 offsets points to a vector of ints to be filled in with offsets
3905 offsetcount the number of elements in the vector
3906
3907Returns: > 0 => success; value is the number of elements filled in
3908 = 0 => success, but offsets is not big enough
3909 -1 => failed to match
3910 < -1 => some kind of unexpected problem
3911*/
3912
3913int
3914pcre_exec(pcre *external_re, pcre_extra *external_extra, char *subject,
3915 int length, int options, int *offsets, int offsetcount)
3916{
3917int resetcount;
3918int first_char = -1;
3919match_data match_block;
3920uschar *start_bits = NULL;
3921uschar *start_match = (uschar *)subject;
3922uschar *end_subject;
3923real_pcre *re = (real_pcre *)external_re;
3924real_pcre_extra *extra = (real_pcre_extra *)external_extra;
3925BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
3926BOOL startline = (re->options & PCRE_STARTLINE) != 0;
3927
3928if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3929
3930if (re == NULL || subject == NULL ||
3931 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3932if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
3933
3934match_block.start_subject = (uschar *)subject;
3935match_block.end_subject = match_block.start_subject + length;
3936end_subject = match_block.end_subject;
3937
3938match_block.caseless = ((re->options | options) & PCRE_CASELESS) != 0;
3939match_block.multiline = ((re->options |options) & PCRE_MULTILINE) != 0;
3940match_block.dotall = ((re->options |options) & PCRE_DOTALL) != 0;
3941
3942match_block.offset_vector = offsets; /* Where offsets go */
3943match_block.offset_end = (offsetcount & (-2)); /* Past max permitted (even) */
3944match_block.offset_overflow = FALSE;
3945
3946match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
3947
3948/* Set the stack state to empty */
3949 match_block.off_num = match_block.offset_top = NULL;
3950 match_block.r1 = match_block.r2 = NULL;
3951 match_block.eptr = match_block.ecode = NULL;
3952 match_block.point = match_block.length = 0;
3953
3954/* Compute the minimum number of offsets that we need to reset each time. Doing
3955this makes a huge difference to execution time when there aren't many brackets
3956in the pattern. */
3957
3958resetcount = 2 + re->top_bracket * 2;
3959if (resetcount > offsetcount) resetcount = offsetcount;
3960
3961/* If MULTILINE is set at exec time but was not set at compile time, and the
3962anchored flag is set, we must re-check because a setting provoked by ^ in the
3963pattern is not right in multi-line mode. Calling is_anchored() again here does
3964the right check, because multiline is now set. If it now yields FALSE, the
3965expression must have had ^ starting some of its branches. Check to see if
3966that is true for *all* branches, and if so, set the startline flag. */
3967
3968if (match_block. multiline && anchored && (re->options & PCRE_MULTILINE) == 0 &&
3969 !is_anchored(re->code, match_block.multiline))
3970 {
3971 anchored = FALSE;
3972 if (is_startline(re->code)) startline = TRUE;
3973 }
3974
3975/* Set up the first character to match, if available. The first_char value is
3976never set for an anchored regular expression, but the anchoring may be forced
3977at run time, so we have to test for anchoring. The first char may be unset for
3978an unanchored pattern, of course. If there's no first char and the pattern was
3979studied, the may be a bitmap of possible first characters. However, we can
3980use this only if the caseless state of the studying was correct. */
3981
3982if (!anchored)
3983 {
3984 if ((re->options & PCRE_FIRSTSET) != 0)
3985 {
3986 first_char = re->first_char;
3987 if (match_block.caseless) first_char = pcre_lcc[first_char];
3988 }
3989 else
3990 if (!startline && extra != NULL &&
3991 (extra->options & PCRE_STUDY_MAPPED) != 0 &&
3992 ((extra->options & PCRE_STUDY_CASELESS) != 0) == match_block.caseless)
3993 start_bits = extra->start_bits;
3994 }
3995
3996/* Loop for unanchored matches; for anchored regexps the loop runs just once. */
3997
3998do
3999 {
4000 register int *iptr = offsets;
4001 register int *iend = offsets + resetcount;
4002
4003 /* Reset the maximum number of extractions we might see. */
4004
4005 while (iptr < iend) *iptr++ = -1;
4006
4007 /* Advance to a unique first char if possible */
4008
4009 if (first_char >= 0)
4010 {
4011 if (match_block.caseless)
4012 while (start_match < end_subject && pcre_lcc[*start_match] != first_char)
4013 start_match++;
4014 else
4015 while (start_match < end_subject && *start_match != first_char)
4016 start_match++;
4017 }
4018
4019 /* Or to just after \n for a multiline match if possible */
4020
4021 else if (startline)
4022 {
4023 if (start_match > match_block.start_subject)
4024 {
4025 while (start_match < end_subject && start_match[-1] != '\n')
4026 start_match++;
4027 }
4028 }
4029
4030 /* Or to a non-unique first char */
4031
4032 else if (start_bits != NULL)
4033 {
4034 while (start_match < end_subject)
4035 {
4036 register int c = *start_match;
4037 if ((start_bits[c/8] & (1<<(c%8))) == 0) start_match++; else break;
4038 }
4039 }
4040
Guido van Rossum57ba4f31997-12-02 20:40:28 +00004041#ifdef DEBUG
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004042 printf(">>>> Match against: ");
4043 pchars(start_match, end_subject - start_match, TRUE, &match_block);
4044 printf("\n");
Guido van Rossum57ba4f31997-12-02 20:40:28 +00004045#endif
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004046
4047 /* When a match occurs, substrings will be set for all internal extractions;
4048 we just need to set up the whole thing as substring 0 before returning. If
4049 there were too many extractions, set the return code to zero. */
4050
4051 if (match(start_match, re->code, 2, &match_block))
4052 {
4053 int rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
4054 if (match_block.offset_end < 2) rc = 0; else
4055 {
4056 offsets[0] = start_match - match_block.start_subject;
4057 offsets[1] = match_block.end_match_ptr - match_block.start_subject;
4058 }
Guido van Rossum57ba4f31997-12-02 20:40:28 +00004059#ifdef DEBUG
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004060 printf(">>>> returning %d\n", rc);
Guido van Rossum57ba4f31997-12-02 20:40:28 +00004061#endif
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004062 free_stack(&match_block);
4063 return rc;
4064 }
4065 }
4066while (!anchored &&
4067 match_block.errorcode == PCRE_ERROR_NOMATCH &&
4068 start_match++ < end_subject);
4069
4070#ifdef DEBUG
4071printf(">>>> returning %d\n", match_block.errorcode);
4072#endif
4073free_stack(&match_block);
4074return match_block.errorcode;
4075}
4076
4077/* End of pcre.c */