blob: 9b4b8eee47a042f7f23a3d109cffb608193b8eb7 [file] [log] [blame]
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001
2/*************************************************
3* Perl-Compatible Regular Expressions *
4*************************************************/
5
6/* DO NOT EDIT THIS FILE! */
7
8/* This file is automatically written by the merge-files.py script
9included with the PCRE distribution for Python; it's produced from
10several C files, and code is removed in the process. If you want to
11modify the code or track down bugs, it will be much easier to work
12with the code in its original, multiple-file form. Don't edit this
13file by hand, or submit patches to it.
14
15The Python-specific PCRE distribution can be retrieved from
16 http://starship.skyport.net/crew/amk/regex/
17
18The unmodified original PCRE distribution doesn't have a fixed URL
19yet; write Philip Hazel <ph10@cam.ac.uk> for the latest version.
20
21Written by: Philip Hazel <ph10@cam.ac.uk>
22
23Extensively modified by the Python String-SIG: <string-sig@python.org>
24Send bug reports to: <string-sig@python.org>
25(They'll figure out if it's a bug in PCRE or in the Python-specific
26changes.)
27
28 Copyright (c) 1997 University of Cambridge
29
30-----------------------------------------------------------------------------
31Permission is granted to anyone to use this software for any purpose on any
32computer system, and to redistribute it freely, subject to the following
33restrictions:
34
351. This software is distributed in the hope that it will be useful,
36 but WITHOUT ANY WARRANTY; without even the implied warranty of
37 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
38
392. The origin of this software must not be misrepresented, either by
40 explicit claim or by omission.
41
423. Altered versions must be plainly marked as such, and must not be
43 misrepresented as being the original software.
44-----------------------------------------------------------------------------
45*/
46
47
48#define FOR_PYTHON
49#include "pcre-internal.h"
50#include "Python.h"
51#include "graminit.h"
52
53/*************************************************
54* Perl-Compatible Regular Expressions *
55*************************************************/
56
57/* This file is automatically written by the makechartables auxiliary
58program. If you edit it by hand, you might like to edit the Makefile to
59prevent its ever being regenerated. */
60
61/* This table is a lower casing table. */
62
63unsigned char pcre_lcc[] = {
64 0, 1, 2, 3, 4, 5, 6, 7,
65 8, 9, 10, 11, 12, 13, 14, 15,
66 16, 17, 18, 19, 20, 21, 22, 23,
67 24, 25, 26, 27, 28, 29, 30, 31,
68 32, 33, 34, 35, 36, 37, 38, 39,
69 40, 41, 42, 43, 44, 45, 46, 47,
70 48, 49, 50, 51, 52, 53, 54, 55,
71 56, 57, 58, 59, 60, 61, 62, 63,
72 64, 97, 98, 99,100,101,102,103,
73 104,105,106,107,108,109,110,111,
74 112,113,114,115,116,117,118,119,
75 120,121,122, 91, 92, 93, 94, 95,
76 96, 97, 98, 99,100,101,102,103,
77 104,105,106,107,108,109,110,111,
78 112,113,114,115,116,117,118,119,
79 120,121,122,123,124,125,126,127,
80 128,129,130,131,132,133,134,135,
81 136,137,138,139,140,141,142,143,
82 144,145,146,147,148,149,150,151,
83 152,153,154,155,156,157,158,159,
84 160,161,162,163,164,165,166,167,
85 168,169,170,171,172,173,174,175,
86 176,177,178,179,180,181,182,183,
87 184,185,186,187,188,189,190,191,
88 192,193,194,195,196,197,198,199,
89 200,201,202,203,204,205,206,207,
90 208,209,210,211,212,213,214,215,
91 216,217,218,219,220,221,222,223,
92 224,225,226,227,228,229,230,231,
93 232,233,234,235,236,237,238,239,
94 240,241,242,243,244,245,246,247,
95 248,249,250,251,252,253,254,255 };
96
97/* This table is an upper casing table. */
98
99unsigned char pcre_ucc[] = {
100 0, 1, 2, 3, 4, 5, 6, 7,
101 8, 9, 10, 11, 12, 13, 14, 15,
102 16, 17, 18, 19, 20, 21, 22, 23,
103 24, 25, 26, 27, 28, 29, 30, 31,
104 32, 33, 34, 35, 36, 37, 38, 39,
105 40, 41, 42, 43, 44, 45, 46, 47,
106 48, 49, 50, 51, 52, 53, 54, 55,
107 56, 57, 58, 59, 60, 61, 62, 63,
108 64, 65, 66, 67, 68, 69, 70, 71,
109 72, 73, 74, 75, 76, 77, 78, 79,
110 80, 81, 82, 83, 84, 85, 86, 87,
111 88, 89, 90, 91, 92, 93, 94, 95,
112 96, 65, 66, 67, 68, 69, 70, 71,
113 72, 73, 74, 75, 76, 77, 78, 79,
114 80, 81, 82, 83, 84, 85, 86, 87,
115 88, 89, 90,123,124,125,126,127,
116 128,129,130,131,132,133,134,135,
117 136,137,138,139,140,141,142,143,
118 144,145,146,147,148,149,150,151,
119 152,153,154,155,156,157,158,159,
120 160,161,162,163,164,165,166,167,
121 168,169,170,171,172,173,174,175,
122 176,177,178,179,180,181,182,183,
123 184,185,186,187,188,189,190,191,
124 192,193,194,195,196,197,198,199,
125 200,201,202,203,204,205,206,207,
126 208,209,210,211,212,213,214,215,
127 216,217,218,219,220,221,222,223,
128 224,225,226,227,228,229,230,231,
129 232,233,234,235,236,237,238,239,
130 240,241,242,243,244,245,246,247,
131 248,249,250,251,252,253,254,255 };
132
133/* This table identifies various classes of character by individual bits:
134 1 white space character
135 2 decimal digit
136 4 hexadecimal digit
137 8 alphanumeric or '_'
138 16 octal digit
139 128 regular expression metacharacter or binary zero
140*/
141
142unsigned char pcre_ctypes[] = {
143 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
144 0x00,0x01,0x01,0x01,0x01,0x01,0x00,0x00, /* 8- 15 */
145 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
146 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
147 0x01,0x00,0x00,0x00,0x80,0x00,0x00,0x00, /* - ' */
148 0x80,0x80,0x80,0x80,0x00,0x00,0x80,0x00, /* ( - / */
149 0x1e,0x1e,0x1e,0x1e,0x1e,0x1e,0x1e,0x1e, /* 0 - 7 */
150 0x0e,0x0e,0x00,0x00,0x00,0x00,0x00,0x80, /* 8 - ? */
151 0x00,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x08, /* @ - G */
152 0x08,0x08,0x08,0x08,0x08,0x08,0x08,0x08, /* H - O */
153 0x08,0x08,0x08,0x08,0x08,0x08,0x08,0x08, /* P - W */
154 0x08,0x08,0x08,0x80,0x00,0x00,0x80,0x08, /* X - _ */
155 0x00,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x08, /* ` - g */
156 0x08,0x08,0x08,0x08,0x08,0x08,0x08,0x08, /* h - o */
157 0x08,0x08,0x08,0x08,0x08,0x08,0x08,0x08, /* p - w */
158 0x08,0x08,0x08,0x80,0x80,0x00,0x00,0x00, /* x -127 */
159 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
160 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
161 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
162 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
163 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
164 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
165 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
166 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
167 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
168 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
169 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
170 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
171 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
172 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
173 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
174 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
175
176/* End of pcre-chartables.c */
177/*************************************************
178* Perl-Compatible Regular Expressions *
179*************************************************/
180
181/*
182This is a library of functions to support regular expressions whose syntax
183and semantics are as close as possible to those of the Perl 5 language.
184
185Written by: Philip Hazel <ph10@cam.ac.uk>
186
187 Copyright (c) 1997 University of Cambridge
188
189-----------------------------------------------------------------------------
190Permission is granted to anyone to use this software for any purpose on any
191computer system, and to redistribute it freely, subject to the following
192restrictions:
193
1941. This software is distributed in the hope that it will be useful,
195 but WITHOUT ANY WARRANTY; without even the implied warranty of
196 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
197
1982. The origin of this software must not be misrepresented, either by
199 explicit claim or by omission.
200
2013. Altered versions must be plainly marked as such, and must not be
202 misrepresented as being the original software.
203-----------------------------------------------------------------------------
204
205See the file Tech.Notes for some information on the internals.
206*/
207
208/* This module contains the actual definition of global variables that are
209shared between the different modules. In fact, these are limited to the
210indirections for memory management functions. */
211
212/* Include the internals header, which itself includes Standard C headers plus
213the external pcre header. */
214
215
216/* Store get and free functions. */
217
218void *(*pcre_malloc)(size_t) = malloc;
219void (*pcre_free)(void *) = free;
220
221/* End of pcre-globals.c */
222/*************************************************
223* Perl-Compatible Regular Expressions *
224*************************************************/
225
226/*
227This is a library of functions to support regular expressions whose syntax
228and semantics are as close as possible to those of the Perl 5 language. See
229the file Tech.Notes for some information on the internals.
230
231Written by: Philip Hazel <ph10@cam.ac.uk>
232
233 Copyright (c) 1997 University of Cambridge
234
235-----------------------------------------------------------------------------
236Permission is granted to anyone to use this software for any purpose on any
237computer system, and to redistribute it freely, subject to the following
238restrictions:
239
2401. This software is distributed in the hope that it will be useful,
241 but WITHOUT ANY WARRANTY; without even the implied warranty of
242 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
243
2442. The origin of this software must not be misrepresented, either by
245 explicit claim or by omission.
246
2473. Altered versions must be plainly marked as such, and must not be
248 misrepresented as being the original software.
249-----------------------------------------------------------------------------
250*/
251
252
253/* Include the internals header, which itself includes Standard C headers plus
254the external pcre header. */
255
256
257
258/* Character types for class type bits */
259
260static char class_types[] = { ctype_digit, ctype_space, ctype_word };
261
262
263
264/*************************************************
265* Set a range of bits in the map *
266*************************************************/
267
268/* This function is called for character types.
269
270Arguments:
271 start_bits points to the bit map
272 type a character type bit
273 include TRUE to include the type;
274 FALSE to include all but the type
275
276Returns: nothing
277*/
278
279static void
280set_type_bits(uschar *start_bits, int type, BOOL include)
281{
282int i;
283for (i = 0; i < 256; i++)
284 if (((pcre_ctypes[i] & type) != 0) == include) start_bits[i/8] |= (1<<(i%8));
285}
286
287
288
289/*************************************************
290* Set one bit in the map *
291*************************************************/
292
293/* This function is called to set a bit in the map for a given character,
294or both cases of a letter if caseless. It could be replaced by a macro if
295better performance is wanted.
296
297Arguments:
298 start_bits points to 32-byte table
299 c the character
300 caseless TRUE if caseless
301
302Returns: nothing
303*/
304
305static void
306set_bit(uschar *start_bits, int c, BOOL caseless)
307{
308if (caseless)
309 {
310 int d = pcre_ucc[c];
311 start_bits[d/8] |= (1<<(d%8));
312 c = pcre_lcc[c];
313 }
314start_bits[c/8] |= (1<<(c%8));
315}
316
317
318
319/*************************************************
320* Create bitmap of starting chars *
321*************************************************/
322
323/* This function scans a compiled unanchored expression and attempts to build a
324bitmap of the set of initial characters. If it can't, it returns FALSE. As time
325goes by, we may be able to get more clever at doing this.
326
327Arguments:
328 code points to an expression
329 start_bits points to a 32-byte table, initialized to 0
330 caseless TRUE if caseless
331
332Returns: TRUE if table built, FALSE otherwise
333*/
334
335static BOOL
336set_start_bits(uschar *code, uschar *start_bits, BOOL caseless)
337{
338do
339 {
340 uschar *tcode = code + 3;
341 BOOL try_next = TRUE;
342
343 while (try_next)
344 {
345 try_next = FALSE;
346
347 if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
348 {
349 if (!set_start_bits(tcode, start_bits, caseless)) return FALSE;
350 }
351
352 else switch(*tcode)
353 {
354 default:
355 return FALSE;
356
357 /* BRAZERO does the bracket, but carries on. */
358
359 case OP_BRAZERO:
360 case OP_BRAMINZERO:
361 if (!set_start_bits(++tcode, start_bits, caseless)) return FALSE;
362 do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
363 tcode += 3;
364 try_next = TRUE;
365 break;
366
367 /* Single-char * or ? sets the bit and tries the next item */
368
369 case OP_STAR:
370 case OP_MINSTAR:
371 case OP_QUERY:
372 case OP_MINQUERY:
373 set_bit(start_bits, tcode[1], caseless);
374 tcode += 2;
375 try_next = TRUE;
376 break;
377
378 /* Single-char upto sets the bit and tries the next */
379
380 case OP_UPTO:
381 case OP_MINUPTO:
382 set_bit(start_bits, tcode[3], caseless);
383 tcode += 4;
384 try_next = TRUE;
385 break;
386
387 /* At least one single char sets the bit and stops */
388
389 case OP_EXACT: /* Fall through */
390 tcode++;
391
392 case OP_CHARS: /* Fall through */
393 tcode++;
394
395 case OP_PLUS:
396 case OP_MINPLUS:
397 set_bit(start_bits, tcode[1], caseless);
398 break;
399
400 /* Single character type sets the bits and stops */
401
402 case OP_NOT_DIGIT:
403 set_type_bits(start_bits, ctype_digit, FALSE);
404 break;
405
406 case OP_DIGIT:
407 set_type_bits(start_bits, ctype_digit, TRUE);
408 break;
409
410 case OP_NOT_WHITESPACE:
411 set_type_bits(start_bits, ctype_space, FALSE);
412 break;
413
414 case OP_WHITESPACE:
415 set_type_bits(start_bits, ctype_space, TRUE);
416 break;
417
418 case OP_NOT_WORDCHAR:
419 set_type_bits(start_bits, ctype_word, FALSE);
420 break;
421
422 case OP_WORDCHAR:
423 set_type_bits(start_bits, ctype_word, TRUE);
424 break;
425
426 /* One or more character type fudges the pointer and restarts, knowing
427 it will hit a single character type and stop there. */
428
429 case OP_TYPEPLUS:
430 case OP_TYPEMINPLUS:
431 tcode++;
432 try_next = TRUE;
433 break;
434
435 case OP_TYPEEXACT:
436 tcode += 3;
437 try_next = TRUE;
438 break;
439
440 /* Zero or more repeats of character types set the bits and then
441 try again. */
442
443 case OP_TYPEUPTO:
444 case OP_TYPEMINUPTO:
445 tcode += 2; /* Fall through */
446
447 case OP_TYPESTAR:
448 case OP_TYPEMINSTAR:
449 case OP_TYPEQUERY:
450 case OP_TYPEMINQUERY:
451 switch(tcode[1])
452 {
453 case OP_NOT_DIGIT:
454 set_type_bits(start_bits, ctype_digit, FALSE);
455 break;
456
457 case OP_DIGIT:
458 set_type_bits(start_bits, ctype_digit, TRUE);
459 break;
460
461 case OP_NOT_WHITESPACE:
462 set_type_bits(start_bits, ctype_space, FALSE);
463 break;
464
465 case OP_WHITESPACE:
466 set_type_bits(start_bits, ctype_space, TRUE);
467 break;
468
469 case OP_NOT_WORDCHAR:
470 set_type_bits(start_bits, ctype_word, FALSE);
471 break;
472
473 case OP_WORDCHAR:
474 set_type_bits(start_bits, ctype_word, TRUE);
475 break;
476 }
477
478 tcode += 2;
479 try_next = TRUE;
480 break;
481
482 /* Character class: set the bits and either carry on or not,
483 according to the repeat count. */
484
485 case OP_CLASS:
486 case OP_NEGCLASS:
487 {
488 uschar *base = tcode;
489 uschar *data, *end;
490 uschar *bitmap = start_bits;
491 uschar local[32];
492 int flags = base[1];
493 int i;
494
495 tcode += 4 + 2 * tcode[2] + tcode[3]; /* Advance past the item */
496 switch (*tcode)
497 {
498 case OP_CRSTAR:
499 case OP_CRMINSTAR:
500 case OP_CRQUERY:
501 case OP_CRMINQUERY:
502 tcode++;
503 try_next = TRUE;
504 break;
505
506 case OP_CRRANGE:
507 case OP_CRMINRANGE:
508 if (((tcode[1] << 8) + tcode[2]) == 0)
509 {
510 tcode += 5;
511 try_next = TRUE;
512 }
513 break;
514 }
515
516 /* For a negated class, we have to build a separate table of all
517 the bits in the class, and then turn all other bits on in the main
518 table. Otherwise there are problems with things like [^\da]. */
519
520 if (*base == OP_NEGCLASS)
521 {
522 memset(local, 0, 32);
523 bitmap = local;
524 }
525
526 /* Character types */
527
528 for (i = 0; flags != 0; i++)
529 {
530 if ((flags & 1) != 0)
531 set_type_bits(bitmap, class_types[i/2], (i & 1) == 0);
532 flags >>= 1;
533 }
534
535 /* Ranges and individual characters */
536
537 data = base + 4;
538 end = data + base[2] * 2;
539 while (data < end)
540 {
541 for (i = *data; i <= data[1]; i++) set_bit(bitmap, i, caseless);
542 data += 2;
543 }
544
545 end += base[3];
546 while (data < end) set_bit(bitmap, *data++, caseless);
547
548 /* If a negated class, transfer data from local map to the main one */
549
550 if (bitmap != start_bits)
551 for (i = 0; i < 32; i++) start_bits[i] |= ~local[i];
552 }
553 break; /* End of class handling */
554
555 } /* End of switch */
556 } /* End of try_next loop */
557
558 code += (code[1] << 8) + code[2]; /* Advance to next branch */
559 }
560while (*code == OP_ALT);
561return TRUE;
562}
563
564
565
566/*************************************************
567* Study a compiled expression *
568*************************************************/
569
570/* This function is handed a compiled expression that it must study to produce
571information that will speed up the matching. It returns a pcre_extra block
572which then gets handed back to pcre_exec().
573
574Arguments:
575 re points to the compiled expression
576 options contains option bits
577 errorptr points to where to place error messages;
578 set NULL unless error
579
580Returns: pointer to a pcre_extra block,
581 NULL on error or if no optimization possible
582*/
583
584pcre_extra *
585pcre_study(pcre *external_re, int options, char **errorptr)
586{
587BOOL caseless;
588uschar start_bits[32];
589real_pcre_extra *extra;
590real_pcre *re = (real_pcre *)external_re;
591
592*errorptr = NULL;
593
594if (re == NULL || re->magic_number != MAGIC_NUMBER)
595 {
596 *errorptr = "argument is not a compiled regular expression";
597 return NULL;
598 }
599
600if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
601 {
602 *errorptr = "unknown or incorrect option bit(s) set";
603 return NULL;
604 }
605
606/* For an anchored pattern, or an unchored pattern that has a first char, or a
607multiline pattern that matches only at "line starts", no further processing at
608present. */
609
610if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
611 return NULL;
612
613/* Determine the caseless state from the compiled pattern and the current
614options. */
615
616caseless = ((re->options | options) & PCRE_CASELESS) != 0;
617
618/* See if we can find a fixed set of initial characters for the pattern. */
619
620memset(start_bits, 0, 32);
621if (!set_start_bits(re->code, start_bits, caseless)) return NULL;
622
623/* Get an "extra" block and put the information therein. */
624
625extra = (real_pcre_extra *)(pcre_malloc)(sizeof(real_pcre_extra));
626
627if (extra == NULL)
628 {
629 *errorptr = "failed to get memory";
630 return NULL;
631 }
632extra->options = PCRE_STUDY_MAPPED | (caseless? PCRE_STUDY_CASELESS : 0);
633memcpy(extra->start_bits, start_bits, 32);
634
635return (pcre_extra *)extra;
636}
637
638/* End of pcre-study.c */
639/*************************************************
640* Perl-Compatible Regular Expressions *
641*************************************************/
642
643/*
644This is a library of functions to support regular expressions whose syntax
645and semantics are as close as possible to those of the Perl 5 language. See
646the file Tech.Notes for some information on the internals.
647
648Written by: Philip Hazel <ph10@cam.ac.uk>
649
650 Copyright (c) 1997 University of Cambridge
651
652-----------------------------------------------------------------------------
653Permission is granted to anyone to use this software for any purpose on any
654computer system, and to redistribute it freely, subject to the following
655restrictions:
656
6571. This software is distributed in the hope that it will be useful,
658 but WITHOUT ANY WARRANTY; without even the implied warranty of
659 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
660
6612. The origin of this software must not be misrepresented, either by
662 explicit claim or by omission.
663
6643. Altered versions must be plainly marked as such, and must not be
665 misrepresented as being the original software.
666-----------------------------------------------------------------------------
667*/
668
669
670/* Define DEBUG to get debugging output on stdout. */
671
672/* #define DEBUG */
673
674
675/* Include the internals header, which itself includes Standard C headers plus
676the external pcre header. */
677
678
679#ifndef Py_eval_input
680/* For Python 1.4, graminit.h has to be explicitly included */
681#define Py_eval_input eval_input
682#endif
683
684/* Min and max values for the common repeats; for the maxima, 0 => infinity */
685
686static char rep_min[] = { 0, 0, 1, 1, 0, 0 };
687static char rep_max[] = { 0, 0, 0, 0, 1, 1 };
688
689/* Text forms of OP_ values and things, for debugging */
690
691#ifdef DEBUG
692static char *OP_names[] = { "End", "\\A", "\\B", "\\b", "\\D", "\\d",
693 "\\S", "\\s", "\\W", "\\w", "\\Z", "^", "$", "Any", "chars",
694 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
695 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
696 "*", "*?", "+", "+?", "?", "??", "{", "{",
697 "class", "negclass", "Ref",
698 "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",
699 "Brazero", "Braminzero", "Bra"
700};
701
702static char *class_names[] = { "\\d", "\\D", "\\s", "\\S", "\\w", "\\W" };
703#endif
704
705/* Table of character type operators that correspond to the bits in the
706character class flags, starting at the least significant end. */
707
708static char class_ops[] = {
709 OP_DIGIT, OP_NOT_DIGIT,
710 OP_WHITESPACE, OP_NOT_WHITESPACE,
711 OP_WORDCHAR, OP_NOT_WORDCHAR };
712
713/* Table for handling escaped characters in the range '0'-'z'. Positive returns
714are simple data values; negative values are for special things like \d and so
715on. Zero means further processing is needed (for things like \x), or the escape
716is invalid. */
717
718/* PYTHON: Python doesn't support \e, but does support \v */
719
720static short int escapes[] = {
721 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
722 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
723 '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
724 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
725 0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
726 0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
727 '`', 7, -ESC_b, 0, -ESC_d, 0, '\f', 0, /* ` - g */
728 0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
729 0, 0, '\r', -ESC_s, '\t', 0, '\v', -ESC_w, /* p - w */
730 0, 0, 0 /* x - z */
731};
732
733
734/* Definition to allow mutual recursion */
735
736static BOOL compile_regexp(BOOL, int *, uschar **, uschar **,
737 char **, PyObject *);
738
739/* Structure for passing "static" information around between the functions
740doing the matching, so that they are thread-safe. */
741
742typedef struct match_data {
743 int errorcode; /* As it says */
744 int *offset_vector; /* Offset vector */
745 int offset_end; /* One past the end */
746 BOOL offset_overflow; /* Set if too many extractions */
747 BOOL caseless; /* Case-independent flag */
748 BOOL multiline; /* Multiline flag */
749 uschar *start_subject; /* Start of the subject string */
750 uschar *end_subject; /* End of the subject string */
751
752 uschar *end_match_ptr; /* Subject position at end match */
753 int end_offset_top; /* Highwater mark at end of match */
754 BOOL dotall; /* Dotall flag */
755 int length; /* Length of the allocated stacks */
756 int point; /* Point to add next item pushed onto stacks */
757 /* Pointers to the 6 stacks */
758 int *off_num, *offset_top, *r1, *r2;
759 uschar **eptr, **ecode;
760} match_data;
761
762
763
764/*************************************************
765* Return version string *
766*************************************************/
767
768char *
769pcre_version(void)
770{
771return PCRE_VERSION;
772}
773
774
775
776
777/*************************************************
778* Return info about a compiled pattern *
779*************************************************/
780
781/* This function picks potentially useful data out of the private
782structure.
783
784Arguments:
785 external_re points to compiled code
786 optptr where to pass back the options
787 first_char where to pass back the first character,
788 or -1 if multiline and all branches start ^,
789 or -2 otherwise
790
791Returns: number of identifying extraction brackets
792 or negative values on error
793*/
794
795int
796pcre_info(pcre *external_re, int *optptr, int *first_char)
797{
798real_pcre *re = (real_pcre *)external_re;
799if (re == NULL) return PCRE_ERROR_NULL;
800if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
801if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);
802if (first_char != NULL)
803 *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
804 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
805return re->top_bracket;
806}
807
808
809
810
811#ifdef DEBUG
812/*************************************************
813* Debugging function to print chars *
814*************************************************/
815
816/* Print a sequence of chars in printable format, stopping at the end of the
817subject if the requested.
818
819Arguments:
820 p points to characters
821 length number to print
822 is_subject TRUE if printing from within md->start_subject
823 md pointer to matching data block, if is_subject is TRUE
824
825Returns: nothing
826*/
827
828static pchars(uschar *p, int length, BOOL is_subject, match_data *md)
829{
830int c;
831if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
832while (length-- > 0)
833 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
834}
835#endif
836
837
838
839
840/*************************************************
841* Check subpattern for empty operand *
842*************************************************/
843
844/* This function checks a bracketed subpattern to see if any of the paths
845through it could match an empty string. This is used to diagnose an error if
846such a subpattern is followed by a quantifier with an unlimited upper bound.
847
848Argument:
849 code points to the opening bracket
850
851Returns: TRUE or FALSE
852*/
853
854static BOOL
855could_be_empty(uschar *code)
856{
857do {
858 uschar *cc = code + 3;
859
860 /* Scan along the opcodes for this branch; as soon as we find something
861 that matches a non-empty string, break out and advance to test the next
862 branch. If we get to the end of the branch, return TRUE for the whole
863 sub-expression. */
864
865 for (;;)
866 {
867 /* Test an embedded subpattern; if it could not be empty, break the
868 loop. Otherwise carry on in the branch. */
869
870 if ((int)(*cc) >= OP_BRA)
871 {
872 if (!could_be_empty(cc)) break;
873 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
874 cc += 3;
875 }
876
877 else switch (*cc)
878 {
879 /* Reached end of a branch: the subpattern may match the empty string */
880
881 case OP_ALT:
882 case OP_KET:
883 case OP_KETRMAX:
884 case OP_KETRMIN:
885 return TRUE;
886
887 /* Skip over assertive subpatterns */
888
889 case OP_ASSERT:
890 case OP_ASSERT_NOT:
891 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
892 cc += 3;
893 break;
894
895 /* Skip over things that don't match chars */
896
897 case OP_SOD:
898 case OP_EOD:
899 case OP_CIRC:
900 case OP_DOLL:
901 case OP_BRAZERO:
902 case OP_BRAMINZERO:
903 case OP_NOT_WORD_BOUNDARY:
904 case OP_WORD_BOUNDARY:
905 cc++;
906 break;
907
908 /* Skip over simple repeats with zero lower bound */
909
910 case OP_STAR:
911 case OP_MINSTAR:
912 case OP_QUERY:
913 case OP_MINQUERY:
914 case OP_TYPESTAR:
915 case OP_TYPEMINSTAR:
916 case OP_TYPEQUERY:
917 case OP_TYPEMINQUERY:
918 cc += 2;
919 break;
920
921 /* Skip over UPTOs (lower bound is zero) */
922
923 case OP_UPTO:
924 case OP_MINUPTO:
925 case OP_TYPEUPTO:
926 case OP_TYPEMINUPTO:
927 cc += 4;
928 break;
929
930 /* Check a class or a back reference for a zero minimum */
931
932 case OP_CLASS:
933 case OP_NEGCLASS:
934 case OP_REF:
935 cc += (*cc == OP_REF)? 2 : 4 + 2 * cc[2] + cc[3];
936
937 switch (*cc)
938 {
939 case OP_CRSTAR:
940 case OP_CRMINSTAR:
941 case OP_CRQUERY:
942 case OP_CRMINQUERY:
943 cc++;
944 break;
945
946 case OP_CRRANGE:
947 case OP_CRMINRANGE:
948 if ((cc[1] << 8) + cc[2] != 0) goto NEXT_BRANCH;
949 cc += 3;
950 break;
951
952 default:
953 goto NEXT_BRANCH;
954 }
955 break;
956
957 /* Anything else matches at least one character */
958
959 default:
960 goto NEXT_BRANCH;
961 }
962 }
963
964 NEXT_BRANCH:
965 code += (code[1] << 8) + code[2];
966 }
967while (*code == OP_ALT);
968
969/* No branches match the empty string */
970
971return FALSE;
972}
973
974
975/* Determine the length of a group ID in an expression like
976 (?P<foo_123>...)
977Arguments:
978 ptr pattern position pointer (say that 3 times fast)
979 finalchar the character that will mark the end of the ID
980 errorptr points to the pointer to the error message
981*/
982
983static int
984get_group_id(uschar *ptr, char finalchar, char **errorptr)
985{
986 uschar *start = ptr;
987
988 /* If the first character is not in \w, or is in \w but is a digit,
989 report an error */
990 if (!(pcre_ctypes[*ptr] & ctype_word) ||
991 (pcre_ctypes[*ptr++] & ctype_digit))
992 {
993 *errorptr = "(?P identifier must start with a letter or underscore";
994 return 0;
995 }
996
997 /* Increment ptr until we either hit a null byte, the desired
998 final character, or a non-word character */
999 for(; (*ptr != 0) && (*ptr != finalchar) &&
1000 (pcre_ctypes[*ptr] & ctype_word); ptr++)
1001 {
1002 }
1003 if (*ptr==finalchar)
1004 return ptr-start;
1005 if (*ptr==0)
1006 {
1007 *errorptr = "unterminated (?P identifier";
1008 return 0;
1009 }
1010 *errorptr = "illegal character in (?P identifier";
1011 return 0;
1012}
1013
1014/*************************************************
1015* Handle escapes *
1016*************************************************/
1017
1018/* This function is called when a \ has been encountered. It either returns a
1019positive value for a simple escape such as \n, or a negative value which
1020encodes one of the more complicated things such as \d. On entry, ptr is
1021pointing at the \. On exit, it is on the final character of the escape
1022sequence.
1023
1024Arguments:
1025 ptrptr points to the pattern position pointer
1026 errorptr points to the pointer to the error message
1027
1028Returns: zero or positive => a data character
1029 negative => a special escape sequence
1030 on error, errorptr is set
1031*/
1032
1033static int
1034check_escape(uschar **ptrptr, char **errorptr)
1035{
1036uschar *ptr = *ptrptr;
1037int c = *(++ptr) & 255; /* Ensure > 0 on signed-char systems */
1038int i;
1039
1040if (c == 0) *errorptr = "\\ at end of pattern";
1041
1042/* Digits or letters may have special meaning; all others are literals. */
1043
1044else if (c < '0' || c > 'z') {}
1045
1046/* Do an initial lookup in a table. A non-zero result is something that can be
1047returned immediately. Otherwise further processing may be required. */
1048
1049else if ((i = escapes[c - '0']) != 0) c = i;
1050
1051/* Escapes that need further processing, or are illegal. */
1052
1053else switch (c)
1054 {
1055 case '0':
1056 c = 0;
1057 while(i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_odigit) != 0 )
1058 c = c * 8 + *(++ptr) - '0';
1059 break;
1060
1061 case '1': case '2': case '3': case '4': case '5':
1062 case '6': case '7': case '8': case '9':
1063 {
1064 /* PYTHON: Try to compute an octal value for a character */
1065 for(c=0, i=0; c!=-1 && ptr[i]!=0 && i<3; i++)
1066 {
1067 if (( pcre_ctypes[ ptr[i] ] & ctype_odigit) != 0)
1068 c = c * 8 + ptr[i]-'0';
1069 else
1070 c = -1; /* Non-octal character */
1071 }
1072 /* Aha! There were 3 octal digits, so it must be a character */
1073 if (c != -1 && i == 3)
1074 {
1075 ptr += i-1;
1076 break;
1077 }
1078 c = ptr[0]; /* Restore the first character after the \ */
1079 c -= '0'; i = 1;
1080 while (i<2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0)
1081 {
1082 c = c * 10 + ptr[1] - '0';
1083 ptr++; i++;
1084 }
1085 if (c > 255 - ESC_REF) *errorptr = "back reference too big";
1086 c = -(ESC_REF + c);
1087 }
1088 break;
1089
1090 case 'x':
1091 {
1092 int end, length;
1093 char *string;
1094 PyObject *v, *result;
1095
1096 i=1;
1097 while (ptr[i]!=0 &&
1098 ( pcre_ctypes[ptr[i]] & ctype_xdigit) != 0)
1099 i++;
1100 if (i==1)
1101 {
1102 *errorptr="\\x must be followed by hex digits";
1103 break;
1104 }
1105 length=i-1;
1106 string=malloc(length+4+1);
1107 if (string==NULL)
1108 {
1109 *errorptr="can't allocate memory for \\x string";
1110 break;
1111 }
1112 /* Create a string containing "\x<hexdigits>", which will be
1113 passed to eval() */
1114 string[0]=string[length+3]='"';
1115 string[1]='\\';
1116 string[length+4]='\0';
1117 memcpy(string+2, ptr, length+1);
1118 ptr += length;
1119 v=PyRun_String((char *)string, Py_eval_input,
1120 PyEval_GetGlobals(), PyEval_GetLocals());
1121 free(string);
1122 /* The evaluation raised an exception */
1123 if (v==NULL)
1124 {
1125 *errorptr="exception occurred during evaluation of \\x";
1126 break;
1127 }
1128 if (PyString_Size(v)!=1)
1129 {
1130 Py_DECREF(v);
1131 *errorptr="\\x string is not one byte in length";
1132 break;
1133 }
1134 c=*(unsigned char *)PyString_AsString(v);
1135 Py_DECREF(v);
1136 break;
1137 }
1138 break;
1139
1140
1141 case 'l':
1142 case 'L':
1143 case 'u':
1144 case 'U':
1145 case 'Q':
1146 case 'E':
1147 *errorptr = "the Perl escapes \\u, \\U, \\l, \\L, \\Q, \\E are not valid";
1148 break;
1149
1150 default:
1151 /* In Python, an unrecognized escape will simply return the character
1152 after the backslash, so do nothing */
1153 break;
1154 }
1155
1156*ptrptr = ptr;
1157return c;
1158}
1159
1160
1161
1162/*************************************************
1163* Read repeat counts *
1164*************************************************/
1165
1166/* Read an item of the form {n,m} and return the values.
1167
1168Arguments:
1169 p pointer to first char after '{'
1170 minp pointer to int for min
1171 maxp pointer to int for max
1172 returned as -1 if no max
1173 errorptr points to pointer to error message
1174
1175Returns: pointer to '}' on success;
1176 current ptr on error, with errorptr set
1177*/
1178
1179static uschar *
1180read_repeat_counts(uschar *p, int *minp, int *maxp, char **errorptr)
1181{
1182int min = 0;
1183int max = -1;
1184
1185if ((pcre_ctypes[*p] & ctype_digit) == 0)
1186 {
1187 *errorptr = "number expected after {";
1188 return p;
1189 }
1190
1191while ((pcre_ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
1192
1193if (*p == '}') max = min; else
1194 {
1195 if (*p++ != ',')
1196 {
1197 *errorptr = "comma expected";
1198 return p-1;
1199 }
1200 if (*p != '}')
1201 {
1202 max = 0;
1203 while((pcre_ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
1204 if (*p != '}')
1205 {
1206 *errorptr = "} expected";
1207 return p;
1208 }
1209 if (max < min)
1210 {
1211 *errorptr = "numbers out of order";
1212 return p;
1213 }
1214 }
1215 }
1216
1217/* Do paranoid checks, then fill in the required variables, and pass back the
1218pointer to the terminating '}'. */
1219
1220if (max == 0) *errorptr = "zero maximum not allowed";
1221else if (min > 65535 || max > 65535) *errorptr = "number too big";
1222else
1223 {
1224 *minp = min;
1225 *maxp = max;
1226 }
1227return p;
1228}
1229
1230
1231
1232/*************************************************
1233* Compile one branch *
1234*************************************************/
1235
1236/* Scan the pattern, compiling it into the code vector.
1237
1238Arguments:
1239 extended TRUE if the PCRE_EXTENDED option was set
1240 brackets points to 2-element bracket vector
1241 code points to the pointer to the current code point
1242 ptrptr points to the current pattern pointer
1243 errorptr points to pointer to error message
1244
1245Returns: TRUE on success
1246 FALSE, with *errorptr set on error
1247*/
1248
1249static BOOL
1250compile_branch(BOOL extended, int *brackets, uschar **codeptr,
1251 uschar **ptrptr, char **errorptr, PyObject *dictionary)
1252{
1253int repeat_type, op_type;
1254int repeat_min, repeat_max;
1255int bravalue, length;
1256register int c;
1257register uschar *code = *codeptr;
1258uschar *ptr = *ptrptr;
1259uschar *previous = NULL;
1260uschar *oldptr;
1261
1262/* Switch on next character until the end of the branch */
1263
1264for (;; ptr++)
1265 {
1266 c = *ptr;
1267 if (extended)
1268 {
1269 if ((pcre_ctypes[c] & ctype_space) != 0) continue;
1270 if (c == '#')
1271 {
1272 while ((c = *(++ptr)) != 0 && c != '\n');
1273 continue;
1274 }
1275 }
1276
1277 switch(c)
1278 {
1279 /* The branch terminates at end of string, |, or ). */
1280
1281 case 0:
1282 case '|':
1283 case ')':
1284 *codeptr = code;
1285 *ptrptr = ptr;
1286 return TRUE;
1287
1288 /* Handle single-character metacharacters */
1289
1290 case '^':
1291 previous = NULL;
1292 *code++ = OP_CIRC;
1293 break;
1294
1295 case '$':
1296 previous = NULL;
1297 *code++ = OP_DOLL;
1298 break;
1299
1300 case '.':
1301 previous = code;
1302 *code++ = OP_ANY;
1303 break;
1304
1305 /* Character classes. We do quite a bit of munging around here. There are
1306 always four initial bytes: the op_code, a flags byte for things like \d, a
1307 count of pairs and a count of single characters. The pairs then follow, and
1308 finally the single characters. */
1309
1310 case '[':
1311 {
1312 int rangecount = 0;
1313 int flags = 0;
1314 int singles_count = 0;
1315 char singles[256];
1316
1317 previous = code;
1318
1319 /* If the first character is '^', set the negation flag */
1320
1321 if ((c = *(++ptr)) == '^') { *code = OP_NEGCLASS; c = *(++ptr); }
1322 else *code = OP_CLASS;
1323 code += 4;
1324
1325 /* Process characters until ] is reached. By writing this as a "do" it
1326 means that an initial ] is taken as a data character. */
1327
1328 do
1329 {
1330 if (c == 0)
1331 {
1332 *errorptr = "] missing";
1333 goto FAILED;
1334 }
1335
1336 /*** Perl treats '-' here as a data character, so PCRE had better
1337 do the same ... cut out this diagnosis.
1338
1339 if (c == '-')
1340 {
1341 *errorptr = "unexpected '-' in character class";
1342 goto FAILED;
1343 }
1344 ... ***/
1345
1346 /* Backslash may introduce a single character, or it may introduce one
1347 of the specials, which just set a flag. Escaped items are checked for
1348 validity in the pre-compiling pass. The sequence \b is a special case.
1349 Inside a class (and only there) it is treated as backslash. Elsewhere
1350 it marks a word boundary. */
1351
1352 if (c == '\\')
1353 {
1354 uschar *save_ptr = ptr+1;
1355 c = check_escape(&ptr, errorptr);
1356 if (c < 0)
1357 {
1358 switch (-c)
1359 {
1360 case ESC_d: flags |= CLASS_DIGITS; continue;
1361 case ESC_D: flags |= CLASS_NOT_DIGITS; continue;
1362 case ESC_s: flags |= CLASS_WHITESPACE; continue;
1363 case ESC_S: flags |= CLASS_NOT_WHITESPACE; continue;
1364 case ESC_w: flags |= CLASS_WORD; continue;
1365 case ESC_W: flags |= CLASS_NOT_WORD; continue;
1366 default:
1367 ptr = save_ptr;
1368 c = *ptr;
1369 break;
1370
1371 case ESC_b: c = '\b'; /* Treat as single character */
1372 break;
1373 }
1374 }
1375 /* Fall through if single character */
1376 }
1377
1378 /* A single character may be followed by '-' to form a range. However,
1379 Perl does not permit ']' to be the end of the range. A '-' character
1380 here is treated as a literal. */
1381
1382 if (ptr[1] == '-' && ptr[2] != ']')
1383 {
1384 int d;
1385 ptr += 2;
1386 d = *ptr;
1387
1388 if (d == 0)
1389 {
1390 *errorptr = "incomplete range";
1391 goto FAILED;
1392 }
1393
1394 /* The second part of a range can be a single-character escape, but
1395 not any of the other escapes. */
1396
1397 if (d == '\\')
1398 {
1399 d = check_escape(&ptr, errorptr);
1400 if (d < 0)
1401 {
1402 if (d == -ESC_b) d = '\b'; else
1403 {
1404 *errorptr = "invalid range";
1405 goto FAILED;
1406 }
1407 }
1408 }
1409
1410 if (d < c)
1411 {
1412 *errorptr = "range out of order";
1413 goto FAILED;
1414 }
1415
1416 if (rangecount >= 255)
1417 {
1418 *errorptr = "too many ranges inside []";
1419 goto FAILED;
1420 }
1421
1422 rangecount++;
1423 *code++ = c;
1424 *code++ = d;
1425 continue;
1426 }
1427
1428 /* Handle a lone single character: save it up for outputting at the
1429 end. Be paranoid and check that the buffer isn't going to overflow. */
1430
1431 if (singles_count >= 255)
1432 {
1433 *errorptr = "too many characters inside []";
1434 goto FAILED;
1435 }
1436 singles[singles_count++] = c;
1437 }
1438
1439 /* Loop until ']' reached; the check for end of string happens inside the
1440 loop. This "while" is the end of the "do" above. */
1441
1442 while ((c = *(++ptr)) != ']');
1443
1444 /* Copy saved single characters, which follow the ranges in the output. */
1445
1446 c = 0;
1447 while (c < singles_count) *code++ = singles[c++];
1448
1449 /* Finally fill in the flags and counts of ranges and single characters,
1450 and advance the pointer past the ]. */
1451
1452 previous[1] = flags;
1453 previous[2] = rangecount;
1454 previous[3] = singles_count;
1455 }
1456 break;
1457
1458 /* Various kinds of repeat */
1459
1460 case '{':
1461 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
1462 if (*errorptr != NULL) goto FAILED;
1463 goto REPEAT;
1464
1465 case '*':
1466 repeat_min = 0;
1467 repeat_max = -1;
1468 goto REPEAT;
1469
1470 case '+':
1471 repeat_min = 1;
1472 repeat_max = -1;
1473 goto REPEAT;
1474
1475 case '?':
1476 repeat_min = 0;
1477 repeat_max = 1;
1478
1479 REPEAT:
1480 if (previous == NULL)
1481 {
1482 *errorptr = "nothing to repeat";
1483 goto FAILED;
1484 }
1485
1486 /* If the next character is '?' this is a minimizing repeat. Advance to the
1487 next character. */
1488
1489 if (ptr[1] == '?') { repeat_type = 1; ptr++; } else repeat_type = 0;
1490
1491 /* If previous was a string of characters, chop off the last one and use it
1492 as the subject of the repeat. If there was only one character, we can
1493 abolish the previous item altogether. */
1494
1495 if (*previous == OP_CHARS)
1496 {
1497 int len = previous[1];
1498 if (len == 1)
1499 {
1500 c = previous[2];
1501 code = previous;
1502 }
1503 else
1504 {
1505 c = previous[len+1];
1506 previous[1]--;
1507 code--;
1508 }
1509 op_type = 0; /* Use single-char op codes */
1510 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
1511 }
1512
1513 /* If previous was a character type match (\d or similar), abolish it and
1514 create a suitable repeat item. The code is shared with single-character
1515 repeats by adding a suitable offset into repeat_type. */
1516
1517 if ((int)*previous < OP_EOD || *previous == OP_ANY)
1518 {
1519 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
1520 c = *previous;
1521 code = previous;
1522
1523 OUTPUT_SINGLE_REPEAT:
1524 repeat_type += op_type; /* Combine both values for many cases */
1525
1526 /* A minimum of zero is handled either as the special case * or ?, or as
1527 an UPTO, with the maximum given. */
1528
1529 if (repeat_min == 0)
1530 {
1531 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
1532 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
1533 else
1534 {
1535 *code++ = OP_UPTO + repeat_type;
1536 *code++ = repeat_max >> 8;
1537 *code++ = (repeat_max & 255);
1538 }
1539 }
1540
1541 /* The case {1,} is handled as the special case + */
1542
1543 else if (repeat_min == 1 && repeat_max == -1)
1544 *code++ = OP_PLUS + repeat_type;
1545
1546 /* The case {n,n} is just an EXACT, while the general case {n,m} is
1547 handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
1548
1549 else
1550 {
1551 if (repeat_min != 1)
1552 {
1553 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
1554 *code++ = repeat_min >> 8;
1555 *code++ = (repeat_min & 255);
1556 }
1557
1558 /* If the mininum is 1 and the previous item was a character string,
1559 we either have to put back the item that got cancelled if the string
1560 length was 1, or add the character back onto the end of a longer
1561 string. For a character type nothing need be done; it will just get put
1562 back naturally. */
1563
1564 else if (*previous == OP_CHARS)
1565 {
1566 if (code == previous) code += 2; else previous[1]++;
1567 }
1568
1569 /* Insert an UPTO if the max is greater than the min. */
1570
1571 if (repeat_max != repeat_min)
1572 {
1573 *code++ = c;
1574 repeat_max -= repeat_min;
1575 *code++ = OP_UPTO + repeat_type;
1576 *code++ = repeat_max >> 8;
1577 *code++ = (repeat_max & 255);
1578 }
1579 }
1580
1581 /* The character or character type itself comes last in all cases. */
1582
1583 *code++ = c;
1584 }
1585
1586 /* If previous was a character class or a back reference, we put the repeat
1587 stuff after it. */
1588
1589 else if (*previous == OP_CLASS || *previous == OP_NEGCLASS ||
1590 *previous == OP_REF)
1591 {
1592 if (repeat_min == 0 && repeat_max == -1)
1593 *code++ = OP_CRSTAR + repeat_type;
1594 else if (repeat_min == 1 && repeat_max == -1)
1595 *code++ = OP_CRPLUS + repeat_type;
1596 else if (repeat_min == 0 && repeat_max == 1)
1597 *code++ = OP_CRQUERY + repeat_type;
1598 else
1599 {
1600 *code++ = OP_CRRANGE + repeat_type;
1601 *code++ = repeat_min >> 8;
1602 *code++ = repeat_min & 255;
1603 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
1604 *code++ = repeat_max >> 8;
1605 *code++ = repeat_max & 255;
1606 }
1607 }
1608
1609 /* If previous was a bracket group, we may have to replicate it in certain
1610 cases. If the maximum repeat count is unlimited, check that the bracket
1611 group cannot match the empty string, and diagnose an error if it can. */
1612
1613 else if ((int)*previous >= OP_BRA)
1614 {
1615 int i;
1616 int length = code - previous;
1617
1618 if (repeat_max == -1 && could_be_empty(previous))
1619 {
1620 *errorptr = "operand of unlimited repeat could match the empty string";
1621 goto FAILED;
1622 }
1623
1624 /* If the minimum is greater than zero, and the maximum is unlimited or
1625 equal to the minimum, the first copy remains where it is, and is
1626 replicated up to the minimum number of times. This case includes the +
1627 repeat, but of course no replication is needed in that case. */
1628
1629 if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))
1630 {
1631 for (i = 1; i < repeat_min; i++)
1632 {
1633 memcpy(code, previous, length);
1634 code += length;
1635 }
1636 }
1637
1638 /* If the minimum is zero, stick BRAZERO in front of the first copy.
1639 Then, if there is a fixed upper limit, replicated up to that many times,
1640 sticking BRAZERO in front of all the optional ones. */
1641
1642 else
1643 {
1644 if (repeat_min == 0)
1645 {
1646 memmove(previous+1, previous, length);
1647 code++;
1648 *previous++ = OP_BRAZERO + repeat_type;
1649 }
1650
1651 for (i = 1; i < repeat_min; i++)
1652 {
1653 memcpy(code, previous, length);
1654 code += length;
1655 }
1656
1657 for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)
1658 {
1659 *code++ = OP_BRAZERO + repeat_type;
1660 memcpy(code, previous, length);
1661 code += length;
1662 }
1663 }
1664
1665 /* If the maximum is unlimited, set a repeater in the final copy. */
1666
1667 if (repeat_max == -1) code[-3] = OP_KETRMAX + repeat_type;
1668 }
1669
1670 /* Else there's some kind of shambles */
1671
1672 else
1673 {
1674 *errorptr = "internal error 1 (unexpected repeat)";
1675 goto FAILED;
1676 }
1677
1678 /* In all case we no longer have a previous item. */
1679
1680 previous = NULL;
1681 break;
1682
1683
1684 /* Start of nested bracket sub-expression, or comment or lookahead.
1685 First deal with special things that can come after a bracket; all are
1686 introduced by ?, and the appearance of any of them means that this is not a
1687 referencing group. They were checked for validity in the first pass over
1688 the string, so we don't have to check for syntax errors here. */
1689
1690 case '(':
1691 previous = code; /* Only real brackets can be repeated */
1692 if (*(++ptr) == '?')
1693 {
1694 bravalue = OP_BRA;
1695
1696 switch (*(++ptr))
1697 {
1698 case '#':
1699 case 'i':
1700 case 'm':
1701 case 's':
1702 case 'x':
1703 ptr++;
1704 while (*ptr != ')') ptr++;
1705 previous = NULL;
1706 continue;
1707
1708 case ':': /* Non-extracting bracket */
1709 ptr++;
1710 break;
1711
1712 case '=': /* Assertions can't be repeated */
1713 bravalue = OP_ASSERT;
1714 ptr++;
1715 previous = NULL;
1716 break;
1717
1718 case '!':
1719 bravalue = OP_ASSERT_NOT;
1720 ptr++;
1721 previous = NULL;
1722 break;
1723
1724 case ('P'):
1725 ptr++;
1726 if (*ptr=='<')
1727 {
1728 /* (?P<groupname>...) */
1729 int idlen;
1730 PyObject *string, *intobj;
1731
1732 ptr++;
1733 idlen = get_group_id(ptr, '>', errorptr);
1734 if (*errorptr) {
1735 goto FAILED;
1736 }
1737 string = PyString_FromStringAndSize(ptr, idlen);
1738 intobj = PyInt_FromLong( brackets[0] );
1739 if (intobj == NULL || string==NULL)
1740 {
1741 Py_XDECREF(string);
1742 Py_XDECREF(intobj);
1743 *errorptr = "exception raised";
1744 goto FAILED;
1745 }
1746 PyDict_SetItem(dictionary, string, intobj);
1747 Py_DECREF(string); Py_DECREF(intobj);
1748 ptr += idlen+1; /* Point to rest of expression */
1749 goto do_grouping_bracket;
1750 }
1751 if (*ptr=='=')
1752 {
1753 /* (?P=groupname) */
1754 int idlen, refnum;
1755 PyObject *string, *intobj;
1756
1757 ptr++;
1758 idlen = get_group_id(ptr, ')', errorptr);
1759 if (*errorptr) {
1760 goto FAILED;
1761 }
1762 string = PyString_FromStringAndSize(ptr, idlen);
1763 if (string==NULL)
1764 {
1765 Py_XDECREF(string);
1766 *errorptr = "exception raised";
1767 goto FAILED;
1768 }
1769 intobj = PyDict_GetItem(dictionary, string);
1770 if (intobj==NULL) {
1771 *errorptr = "?P= group identifier isn't defined";
1772 goto FAILED;
1773 }
1774
1775 refnum = PyInt_AsLong(intobj);
1776 Py_DECREF(string); Py_DECREF(intobj);
1777 *code++ = OP_REF;
1778 *code++ = refnum;
1779 /* The continue will cause the top-level for() loop to
1780 be resumed, so ptr will be immediately incremented.
1781 Therefore, the following line adds just idlen, not
1782 idlen+1 */
1783 ptr += idlen;
1784 continue;
1785 }
1786 /* The character after ?P is neither < nor =, so
1787 report an error. Add more Python-extensions here. */
1788 *errorptr="unknown after (?P";
1789 goto FAILED;
1790 break;
1791 default:
1792 *errorptr = "unknown after (?";
1793 goto FAILED;
1794 }
1795 }
1796
1797 /* Else we have a referencing group */
1798
1799 else
1800 {
1801 do_grouping_bracket:
1802 if (brackets[0] > EXTRACT_MAX)
1803 {
1804 *errorptr = "too many extraction brackets";
1805 goto FAILED;
1806 }
1807 brackets[1] = brackets[0];
1808 bravalue = OP_BRA + brackets[0]++;
1809 }
1810
1811 /* Process nested bracketed re; at end pointer is on the bracket. We copy
1812 code into a non-register variable in order to be able to pass its address
1813 because some compilers complain otherwise. */
1814
1815 *code = bravalue;
1816 {
1817 uschar *mcode = code;
1818 if (!compile_regexp(extended, brackets, &mcode, &ptr, errorptr, dictionary))
1819 goto FAILED;
1820 code = mcode;
1821 }
1822
1823 if (*ptr != ')')
1824 {
1825 *errorptr = "missing )";
1826 goto FAILED;
1827 }
1828 break;
1829
1830 /* Check \ for being a real metacharacter; if not, fall through and handle
1831 it as a data character at the start of a string. Escape items are checked
1832 for validity in the pre-compiling pass. */
1833
1834 case '\\':
1835 oldptr = ptr;
1836 c = check_escape(&ptr, errorptr);
1837
1838 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1839 are arranged to be the negation of the corresponding OP_values. For the
1840 back references, the values are ESC_REF plus the reference number. Only
1841 back references and those types that consume a character may be repeated.
1842 We can test for values between ESC_b and ESC_Z for the latter; this may
1843 have to change if any new ones are ever created. */
1844
1845 if (c < 0)
1846 {
1847 if (-c >= ESC_REF)
1848 {
1849 int refnum = -c -ESC_REF;
1850 if (brackets[1] < refnum ) {
1851 *errorptr = "backreference to non-existent group";
1852 goto FAILED;
1853 }
1854 previous = code;
1855 *code++ = OP_REF;
1856 *code++ = refnum;
1857 }
1858 else
1859 {
1860 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
1861 *code++ = -c;
1862 }
1863 continue;
1864 }
1865
1866 /* Reset and fall through */
1867
1868 ptr = oldptr;
1869 c = '\\';
1870
1871 /* Handle a run of data characters until a metacharacter is encountered.
1872 The first character is guaranteed not to be whitespace or # when the
1873 extended flag is set. */
1874
1875 default:
1876 previous = code;
1877 *code = OP_CHARS;
1878 code += 2;
1879 length = 0;
1880
1881 do
1882 {
1883 if (extended)
1884 {
1885 if ((pcre_ctypes[c] & ctype_space) != 0) continue;
1886 if (c == '#')
1887 {
1888 while ((c = *(++ptr)) != 0 && c != '\n');
1889 if (c == 0) break;
1890 continue;
1891 }
1892 }
1893
1894 /* Backslash may introduce a data char or a metacharacter. Escaped items
1895 are checked for validity in the pre-compiling pass. Stop the string
1896 before a metaitem. */
1897
1898 if (c == '\\')
1899 {
1900 oldptr = ptr;
1901 c = check_escape(&ptr, errorptr);
1902 if (c < 0) { ptr = oldptr; break; }
1903 }
1904
1905 /* Ordinary character or single-char escape */
1906
1907 *code++ = c;
1908 length++;
1909 }
1910
1911 /* This "while" is the end of the "do" above. */
1912
1913 while (length < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
1914
1915 /* Compute the length and set it in the data vector, and advance to
1916 the next state. */
1917
1918 previous[1] = length;
1919 ptr--;
1920 break;
1921 }
1922 } /* end of big loop */
1923
1924/* Control never reaches here by falling through, only by a goto for all the
1925error states. Pass back the position in the pattern so that it can be displayed
1926to the user for diagnosing the error. */
1927
1928FAILED:
1929*ptrptr = ptr;
1930return FALSE;
1931}
1932
1933
1934
1935
1936/*************************************************
1937* Compile sequence of alternatives *
1938*************************************************/
1939
1940/* On entry, ptr is pointing past the bracket character, but on return
1941it points to the closing bracket, or vertical bar, or end of string.
1942The code variable is pointing at the byte into which the BRA operator has been
1943stored.
1944
1945Argument:
1946 extended TRUE if PCRE_EXTENDED was set
1947 brackets -> 2-element vector containing next and top bracket numbers
1948 codeptr -> the address of the current code pointer
1949 ptrptr -> the address of the current pattern pointer
1950 errorptr -> pointer to error message
1951
1952Returns: TRUE on success
1953*/
1954
1955static BOOL
1956compile_regexp(BOOL extended, int *brackets, uschar **codeptr,
1957 uschar **ptrptr, char **errorptr, PyObject *dictionary)
1958{
1959uschar *ptr = *ptrptr;
1960uschar *code = *codeptr;
1961uschar *start_bracket = code;
1962
1963for (;;)
1964 {
1965 int length;
1966 uschar *last_branch = code;
1967
1968 code += 3;
1969 if (!compile_branch(extended, brackets, &code, &ptr, errorptr, dictionary))
1970 {
1971 *ptrptr = ptr;
1972 return FALSE;
1973 }
1974
1975 /* Fill in the length of the last branch */
1976
1977 length = code - last_branch;
1978 last_branch[1] = length >> 8;
1979 last_branch[2] = length & 255;
1980
1981 /* Reached end of expression, either ')' or end of pattern. Insert a
1982 terminating ket and the length of the whole bracketed item, and return,
1983 leaving the pointer at the terminating char. */
1984
1985 if (*ptr != '|')
1986 {
1987 length = code - start_bracket;
1988 *code++ = OP_KET;
1989 *code++ = length >> 8;
1990 *code++ = length & 255;
1991 *codeptr = code;
1992 *ptrptr = ptr;
1993 return TRUE;
1994 }
1995
1996 /* Another branch follows; insert an "or" node and advance the pointer. */
1997
1998 *code = OP_ALT;
1999 ptr++;
2000 }
2001/* Control never reaches here */
2002}
2003
2004
2005
2006/*************************************************
2007* Check for anchored expression *
2008*************************************************/
2009
2010/* Try to find out if this is an anchored regular expression. Consider each
2011alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
2012all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
2013it's anchored. However, if this is a multiline pattern, then only OP_SOD
2014counts, since OP_CIRC can match in the middle.
2015
2016A branch is also implicitly anchored if it starts with .* because that will try
2017the rest of the pattern at all possible matching points, so there is no point
2018trying them again.
2019
2020Argument: points to start of expression (the bracket)
2021Returns: TRUE or FALSE
2022*/
2023
2024static BOOL
2025is_anchored(register uschar *code, BOOL multiline)
2026{
2027do {
2028 int op = (int)code[3];
2029 if (op >= OP_BRA || op == OP_ASSERT)
2030 { if (!is_anchored(code+3, multiline)) return FALSE; }
2031 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
2032 { if (code[4] != OP_ANY) return FALSE; }
2033 else if (op != OP_SOD && (multiline || op != OP_CIRC)) return FALSE;
2034 code += (code[1] << 8) + code[2];
2035 }
2036while (*code == OP_ALT);
2037return TRUE;
2038}
2039
2040
2041
2042/*************************************************
2043* Check for start with \n line expression *
2044*************************************************/
2045
2046/* This is called for multiline expressions to try to find out if every branch
2047starts with ^ so that "first char" processing can be done to speed things up.
2048
2049Argument: points to start of expression (the bracket)
2050Returns: TRUE or FALSE
2051*/
2052
2053static BOOL
2054is_startline(uschar *code)
2055{
2056do {
2057 if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
2058 { if (!is_startline(code+3)) return FALSE; }
2059 else if (code[3] != OP_CIRC) return FALSE;
2060 code += (code[1] << 8) + code[2];
2061 }
2062while (*code == OP_ALT);
2063return TRUE;
2064}
2065
2066
2067
2068/*************************************************
2069* Check for fixed first char *
2070*************************************************/
2071
2072/* Try to find out if there is a fixed first character. This is called for
2073unanchored expressions, as it speeds up their processing quite considerably.
2074Consider each alternative branch. If they all start with the same char, or with
2075a bracket all of whose alternatives start with the same char (recurse ad lib),
2076then we return that char, otherwise -1.
2077
2078Argument: points to start of expression (the bracket)
2079Returns: -1 or the fixed first char
2080*/
2081
2082static int
2083find_firstchar(uschar *code)
2084{
2085register int c = -1;
2086do
2087 {
2088 register int charoffset = 4;
2089
2090 if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
2091 {
2092 register int d;
2093 if ((d = find_firstchar(code+3)) < 0) return -1;
2094 if (c < 0) c = d; else if (c != d) return -1;
2095 }
2096
2097 else switch(code[3])
2098 {
2099 default:
2100 return -1;
2101
2102 case OP_EXACT: /* Fall through */
2103 charoffset++;
2104
2105 case OP_CHARS: /* Fall through */
2106 charoffset++;
2107
2108 case OP_PLUS:
2109 case OP_MINPLUS:
2110 if (c < 0) c = code[charoffset]; else if (c != code[charoffset]) return -1;
2111 break;
2112 }
2113 code += (code[1] << 8) + code[2];
2114 }
2115while (*code == OP_ALT);
2116return c;
2117}
2118
2119
2120
2121/*************************************************
2122* Compile a Regular Expression *
2123*************************************************/
2124
2125/* This function takes a string and returns a pointer to a block of store
2126holding a compiled version of the expression.
2127
2128Arguments:
2129 pattern the regular expression
2130 options various option bits
2131 errorptr pointer to pointer to error text
2132 erroroffset ptr offset in pattern where error was detected
2133
2134Returns: pointer to compiled data block, or NULL on error,
2135 with errorptr and erroroffset set
2136*/
2137
2138pcre *
2139pcre_compile(char *pattern, int options, char **errorptr, int
2140 *erroroffset, PyObject *dictionary)
2141{
2142real_pcre *re;
2143int spaces = 0;
2144int length = 3; /* For initial BRA plus length */
2145int runlength;
2146int c, size;
2147int brackets[2];
2148int brastack[200];
2149int brastackptr = 0;
2150BOOL extended = (options & PCRE_EXTENDED) != 0;
2151uschar *code, *ptr;
2152
2153#ifdef DEBUG
2154uschar *code_base, *code_end;
2155#endif
2156
2157/* Miscellaneous initialization; the copy the error pointers into static
2158variables so all functions can access them. */
2159
2160brackets[0] = 1; /* Next bracket number */
2161brackets[1] = 0; /* Highest used bracket number */
2162
2163*errorptr = NULL;
2164*erroroffset = 0;
2165
2166if ((options & ~PUBLIC_OPTIONS) != 0)
2167 {
2168 *errorptr = "unknown option bit(s) set";
2169 return NULL;
2170 }
2171
2172#ifdef DEBUG
2173printf("------------------------------------------------------------------\n");
2174printf("%s\n", pattern);
2175#endif
2176
2177/* The first thing to do is to make a pass over the pattern to compute the
2178amount of store required to hold the compiled code. This does not have to be
2179perfect as long as errors are overestimates. At the same time we can detect any
2180internal flag settings. Make an attempt to correct for any counted white space
2181if an "extended" flag setting appears late in the pattern. We can't be so
2182clever for #-comments. */
2183
2184ptr = (uschar *)(pattern - 1);
2185while ((c = *(++ptr)) != 0)
2186 {
2187 int min, max;
2188
2189 if ((pcre_ctypes[c] & ctype_space) != 0)
2190 {
2191 if (extended) continue;
2192 spaces++;
2193 }
2194
2195 if (extended && c == '#')
2196 {
2197 while ((c = *(++ptr)) != 0 && c != '\n');
2198 continue;
2199 }
2200
2201 switch(c)
2202 {
2203 /* A backslashed item may be an escaped "normal" character or a
2204 character type. For a "normal" character, put the pointers and
2205 character back so that tests for whitespace etc. in the input
2206 are done correctly. */
2207
2208 case '\\':
2209 {
2210 uschar *save_ptr = ptr;
2211 c = check_escape(&ptr, errorptr);
2212 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2213 if (c >= 0)
2214 {
2215 ptr = save_ptr;
2216 c = '\\';
2217 goto NORMAL_CHAR;
2218 }
2219 }
2220 length++;
2221
2222 /* A back reference needs an additional char, plus either one or 5
2223 bytes for a repeat. */
2224
2225 if (c <= -ESC_REF)
2226 {
2227 length++; /* For single back reference */
2228 if (ptr[1] == '{')
2229 {
2230 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
2231 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2232 if ((min == 0 && (max == 1 || max == -1)) ||
2233 (min == 1 && max == -1))
2234 length++;
2235 else length += 5;
2236 if (ptr[1] == '?') ptr++;
2237 }
2238 }
2239 continue;
2240
2241 case '^':
2242 case '.':
2243 case '$':
2244 case '*': /* These repeats won't be after brackets; */
2245 case '+': /* those are handled separately */
2246 case '?':
2247 length++;
2248 continue;
2249
2250 /* This covers the cases of repeats after a single char, metachar, class,
2251 or back reference. */
2252
2253 case '{':
2254 ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);
2255 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2256 if ((min == 0 && (max == 1 || max == -1)) ||
2257 (min == 1 && max == -1))
2258 length++;
2259 else
2260 {
2261 length--; /* Uncount the original char or metachar */
2262 if (min == 1) length++; else if (min > 0) length += 4;
2263 if (max > 0) length += 4; else length += 2;
2264 }
2265 if (ptr[1] == '?') ptr++;
2266 continue;
2267
2268 /* An alternation contains an offset to the next branch or ket. */
2269 case '|':
2270 length += 3;
2271 continue;
2272
2273 /* A character class uses 4 characters plus the characters in it. Don't
2274 worry about character types that aren't allowed in classes - they'll get
2275 picked up during the compile. */
2276
2277 case '[':
2278 length += 4;
2279 if (ptr[1] == '^') ptr++;
2280 do
2281 {
2282 if (*(++ptr) == '\\')
2283 {
2284 (void)check_escape(&ptr, errorptr);
2285 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2286 }
2287 length++;
2288 }
2289 while (*ptr != 0 && *ptr != ']');
2290
2291 /* A repeat needs either 1 or 5 bytes. */
2292
2293 if (ptr[1] == '{')
2294 {
2295 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
2296 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2297 if ((min == 0 && (max == 1 || max == -1)) ||
2298 (min == 1 && max == -1))
2299 length++;
2300 else length += 5;
2301 if (ptr[1] == '?') ptr++;
2302 }
2303 continue;
2304
2305 /* Brackets may be genuine groups or special things */
2306
2307 case '(':
2308
2309 /* Handle special forms of bracket, which all start (? */
2310
2311 if (ptr[1] == '?') switch (c = ptr[2])
2312 {
2313 /* Skip over comments entirely */
2314 case '#':
2315 ptr += 3;
2316 while (*ptr != 0 && *ptr != ')') ptr++;
2317 if (*ptr == 0)
2318 {
2319 *errorptr = "missing ) after comment";
2320 goto PCRE_ERROR_RETURN;
2321 }
2322 continue;
2323
2324 /* Non-referencing groups and lookaheads just move the pointer on, and
2325 then behave like a non-special bracket. */
2326
2327 case ':':
2328 case '=':
2329 case '!':
2330 ptr += 2;
2331 break;
2332
2333 /* Else loop setting valid options until ) is met. Anything else is an
2334 error. */
2335
2336 case ('P'):
2337 {
2338 int idlen;
2339 switch (*ptr++) {
2340 case ('<'):
2341 idlen = get_group_id(ptr++, '>', errorptr);
2342 if (*errorptr) goto PCRE_ERROR_RETURN;
2343 ptr += idlen+1;
2344 break;
2345 case ('='):
2346 idlen = get_group_id(ptr++, ')', errorptr);
2347 if (*errorptr) goto PCRE_ERROR_RETURN;
2348 ptr += idlen+1;
2349 length++;
2350 break;
2351 }
2352 }
2353 break;
2354
2355 default:
2356 ptr += 2;
2357 for (;; ptr++)
2358 {
2359 if ((c = *ptr) == 'i')
2360 {
2361 options |= PCRE_CASELESS;
2362 continue;
2363 }
2364 else if ((c = *ptr) == 'm')
2365 {
2366 options |= PCRE_MULTILINE;
2367 continue;
2368 }
2369 else if ((c = *ptr) == 's')
2370 {
2371 options |= PCRE_DOTALL;
2372 continue;
2373 }
2374 else if (c == 'x')
2375 {
2376 options |= PCRE_EXTENDED;
2377 extended = TRUE;
2378 length -= spaces; /* Already counted spaces */
2379 continue;
2380 }
2381 else if (c == ')') break;
2382
2383 *errorptr = "undefined after (?";
2384 goto PCRE_ERROR_RETURN;
2385 }
2386 continue; /* End of this bracket handling */
2387 }
2388
2389 /* Non-special forms of bracket. Save length for computing whole length
2390 at end if there's a repeat that requires duplication of the group. */
2391
2392 if (brastackptr >= sizeof(brastack)/sizeof(int))
2393 {
2394 *errorptr = "too many brackets";
2395 goto PCRE_ERROR_RETURN;
2396 }
2397
2398 brastack[brastackptr++] = length;
2399 length += 3;
2400 continue;
2401
2402 /* Handle ket. Look for subsequent max/min; for certain sets of values we
2403 have to replicate this bracket up to that many times. */
2404
2405 case ')':
2406 length += 3;
2407 {
2408 int min = 1;
2409 int max = 1;
2410 int duplength = length - brastack[--brastackptr];
2411
2412 /* Leave ptr at the final char; for read_repeat_counts this happens
2413 automatically; for the others we need an increment. */
2414
2415 if ((c = ptr[1]) == '{')
2416 {
2417 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
2418 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2419 }
2420 else if (c == '*') { min = 0; max = -1; ptr++; }
2421 else if (c == '+') { max = -1; ptr++; }
2422 else if (c == '?') { min = 0; ptr++; }
2423
2424 /* If there is a minimum > 1 we have to replicate up to min-1 times; if
2425 there is a limited maximum we have to replicate up to max-1 times and
2426 allow for a BRAZERO item before each optional copy, as we also have to
2427 do before the first copy if the minimum is zero. */
2428
2429 if (min == 0) length++;
2430 else if (min > 1) length += (min - 1) * duplength;
2431 if (max > min) length += (max - min) * (duplength + 1);
2432 }
2433
2434 continue;
2435
2436 /* Non-special character. For a run of such characters the length required
2437 is the number of characters + 2, except that the maximum run length is 255.
2438 We won't get a skipped space or a non-data escape or the start of a #
2439 comment as the first character, so the length can't be zero. */
2440
2441 NORMAL_CHAR:
2442 default:
2443 length += 2;
2444 runlength = 0;
2445 do
2446 {
2447 if ((pcre_ctypes[c] & ctype_space) != 0)
2448 {
2449 if (extended) continue;
2450 spaces++;
2451 }
2452
2453 if (extended && c == '#')
2454 {
2455 while ((c = *(++ptr)) != 0 && c != '\n');
2456 continue;
2457 }
2458
2459 /* Backslash may introduce a data char or a metacharacter; stop the
2460 string before the latter. */
2461
2462 if (c == '\\')
2463 {
2464 uschar *saveptr = ptr;
2465 c = check_escape(&ptr, errorptr);
2466 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2467 if (c < 0) { ptr = saveptr; break; }
2468 }
2469
2470 /* Ordinary character or single-char escape */
2471
2472 runlength++;
2473 }
2474
2475 /* This "while" is the end of the "do" above. */
2476
2477 while (runlength < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
2478
2479 ptr--;
2480 length += runlength;
2481 continue;
2482 }
2483 }
2484
2485length += 4; /* For final KET and END */
2486
2487if (length > 65539)
2488 {
2489 *errorptr = "regular expression too large";
2490 return NULL;
2491 }
2492
2493/* Compute the size of data block needed and get it, either from malloc or
2494externally provided function. Put in the magic number and the options. */
2495
2496size = length + sizeof(real_pcre) - sizeof(re->code);
2497re = (real_pcre *)(pcre_malloc)(size);
2498
2499if (re == NULL)
2500 {
2501 *errorptr = "failed to get memory";
2502 return NULL;
2503 }
2504
2505re->magic_number = MAGIC_NUMBER;
2506re->options = options;
2507
2508/* Set up a starting, non-extracting bracket, then compile the expression. On
2509error, *errorptr will be set non-NULL, so we don't need to look at the result
2510of the function here. */
2511
2512ptr = (uschar *)pattern;
2513code = re->code;
2514*code = OP_BRA;
2515(void)compile_regexp(extended, brackets, &code, &ptr, errorptr, dictionary);
2516re->top_bracket = brackets[1];
2517
2518/* If not reached end of pattern on success, there's an excess bracket. */
2519
2520if (*errorptr == NULL && *ptr != 0) *errorptr = "unmatched brackets";
2521/* Fill in the terminating state and check for disastrous overflow, but
2522if debugging, leave the test till after things are printed out. */
2523
2524*code++ = OP_END;
2525
2526#ifndef DEBUG
2527if (code - re->code > length) *errorptr = "internal error: code overflow";
2528#endif
2529
2530/* Failed to compile */
2531
2532if (*errorptr != NULL)
2533 {
2534 (pcre_free)(re);
2535 PCRE_ERROR_RETURN:
2536 *erroroffset = ptr - (uschar *)pattern;
2537 return NULL;
2538 }
2539
2540/* If the anchored option was not passed, set flag if we can determine that it
2541is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if
2542we can determine what the first character has to be, because that speeds up
2543unanchored matches no end. In the case of multiline matches, an alternative is
2544to set the PCRE_STARTLINE flag if all branches start with ^. */
2545
2546if ((options & PCRE_ANCHORED) == 0)
2547 {
2548 if (is_anchored(re->code, (options & PCRE_MULTILINE) != 0))
2549 re->options |= PCRE_ANCHORED;
2550 else
2551 {
2552 int c = find_firstchar(re->code);
2553 if (c >= 0)
2554 {
2555 re->first_char = c;
2556 re->options |= PCRE_FIRSTSET;
2557 }
2558 else if (is_startline(re->code))
2559 re->options |= PCRE_STARTLINE;
2560 }
2561 }
2562
2563/* Print out the compiled data for debugging */
2564
2565#ifdef DEBUG
2566
2567printf("Length = %d top_bracket = %d%s%s%s%s\n",
2568 length, re->top_bracket,
2569 ((re->options & PCRE_ANCHORED) != 0)? " anchored" : "",
2570 ((re->options & PCRE_CASELESS) != 0)? " caseless" : "",
2571 extended? " extended" : "",
2572 ((re->options & PCRE_MULTILINE) != 0)? " multiline" : "");
2573
2574if ((re->options & PCRE_FIRSTSET) != 0)
2575 {
2576 if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
2577 else printf("First char = \\x%02x\n", re->first_char);
2578 }
2579
2580code_end = code;
2581code_base = code = re->code;
2582
2583while (code < code_end)
2584 {
2585 int charlength;
2586
2587 printf("%3d ", code - code_base);
2588
2589 if (*code >= OP_BRA)
2590 {
2591 printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
2592 code += 2;
2593 }
2594
2595 else switch(*code)
2596 {
2597 case OP_CHARS:
2598 charlength = *(++code);
2599 printf("%3d ", charlength);
2600 while (charlength-- > 0)
2601 if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
2602 break;
2603
2604 case OP_KETRMAX:
2605 case OP_KETRMIN:
2606 case OP_ALT:
2607 case OP_KET:
2608 case OP_ASSERT:
2609 case OP_ASSERT_NOT:
2610 printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2611 code += 2;
2612 break;
2613
2614 case OP_STAR:
2615 case OP_MINSTAR:
2616 case OP_PLUS:
2617 case OP_MINPLUS:
2618 case OP_QUERY:
2619 case OP_MINQUERY:
2620 case OP_TYPESTAR:
2621 case OP_TYPEMINSTAR:
2622 case OP_TYPEPLUS:
2623 case OP_TYPEMINPLUS:
2624 case OP_TYPEQUERY:
2625 case OP_TYPEMINQUERY:
2626 if (*code >= OP_TYPESTAR)
2627 printf(" %s", OP_names[code[1]]);
2628 else if (isprint(c = code[1])) printf(" %c", c);
2629 else printf(" \\x%02x", c);
2630 printf("%s", OP_names[*code++]);
2631 break;
2632
2633 case OP_EXACT:
2634 case OP_UPTO:
2635 case OP_MINUPTO:
2636 if (isprint(c = code[3])) printf(" %c{", c);
2637 else printf(" \\x%02x{", c);
2638 if (*code != OP_EXACT) printf(",");
2639 printf("%d}", (code[1] << 8) + code[2]);
2640 if (*code == OP_MINUPTO) printf("?");
2641 code += 3;
2642 break;
2643
2644 case OP_TYPEEXACT:
2645 case OP_TYPEUPTO:
2646 case OP_TYPEMINUPTO:
2647 printf(" %s{", OP_names[code[3]]);
2648 if (*code != OP_TYPEEXACT) printf(",");
2649 printf("%d}", (code[1] << 8) + code[2]);
2650 if (*code == OP_TYPEMINUPTO) printf("?");
2651 code += 3;
2652 break;
2653
2654 case OP_REF:
2655 printf(" \\%d", *(++code));
2656 break;
2657
2658 case OP_CLASS:
2659 case OP_NEGCLASS:
2660 {
2661 int i, min, max;
2662 int flags = code[1];
2663 int rangecount = code[2];
2664 int charcount = code[3];
2665
2666 printf(" [%s", (*code == OP_CLASS)? "" : "^");
2667 code += 3;
2668
2669 for (i = 0; i < 8; i++)
2670 if ((flags & (1 << i)) != 0) printf("%s", class_names[i]);
2671
2672 for (i = 0; i < rangecount; i++)
2673 {
2674 if (isprint(*(++code))) printf("%c-", *code); else printf("\\x%02x-", *code);
2675 if (isprint(*(++code))) printf("%c", *code); else printf("\\x%02x", *code);
2676 }
2677
2678 for (i = 0; i < charcount; i++)
2679 {
2680 if (!isprint(*(++code))) printf("\\x%02x", *code);
2681 else if (strchr("-\\]", *code) != NULL) printf("\\%c", *code);
2682 else printf("%c", *code);
2683 }
2684 printf("]");
2685
2686 switch(*(++code))
2687 {
2688 case OP_CRSTAR:
2689 case OP_CRMINSTAR:
2690 case OP_CRPLUS:
2691 case OP_CRMINPLUS:
2692 case OP_CRQUERY:
2693 case OP_CRMINQUERY:
2694 printf("%s", OP_names[*code]);
2695 break;
2696
2697 case OP_CRRANGE:
2698 case OP_CRMINRANGE:
2699 min = (code[1] << 8) + code[2];
2700 max = (code[3] << 8) + code[4];
2701 if (max == 0) printf("{%d,}", min);
2702 else printf("{%d,%d}", min, max);
2703 if (*code == OP_CRMINRANGE) printf("?");
2704 code += 4;
2705 break;
2706
2707 default:
2708 code--;
2709 }
2710 }
2711 break;
2712
2713 /* Anything else is just a one-node item */
2714
2715 default:
2716 printf(" %s", OP_names[*code]);
2717 break;
2718 }
2719
2720 code++;
2721 printf("\n");
2722 }
2723printf("------------------------------------------------------------------\n");
2724
2725/* This check is done here in the debugging case so that the code that
2726was compiled can be seen. */
2727
2728if (code - re->code > length)
2729 {
2730 *errorptr = "internal error: code overflow";
2731 (pcre_free)(re);
2732 *erroroffset = ptr - (uschar *)pattern;
2733 return NULL;
2734 }
2735#endif
2736
2737return (pcre *)re;
2738}
2739
2740
2741
2742/*************************************************
2743* Match a character type *
2744*************************************************/
2745
2746/* Not used in all the places it might be as it's sometimes faster
2747to put the code inline.
2748
2749Arguments:
2750 type the character type
2751 c the character
2752 multiline the multiline flag
2753
2754Returns: TRUE if character is of the type
2755*/
2756
2757static BOOL
2758match_type(int type, int c, BOOL dotall)
2759{
2760
2761#ifdef DEBUG
2762if (isprint(c)) printf("matching subject %c against ", c);
2763 else printf("matching subject \\x%02x against ", c);
2764printf("%s\n", OP_names[type]);
2765#endif
2766
2767switch(type)
2768 {
2769 case OP_ANY: return dotall || c != '\n';
2770 case OP_NOT_DIGIT: return (pcre_ctypes[c] & ctype_digit) == 0;
2771 case OP_DIGIT: return (pcre_ctypes[c] & ctype_digit) != 0;
2772 case OP_NOT_WHITESPACE: return (pcre_ctypes[c] & ctype_space) == 0;
2773 case OP_WHITESPACE: return (pcre_ctypes[c] & ctype_space) != 0;
2774 case OP_NOT_WORDCHAR: return (pcre_ctypes[c] & ctype_word) == 0;
2775 case OP_WORDCHAR: return (pcre_ctypes[c] & ctype_word) != 0;
2776 }
2777return FALSE;
2778}
2779
2780
2781/*************************************************
2782* Match a character class *
2783*************************************************/
2784
2785/* Return "result" if char is in the class and "!result" otherwise.
2786
2787Arguments:
2788 data points to the class item
2789 c the subject character
2790 result value to return if in class
2791 md matching "static" data
2792
2793Returns: result or !result
2794*/
2795
2796static BOOL
2797match_class(register uschar *data, register int c, BOOL result, match_data *md)
2798{
2799int flags = data[1];
2800int i;
2801uschar *base = data;
2802uschar *end;
2803
2804#ifdef DEBUG
2805 {
2806 uschar *d = base + 3;
2807
2808 if (isprint(c))
2809 printf("match %c against [%s", c, result? "" : "^");
2810 else
2811 printf("match \\x%02x against [%s", c, result? "" : "^");
2812
2813 for (i = 0; i < 8; i++)
2814 if ((flags & (1 << i)) != 0) printf("%s", class_names[i]);
2815
2816 for (i = 0; i < data[2]; i++)
2817 {
2818 if (isprint(*(++d))) printf("%c-", *d); else printf("\\x%02x-", *d);
2819 if (isprint(*(++d))) printf("%c", *d); else printf("\\x%02x", *d);
2820 }
2821
2822 for (i = 0; i < data[3]; i++)
2823 {
2824 if (!isprint(*(++d))) printf("\\x%02x", *d);
2825 else if (strchr("-\\]", *d) != NULL) printf("\\%c", *d);
2826 else printf("%c", *d);
2827 }
2828 printf("]\n");
2829 }
2830#endif
2831
2832/* Test for any character types */
2833
2834for (i = 0; flags != 0; i++)
2835 {
2836 if ((flags & 1) != 0 && match_type(class_ops[i], c, md->dotall))
2837 return result;
2838 flags >>= 1;
2839 }
2840
2841/* Advance pointer to the specific chars and do the caseless or caseful testing
2842of the ranges and individual characters as necessary. */
2843
2844data += 4;
2845end = data + base[2] * 2;
2846
2847/* Caseless character ranges are slightly tricky, because of cases like [W-c].
2848What we do is to uppercase the subject char if it is beyond the end of the
2849range, or lowercase it if it is before the start of the range and try again if
2850a caseful comparison has failed. This works because upper case letters come
2851before lower case in ASCII code. It would not work in EBCDIC, for example,
2852where they are the other way round, but then ranges like [W-c] would be illegal
2853in EBCDIC. */
2854
2855if (md->caseless)
2856 {
2857 while (data < end)
2858 {
2859 register int d;
2860 if (c >= (int)*data && c <= (int)data[1]) return result;
2861 d = (c < (int)*data)? pcre_lcc[c] : pcre_ucc[c];
2862 if (d >= (int)*data && d <= (int)data[1]) return result;
2863 data += 2;
2864 }
2865 end += base[3];
2866 c = pcre_lcc[c];
2867 while (data < end) if (c == pcre_lcc[*data++]) return result;
2868 }
2869
2870/* Caseful is easy */
2871
2872else
2873 {
2874 while (data < end)
2875 {
2876 if (c >= (int)*data && c <= (int)data[1]) return result;
2877 data += 2;
2878 }
2879 end += base[3];
2880 while (data < end) if (c == *data++) return result;
2881 }
2882
2883/* Character is not in the class */
2884
2885return !result;
2886}
2887
2888
2889
2890/*************************************************
2891* Match a back-reference *
2892*************************************************/
2893
2894/* If a back reference hasn't been set, the match fails.
2895
2896Arguments:
2897 number reference number
2898 eptr points into the subject
2899 length length to be matched
2900 md points to match data block
2901
2902Returns: TRUE if matched
2903*/
2904
2905static BOOL
2906match_ref(int number, register uschar *eptr, int length, match_data *md)
2907{
2908uschar *p = md->start_subject + md->offset_vector[number];
2909
2910#ifdef DEBUG
2911if (eptr >= md->end_subject)
2912 printf("matching subject <null>");
2913else
2914 {
2915 printf("matching subject ");
2916 pchars(eptr, length, TRUE, md);
2917 }
2918printf(" against backref ");
2919pchars(p, length, FALSE, md);
2920printf("\n");
2921#endif
2922
2923/* Always fail if not enough characters left */
2924
2925if (length > md->end_subject - p) return FALSE;
2926
2927/* Separate the caselesss case for speed */
2928
2929if (md->caseless)
2930 { while (length-- > 0) if (pcre_lcc[*p++] != pcre_lcc[*eptr++]) return FALSE; }
2931else
2932 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
2933
2934return TRUE;
2935}
2936
2937static int free_stack(match_data *md)
2938{
2939/* Free any stack space that was allocated by the call to match(). */
2940if (md->off_num) free(md->off_num);
2941if (md->offset_top) free(md->offset_top);
2942if (md->r1) free(md->r1);
2943if (md->r2) free(md->r2);
2944if (md->eptr) free(md->eptr);
2945if (md->ecode) free(md->ecode);
2946}
2947
2948static int grow_stack(match_data *md)
2949{
2950 md->length = md->length ? md->length+md->length/2 : 200;
2951 md->offset_top=realloc(md->offset_top, md->length*sizeof(int));
2952 md->eptr=realloc(md->eptr, md->length*sizeof(void *));
2953 md->ecode=realloc(md->ecode, md->length*sizeof(void *));
2954 md->off_num=realloc(md->off_num, md->length*sizeof(int));
2955 md->r1=realloc(md->r1, md->length*sizeof(int));
2956 md->r2=realloc(md->r2, md->length*sizeof(int));
2957 return 0;
2958}
2959
2960/*************************************************
2961* Match from current position *
2962*************************************************/
2963
2964/* On entry ecode points to the first opcode, and eptr to the first character.
2965
2966Arguments:
2967 eptr pointer in subject
2968 ecode position in code
2969 offset_top current top pointer
2970 md pointer to "static" info for the match
2971
2972Returns: TRUE if matched
2973*/
2974
2975static BOOL
2976match(register uschar *eptr, register uschar *ecode, int offset_top,
2977 match_data *md)
2978{
2979 int save_stack_position = md->point;
2980match_loop:
2981
2982#define SUCCEED goto succeed
2983#define FAIL goto fail
2984
2985for (;;)
2986 {
2987 int min, max, ctype;
2988 register int i;
2989 register int c;
2990 BOOL minimize;
2991
2992 /* Opening bracket. Check the alternative branches in turn, failing if none
2993 match. We have to set the start offset if required and there is space
2994 in the offset vector so that it is available for subsequent back references
2995 if the bracket matches. However, if the bracket fails, we must put back the
2996 previous value of both offsets in case they were set by a previous copy of
2997 the same bracket. Don't worry about setting the flag for the error case here;
2998 that is handled in the code for KET. */
2999
3000 if ((int)*ecode >= OP_BRA)
3001 {
3002 int number = (*ecode - OP_BRA) << 1;
3003 int save_offset1, save_offset2;
3004
3005 #ifdef DEBUG
3006 printf("start bracket %d\n", number/2);
3007 #endif
3008
3009 if (number > 0 && number < md->offset_end)
3010 {
3011 save_offset1 = md->offset_vector[number];
3012 save_offset2 = md->offset_vector[number+1];
3013 md->offset_vector[number] = eptr - md->start_subject;
3014
3015 #ifdef DEBUG
3016 printf("saving %d %d\n", save_offset1, save_offset2);
3017 #endif
3018 }
3019
3020 /* Recurse for all the alternatives. */
3021
3022 do
3023 {
3024 if (match(eptr, ecode+3, offset_top, md)) SUCCEED;
3025 ecode += (ecode[1] << 8) + ecode[2];
3026 }
3027 while (*ecode == OP_ALT);
3028
3029 #ifdef DEBUG
3030 printf("bracket %d failed\n", number/2);
3031 #endif
3032
3033 if (number > 0 && number < md->offset_end)
3034 {
3035 md->offset_vector[number] = save_offset1;
3036 md->offset_vector[number+1] = save_offset2;
3037 }
3038
3039 FAIL;
3040 }
3041
3042 /* Other types of node can be handled by a switch */
3043
3044 switch(*ecode)
3045 {
3046 case OP_END:
3047 md->end_match_ptr = eptr; /* Record where we ended */
3048 md->end_offset_top = offset_top; /* and how many extracts were taken */
3049 SUCCEED;
3050
3051 /* Assertion brackets. Check the alternative branches in turn - the
3052 matching won't pass the KET for an assertion. If any one branch matches,
3053 the assertion is true. */
3054
3055 case OP_ASSERT:
3056 do
3057 {
3058 if (match(eptr, ecode+3, offset_top, md)) break;
3059 ecode += (ecode[1] << 8) + ecode[2];
3060 }
3061 while (*ecode == OP_ALT);
3062 if (*ecode == OP_KET) FAIL;
3063
3064 /* Continue from after the assertion, updating the offsets high water
3065 mark, since extracts may have been taken during the assertion. */
3066
3067 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3068 ecode += 3;
3069 offset_top = md->end_offset_top;
3070 continue;
3071
3072 /* Negative assertion: all branches must fail to match */
3073
3074 case OP_ASSERT_NOT:
3075 do
3076 {
3077 if (match(eptr, ecode+3, offset_top, md)) FAIL;
3078 ecode += (ecode[1] << 8) + ecode[2];
3079 }
3080 while (*ecode == OP_ALT);
3081 ecode += 3;
3082 continue;
3083
3084 /* An alternation is the end of a branch; scan along to find the end of the
3085 bracketed group and go to there. */
3086
3087 case OP_ALT:
3088 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3089 break;
3090
3091 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
3092 that it may occur zero times. It may repeat infinitely, or not at all -
3093 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
3094 repeat limits are compiled as a number of copies, with the optional ones
3095 preceded by BRAZERO or BRAMINZERO. */
3096
3097 case OP_BRAZERO:
3098 {
3099 uschar *next = ecode+1;
3100 if (match(eptr, next, offset_top, md)) SUCCEED;
3101 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3102 ecode = next + 3;
3103 }
3104 break;
3105
3106 case OP_BRAMINZERO:
3107 {
3108 uschar *next = ecode+1;
3109 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3110 if (match(eptr, next+3, offset_top, md)) SUCCEED;
3111 ecode++;
3112 }
3113 break;;
3114
3115 /* End of a group, repeated or non-repeating. If we are at the end of
3116 an assertion "group", stop matching and SUCCEED, but record the
3117 current high water mark for use by positive assertions. */
3118
3119 case OP_KET:
3120 case OP_KETRMIN:
3121 case OP_KETRMAX:
3122 {
3123 int number, start, end;
3124 uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
3125
3126 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT)
3127 {
3128 md->end_offset_top = offset_top;
3129 SUCCEED;
3130 }
3131
3132 /* In all other cases we have to check the group number back at the
3133 start and if necessary complete handling an extraction by setting the
3134 final offset and bumping the high water mark. */
3135
3136 number = (*prev - OP_BRA) << 1;
3137
3138 #ifdef DEBUG
3139 printf("end bracket %d\n", number/2);
3140 #endif
3141
3142 if (number > 0)
3143 {
3144 if (number >= md->offset_end) md->offset_overflow = TRUE; else
3145 {
3146 start=md->offset_vector[number] ; end =md->offset_vector[number+1];
3147 md->offset_vector[number+1] = eptr - md->start_subject;
3148 if (offset_top <= number) offset_top = number + 2;
3149 }
3150 }
3151
3152 /* For a non-repeating ket, just advance to the next node and continue at
3153 this level. */
3154
3155 if (*ecode == OP_KET)
3156 {
3157 ecode += 3;
3158 break;
3159 }
3160
3161 /* The repeating kets try the rest of the pattern or restart from the
3162 preceding bracket, in the appropriate order. */
3163
3164 if (*ecode == OP_KETRMIN)
3165 {
3166 uschar *ptr;
3167 if (match(eptr, ecode+3, offset_top, md)) goto succeed;
3168 /* Handle alternation inside the BRA...KET; push the additional
3169 alternatives onto the stack
3170 XXX this tries the alternatives backwards! */
3171 ptr=prev;
3172 do {
3173 ptr += (ptr[1]<<8)+ ptr[2];
3174 if (*ptr==OP_ALT)
3175 {
3176 if (md->length == md->point) grow_stack(md);
3177 md->offset_top[md->point] = offset_top;
3178 md->eptr[md->point] = eptr;
3179 md->ecode[md->point] = ptr+3;
3180 md->r1[md->point] = 0;
3181 md->r2[md->point] = 0;
3182 md->off_num[md->point] = 0;
3183 md->point++;
3184 }
3185 } while (*ptr==OP_ALT);
3186 ecode=prev+3; goto match_loop;
3187 }
3188 else /* OP_KETRMAX */
3189 {
3190 uschar *ptr;
3191 int points_pushed=0;
3192
3193 /* Push one failure point, that will resume matching at the code after
3194 the KETRMAX opcode. */
3195 if (md->length == md->point) grow_stack(md);
3196 md->offset_top[md->point] = offset_top;
3197 md->eptr[md->point] = eptr;
3198 md->ecode[md->point] = ecode+3;
3199 md->r1[md->point] = md->offset_vector[number];
3200 md->r2[md->point] = md->offset_vector[number+1];
3201 md->off_num[md->point] = number;
3202 md->point++;
3203
3204 md->offset_vector[number] = eptr - md->start_subject;
3205 /* Handle alternation inside the BRA...KET; push each of the
3206 additional alternatives onto the stack
3207 XXX this tries the alternatives backwards! */
3208 ptr=prev;
3209 do {
3210 ptr += (ptr[1]<<8)+ ptr[2];
3211 if (*ptr==OP_ALT)
3212 {
3213 if (md->length == md->point) grow_stack(md);
3214 md->offset_top[md->point] = offset_top;
3215 md->eptr[md->point] = eptr;
3216 md->ecode[md->point] = ptr+3;
3217 md->r1[md->point] = 0;
3218 md->r2[md->point] = 0;
3219 md->off_num[md->point] = 0;
3220 md->point++;
3221 points_pushed++;
3222 }
3223 } while (*ptr==OP_ALT);
3224 /* Jump to the first (or only) alternative and resume trying to match */
3225 ecode=prev+3; goto match_loop;
3226 }
3227 }
3228 FAIL;
3229
3230 /* Start of subject, or after internal newline if multiline */
3231
3232 case OP_CIRC:
3233 if (md->multiline)
3234 {
3235 if (eptr != md->start_subject && eptr[-1] != '\n') FAIL;
3236 ecode++;
3237 break;
3238 }
3239 /* ... else fall through */
3240
3241 /* Start of subject assertion */
3242
3243 case OP_SOD:
3244 if (eptr != md->start_subject) FAIL;
3245 ecode++;
3246 break;
3247
3248 /* End of subject, or before internal newline if multiline */
3249
3250 case OP_DOLL:
3251 if (md->multiline)
3252 {
3253 if (eptr < md->end_subject && *eptr != '\n') FAIL;
3254 ecode++;
3255 break;
3256 }
3257 /* ... else fall through */
3258
3259 /* End of subject assertion */
3260
3261 case OP_EOD:
3262 if (eptr < md->end_subject) FAIL;
3263 ecode++;
3264 break;
3265
3266 /* Word boundary assertions */
3267
3268 case OP_NOT_WORD_BOUNDARY:
3269 case OP_WORD_BOUNDARY:
3270 {
3271 BOOL prev_is_word = (eptr != md->start_subject) &&
3272 ((pcre_ctypes[eptr[-1]] & ctype_word) != 0);
3273 BOOL cur_is_word = (eptr < md->end_subject) &&
3274 ((pcre_ctypes[*eptr] & ctype_word) != 0);
3275 if ((*ecode++ == OP_WORD_BOUNDARY)?
3276 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3277 FAIL;
3278 }
3279 break;
3280
3281 /* Match a single character type; inline for speed */
3282
3283 case OP_ANY:
3284 if (!md->dotall && eptr < md->end_subject && *eptr == '\n') FAIL;
3285 if (eptr++ >= md->end_subject) FAIL;
3286 ecode++;
3287 break;
3288
3289 case OP_NOT_DIGIT:
3290 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) != 0)
3291 FAIL;
3292 ecode++;
3293 break;
3294
3295 case OP_DIGIT:
3296 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) == 0)
3297 FAIL;
3298 ecode++;
3299 break;
3300
3301 case OP_NOT_WHITESPACE:
3302 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) != 0)
3303 FAIL;
3304 ecode++;
3305 break;
3306
3307 case OP_WHITESPACE:
3308 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) == 0)
3309 FAIL;
3310 ecode++;
3311 break;
3312
3313 case OP_NOT_WORDCHAR:
3314 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) != 0)
3315 FAIL;
3316 ecode++;
3317 break;
3318
3319 case OP_WORDCHAR:
3320 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) == 0)
3321 FAIL;
3322 ecode++;
3323 break;
3324
3325 /* Match a back reference, possibly repeatedly. Look past the end of the
3326 item to see if there is repeat information following. The code is similar
3327 to that for character classes, but repeated for efficiency. Then obey
3328 similar code to character type repeats - written out again for speed.
3329 However, if the referenced string is the empty string, always treat
3330 it as matched, any number of times (otherwise there could be infinite
3331 loops). */
3332
3333 case OP_REF:
3334 {
3335 int length;
3336 int number = ecode[1] << 1; /* Doubled reference number */
3337 ecode += 2; /* Advance past the item */
3338
3339 if (number >= offset_top || md->offset_vector[number] < 0)
3340 {
3341 md->errorcode = PCRE_ERROR_BADREF;
3342 FAIL;
3343 }
3344
3345 length = md->offset_vector[number+1] - md->offset_vector[number];
3346
3347 switch (*ecode)
3348 {
3349 case OP_CRSTAR:
3350 case OP_CRMINSTAR:
3351 case OP_CRPLUS:
3352 case OP_CRMINPLUS:
3353 case OP_CRQUERY:
3354 case OP_CRMINQUERY:
3355 c = *ecode++ - OP_CRSTAR;
3356 minimize = (c & 1) != 0;
3357 min = rep_min[c]; /* Pick up values from tables; */
3358 max = rep_max[c]; /* zero for max => infinity */
3359 if (max == 0) max = INT_MAX;
3360 break;
3361
3362 case OP_CRRANGE:
3363 case OP_CRMINRANGE:
3364 minimize = (*ecode == OP_CRMINRANGE);
3365 min = (ecode[1] << 8) + ecode[2];
3366 max = (ecode[3] << 8) + ecode[4];
3367 if (max == 0) max = INT_MAX;
3368 ecode += 5;
3369 break;
3370
3371 default: /* No repeat follows */
3372 if (!match_ref(number, eptr, length, md)) FAIL;
3373 eptr += length;
3374 continue; /* With the main loop */
3375 }
3376
3377 /* If the length of the reference is zero, just continue with the
3378 main loop. */
3379
3380 if (length == 0) continue;
3381
3382 /* First, ensure the minimum number of matches are present. We get back
3383 the length of the reference string explicitly rather than passing the
3384 address of eptr, so that eptr can be a register variable. */
3385
3386 for (i = 1; i <= min; i++)
3387 {
3388 if (!match_ref(number, eptr, length, md)) FAIL;
3389 eptr += length;
3390 }
3391
3392 /* If min = max, continue at the same level without recursion.
3393 They are not both allowed to be zero. */
3394
3395 if (min == max) continue;
3396
3397 /* If minimizing, keep trying and advancing the pointer */
3398
3399 if (minimize)
3400 {
3401 for (i = min;; i++)
3402 {
3403 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3404 if (i >= max || !match_ref(number, eptr, length, md))
3405 FAIL;
3406 eptr += length;
3407 }
3408 /* Control never gets here */
3409 }
3410
3411 /* If maximizing, find the longest string and work backwards */
3412
3413 else
3414 {
3415 uschar *pp = eptr;
3416 for (i = min; i < max; i++)
3417 {
3418 if (!match_ref(number, eptr, length, md)) break;
3419 eptr += length;
3420 }
3421 while (eptr >= pp)
3422 {
3423 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3424 eptr -= length;
3425 }
3426 FAIL;
3427 }
3428 }
3429 /* Control never gets here */
3430
3431 /* Match a character class, possibly repeatedly. Look past the end of the
3432 item to see if there is repeat information following. Then obey similar
3433 code to character type repeats - written out again for speed. */
3434
3435 case OP_CLASS:
3436 case OP_NEGCLASS:
3437 {
3438 BOOL result = *ecode == OP_CLASS;
3439 uschar *data = ecode; /* Save for matching */
3440
3441 ecode += 4 + 2 * ecode[2] + ecode[3]; /* Advance past the item */
3442
3443 switch (*ecode)
3444 {
3445 case OP_CRSTAR:
3446 case OP_CRMINSTAR:
3447 case OP_CRPLUS:
3448 case OP_CRMINPLUS:
3449 case OP_CRQUERY:
3450 case OP_CRMINQUERY:
3451 c = *ecode++ - OP_CRSTAR;
3452 minimize = (c & 1) != 0;
3453 min = rep_min[c]; /* Pick up values from tables; */
3454 max = rep_max[c]; /* zero for max => infinity */
3455 if (max == 0) max = INT_MAX;
3456 break;
3457
3458 case OP_CRRANGE:
3459 case OP_CRMINRANGE:
3460 minimize = (*ecode == OP_CRMINRANGE);
3461 min = (ecode[1] << 8) + ecode[2];
3462 max = (ecode[3] << 8) + ecode[4];
3463 if (max == 0) max = INT_MAX;
3464 ecode += 5;
3465 break;
3466
3467 default: /* No repeat follows */
3468 if (eptr >= md->end_subject || !match_class(data, *eptr++, result, md))
3469 FAIL;
3470 continue; /* With the main loop */
3471 }
3472
3473 /* First, ensure the minimum number of matches are present. */
3474
3475 for (i = 1; i <= min; i++)
3476 if (eptr >= md->end_subject || !match_class(data, *eptr++, result, md))
3477 FAIL;
3478
3479 /* If max == min we can continue with the main loop without the
3480 need to recurse. */
3481
3482 if (min == max) continue;
3483
3484 /* If minimizing, keep testing the rest of the expression and advancing
3485 the pointer while it matches the class. */
3486
3487 if (minimize)
3488 {
3489 for (i = min;; i++)
3490 {
3491 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3492 if (i >= max || eptr >= md->end_subject ||
3493 !match_class(data, *eptr++, result, md)) FAIL;
3494 }
3495 /* Control never gets here */
3496 }
3497
3498 /* If maximizing, find the longest possible run, then work backwards. */
3499
3500 else
3501 {
3502 uschar *pp = eptr;
3503 for (i = min; i < max; i++)
3504 {
3505 if (eptr >= md->end_subject || !match_class(data, *eptr, result, md))
3506 break;
3507 eptr++;
3508 }
3509 while (eptr >= pp)
3510 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
3511 FAIL;
3512 }
3513 }
3514 /* Control never gets here */
3515
3516 /* Match a run of characters */
3517
3518 case OP_CHARS:
3519 {
3520 register int length = ecode[1];
3521 ecode += 2;
3522
3523 #ifdef DEBUG
3524 if (eptr >= md->end_subject)
3525 printf("matching subject <null> against pattern ");
3526 else
3527 {
3528 printf("matching subject ");
3529 pchars(eptr, length, TRUE, md);
3530 printf(" against pattern ");
3531 }
3532 pchars(ecode, length, FALSE, md);
3533 printf("\n");
3534 #endif
3535
3536 if (length > md->end_subject - eptr) FAIL;
3537 if (md->caseless)
3538 {
3539 while (length-- > 0) if (pcre_lcc[*ecode++] != pcre_lcc[*eptr++]) FAIL;
3540 }
3541 else
3542 {
3543 while (length-- > 0) if (*ecode++ != *eptr++) FAIL;
3544 }
3545 }
3546 break;
3547
3548 /* Match a single character repeatedly; different opcodes share code. */
3549
3550 case OP_EXACT:
3551 min = max = (ecode[1] << 8) + ecode[2];
3552 ecode += 3;
3553 goto REPEATCHAR;
3554
3555 case OP_UPTO:
3556 case OP_MINUPTO:
3557 min = 0;
3558 max = (ecode[1] << 8) + ecode[2];
3559 minimize = *ecode == OP_MINUPTO;
3560 ecode += 3;
3561 goto REPEATCHAR;
3562
3563 case OP_STAR:
3564 case OP_MINSTAR:
3565 case OP_PLUS:
3566 case OP_MINPLUS:
3567 case OP_QUERY:
3568 case OP_MINQUERY:
3569 c = *ecode++ - OP_STAR;
3570 minimize = (c & 1) != 0;
3571 min = rep_min[c]; /* Pick up values from tables; */
3572 max = rep_max[c]; /* zero for max => infinity */
3573 if (max == 0) max = INT_MAX;
3574
3575 /* Common code for all repeated single-character matches. We can give
3576 up quickly if there are fewer than the minimum number of characters left in
3577 the subject. */
3578
3579 REPEATCHAR:
3580 if (min > md->end_subject - eptr) FAIL;
3581 c = *ecode++;
3582
3583 /* The code is duplicated for the caseless and caseful cases, for speed,
3584 since matching characters is likely to be quite common. First, ensure the
3585 minimum number of matches are present. If min = max, continue at the same
3586 level without recursing. Otherwise, if minimizing, keep trying the rest of
3587 the expression and advancing one matching character if failing, up to the
3588 maximum. Alternatively, if maximizing, find the maximum number of
3589 characters and work backwards. */
3590
3591 #ifdef DEBUG
3592 printf("matching %c{%d,%d} against subject %.*s\n", c, min, max,
3593 max, eptr);
3594 #endif
3595
3596 if (md->caseless)
3597 {
3598 c = pcre_lcc[c];
3599 for (i = 1; i <= min; i++) if (c != pcre_lcc[*eptr++]) FAIL;
3600 if (min == max) continue;
3601 if (minimize)
3602 {
3603 for (i = min;; i++)
3604 {
3605 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3606 if (i >= max || eptr >= md->end_subject || c != pcre_lcc[*eptr++])
3607 FAIL;
3608 }
3609 /* Control never gets here */
3610 }
3611 else
3612 {
3613 uschar *pp = eptr;
3614 for (i = min; i < max; i++)
3615 {
3616 if (eptr >= md->end_subject || c != pcre_lcc[*eptr]) break;
3617 eptr++;
3618 }
3619 while (eptr >= pp)
3620 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
3621 FAIL;
3622 }
3623 }
3624
3625 /* Caseful comparisons */
3626
3627 else
3628 {
3629 for (i = 1; i <= min; i++) if (c != *eptr++) FAIL;
3630 if (min == max) continue;
3631 if (minimize)
3632 {
3633 for (i = min;; i++)
3634 {
3635 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3636 if (i >= max || eptr >= md->end_subject || c != *eptr++) FAIL;
3637 }
3638 /* Control never gets here */
3639 }
3640 else
3641 {
3642 uschar *pp = eptr;
3643 for (i = min; i < max; i++)
3644 {
3645 if (eptr >= md->end_subject || c != *eptr) break;
3646 eptr++;
3647 }
3648 while (eptr >= pp)
3649 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
3650 FAIL;
3651 }
3652 }
3653 /* Control never gets here */
3654
3655 /* Match a single character type repeatedly; several different opcodes
3656 share code. This is very similar to the code for single characters, but we
3657 repeat it in the interests of efficiency. */
3658
3659 case OP_TYPEEXACT:
3660 min = max = (ecode[1] << 8) + ecode[2];
3661 minimize = TRUE;
3662 ecode += 3;
3663 goto REPEATTYPE;
3664
3665 case OP_TYPEUPTO:
3666 case OP_TYPEMINUPTO:
3667 min = 0;
3668 max = (ecode[1] << 8) + ecode[2];
3669 minimize = *ecode == OP_TYPEMINUPTO;
3670 ecode += 3;
3671 goto REPEATTYPE;
3672
3673 case OP_TYPESTAR:
3674 case OP_TYPEMINSTAR:
3675 case OP_TYPEPLUS:
3676 case OP_TYPEMINPLUS:
3677 case OP_TYPEQUERY:
3678 case OP_TYPEMINQUERY:
3679 c = *ecode++ - OP_TYPESTAR;
3680 minimize = (c & 1) != 0;
3681 min = rep_min[c]; /* Pick up values from tables; */
3682 max = rep_max[c]; /* zero for max => infinity */
3683 if (max == 0) max = INT_MAX;
3684
3685 /* Common code for all repeated single character type matches */
3686
3687 REPEATTYPE:
3688 ctype = *ecode++; /* Code for the character type */
3689
3690 /* First, ensure the minimum number of matches are present. Use inline
3691 code for maximizing the speed, and do the type test once at the start
3692 (i.e. keep it out of the loop). Also test that there are at least the
3693 minimum number of characters before we start. */
3694
3695 if (min > md->end_subject - eptr) FAIL;
3696 if (min > 0) switch(ctype)
3697 {
3698 case OP_ANY:
3699 if (!md->dotall)
3700 { for (i = 1; i <= min; i++) if (*eptr++ == '\n') FAIL; }
3701 else eptr += min;
3702 break;
3703
3704 case OP_NOT_DIGIT:
3705 for (i = 1; i <= min; i++)
3706 if ((pcre_ctypes[*eptr++] & ctype_digit) != 0) FAIL;
3707 break;
3708
3709 case OP_DIGIT:
3710 for (i = 1; i <= min; i++)
3711 if ((pcre_ctypes[*eptr++] & ctype_digit) == 0) FAIL;
3712 break;
3713
3714 case OP_NOT_WHITESPACE:
3715 for (i = 1; i <= min; i++)
3716 if ((pcre_ctypes[*eptr++] & ctype_space) != 0) FAIL;
3717 break;
3718
3719 case OP_WHITESPACE:
3720 for (i = 1; i <= min; i++)
3721 if ((pcre_ctypes[*eptr++] & ctype_space) == 0) FAIL;
3722 break;
3723
3724 case OP_NOT_WORDCHAR:
3725 for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) != 0)
3726 FAIL;
3727 break;
3728
3729 case OP_WORDCHAR:
3730 for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) == 0)
3731 FAIL;
3732 break;
3733 }
3734
3735 /* If min = max, continue at the same level without recursing */
3736
3737 if (min == max) continue;
3738
3739 /* If minimizing, we have to test the rest of the pattern before each
3740 subsequent match, so inlining isn't much help; just use the function. */
3741
3742 if (minimize)
3743 {
3744 for (i = min;; i++)
3745 {
3746 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3747 if (i >= max || eptr >= md->end_subject ||
3748 !match_type(ctype, *eptr++, md->dotall))
3749 FAIL;
3750 }
3751 /* Control never gets here */
3752 }
3753
3754 /* If maximizing it is worth using inline code for speed, doing the type
3755 test once at the start (i.e. keep it out of the loop). */
3756
3757 else
3758 {
3759 uschar *pp = eptr;
3760 switch(ctype)
3761 {
3762 case OP_ANY:
3763 if (!md->dotall)
3764 {
3765 for (i = min; i < max; i++)
3766 {
3767 if (eptr >= md->end_subject || *eptr == '\n') break;
3768 eptr++;
3769 }
3770 }
3771 else
3772 {
3773 c = max - min;
3774 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
3775 eptr += c;
3776 }
3777 break;
3778
3779 case OP_NOT_DIGIT:
3780 for (i = min; i < max; i++)
3781 {
3782 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) != 0)
3783 break;
3784 eptr++;
3785 }
3786 break;
3787
3788 case OP_DIGIT:
3789 for (i = min; i < max; i++)
3790 {
3791 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) == 0)
3792 break;
3793 eptr++;
3794 }
3795 break;
3796
3797 case OP_NOT_WHITESPACE:
3798 for (i = min; i < max; i++)
3799 {
3800 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) != 0)
3801 break;
3802 eptr++;
3803 }
3804 break;
3805
3806 case OP_WHITESPACE:
3807 for (i = min; i < max; i++)
3808 {
3809 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) == 0)
3810 break;
3811 eptr++;
3812 }
3813 break;
3814
3815 case OP_NOT_WORDCHAR:
3816 for (i = min; i < max; i++)
3817 {
3818 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) != 0)
3819 break;
3820 eptr++;
3821 }
3822 break;
3823
3824 case OP_WORDCHAR:
3825 for (i = min; i < max; i++)
3826 {
3827 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) == 0)
3828 break;
3829 eptr++;
3830 }
3831 break;
3832 }
3833
3834 while (eptr >= pp)
3835 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
3836 FAIL;
3837 }
3838 /* Control never gets here */
3839
3840 /* There's been some horrible disaster. */
3841
3842 default:
3843 #ifdef DEBUG
3844 printf("Unknown opcode %d\n", *ecode);
3845 #endif
3846 md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
3847 FAIL;
3848 }
3849
3850 /* Do not stick any code in here without much thought; it is assumed
3851 that "continue" in the code above comes out to here to repeat the main
3852 loop. */
3853
3854 } /* End of main loop */
3855/* Control never reaches here */
3856
3857fail:
3858 if (md->point > save_stack_position)
3859 {
3860 /* If there are still points remaining on the stack, pop the next one off */
3861 int start, end, off_num;
3862
3863 md->point--;
3864 offset_top = md->offset_top[md->point];
3865 eptr = md->eptr[md->point];
3866 ecode = md->ecode[md->point];
3867 off_num = md->off_num[md->point];
3868 md->offset_vector[off_num] = md->r1[md->point];
3869 md->offset_vector[off_num+1] = md->r2[md->point];
3870 goto match_loop;
3871 }
3872 /* Failure, and nothing left on the stack, so end this function call */
3873
3874 /* Restore the top of the stack to where it was before this function
3875 call. This lets us use one stack for everything; recursive calls
3876 can push and pop information, and may increase the stack. When
3877 the call returns, the parent function can resume pushing and
3878 popping wherever it was. */
3879
3880 md->point = save_stack_position;
3881 return FALSE;
3882
3883succeed:
3884 return TRUE;
3885}
3886
3887
3888/*************************************************
3889* Execute a Regular Expression *
3890*************************************************/
3891
3892/* This function applies a compiled re to a subject string and picks out
3893portions of the string if it matches. Two elements in the vector are set for
3894each substring: the offsets to the start and end of the substring.
3895
3896Arguments:
3897 re points to the compiled expression
3898 extra points to "hints" from pcre_study() or is NULL
3899 subject points to the subject string
3900 length length of subject string (may contain binary zeros)
3901 options option bits
3902 offsets points to a vector of ints to be filled in with offsets
3903 offsetcount the number of elements in the vector
3904
3905Returns: > 0 => success; value is the number of elements filled in
3906 = 0 => success, but offsets is not big enough
3907 -1 => failed to match
3908 < -1 => some kind of unexpected problem
3909*/
3910
3911int
3912pcre_exec(pcre *external_re, pcre_extra *external_extra, char *subject,
3913 int length, int options, int *offsets, int offsetcount)
3914{
3915int resetcount;
3916int first_char = -1;
3917match_data match_block;
3918uschar *start_bits = NULL;
3919uschar *start_match = (uschar *)subject;
3920uschar *end_subject;
3921real_pcre *re = (real_pcre *)external_re;
3922real_pcre_extra *extra = (real_pcre_extra *)external_extra;
3923BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
3924BOOL startline = (re->options & PCRE_STARTLINE) != 0;
3925
3926if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3927
3928if (re == NULL || subject == NULL ||
3929 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3930if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
3931
3932match_block.start_subject = (uschar *)subject;
3933match_block.end_subject = match_block.start_subject + length;
3934end_subject = match_block.end_subject;
3935
3936match_block.caseless = ((re->options | options) & PCRE_CASELESS) != 0;
3937match_block.multiline = ((re->options |options) & PCRE_MULTILINE) != 0;
3938match_block.dotall = ((re->options |options) & PCRE_DOTALL) != 0;
3939
3940match_block.offset_vector = offsets; /* Where offsets go */
3941match_block.offset_end = (offsetcount & (-2)); /* Past max permitted (even) */
3942match_block.offset_overflow = FALSE;
3943
3944match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
3945
3946/* Set the stack state to empty */
3947 match_block.off_num = match_block.offset_top = NULL;
3948 match_block.r1 = match_block.r2 = NULL;
3949 match_block.eptr = match_block.ecode = NULL;
3950 match_block.point = match_block.length = 0;
3951
3952/* Compute the minimum number of offsets that we need to reset each time. Doing
3953this makes a huge difference to execution time when there aren't many brackets
3954in the pattern. */
3955
3956resetcount = 2 + re->top_bracket * 2;
3957if (resetcount > offsetcount) resetcount = offsetcount;
3958
3959/* If MULTILINE is set at exec time but was not set at compile time, and the
3960anchored flag is set, we must re-check because a setting provoked by ^ in the
3961pattern is not right in multi-line mode. Calling is_anchored() again here does
3962the right check, because multiline is now set. If it now yields FALSE, the
3963expression must have had ^ starting some of its branches. Check to see if
3964that is true for *all* branches, and if so, set the startline flag. */
3965
3966if (match_block. multiline && anchored && (re->options & PCRE_MULTILINE) == 0 &&
3967 !is_anchored(re->code, match_block.multiline))
3968 {
3969 anchored = FALSE;
3970 if (is_startline(re->code)) startline = TRUE;
3971 }
3972
3973/* Set up the first character to match, if available. The first_char value is
3974never set for an anchored regular expression, but the anchoring may be forced
3975at run time, so we have to test for anchoring. The first char may be unset for
3976an unanchored pattern, of course. If there's no first char and the pattern was
3977studied, the may be a bitmap of possible first characters. However, we can
3978use this only if the caseless state of the studying was correct. */
3979
3980if (!anchored)
3981 {
3982 if ((re->options & PCRE_FIRSTSET) != 0)
3983 {
3984 first_char = re->first_char;
3985 if (match_block.caseless) first_char = pcre_lcc[first_char];
3986 }
3987 else
3988 if (!startline && extra != NULL &&
3989 (extra->options & PCRE_STUDY_MAPPED) != 0 &&
3990 ((extra->options & PCRE_STUDY_CASELESS) != 0) == match_block.caseless)
3991 start_bits = extra->start_bits;
3992 }
3993
3994/* Loop for unanchored matches; for anchored regexps the loop runs just once. */
3995
3996do
3997 {
3998 register int *iptr = offsets;
3999 register int *iend = offsets + resetcount;
4000
4001 /* Reset the maximum number of extractions we might see. */
4002
4003 while (iptr < iend) *iptr++ = -1;
4004
4005 /* Advance to a unique first char if possible */
4006
4007 if (first_char >= 0)
4008 {
4009 if (match_block.caseless)
4010 while (start_match < end_subject && pcre_lcc[*start_match] != first_char)
4011 start_match++;
4012 else
4013 while (start_match < end_subject && *start_match != first_char)
4014 start_match++;
4015 }
4016
4017 /* Or to just after \n for a multiline match if possible */
4018
4019 else if (startline)
4020 {
4021 if (start_match > match_block.start_subject)
4022 {
4023 while (start_match < end_subject && start_match[-1] != '\n')
4024 start_match++;
4025 }
4026 }
4027
4028 /* Or to a non-unique first char */
4029
4030 else if (start_bits != NULL)
4031 {
4032 while (start_match < end_subject)
4033 {
4034 register int c = *start_match;
4035 if ((start_bits[c/8] & (1<<(c%8))) == 0) start_match++; else break;
4036 }
4037 }
4038
4039 #ifdef DEBUG
4040 printf(">>>> Match against: ");
4041 pchars(start_match, end_subject - start_match, TRUE, &match_block);
4042 printf("\n");
4043 #endif
4044
4045 /* When a match occurs, substrings will be set for all internal extractions;
4046 we just need to set up the whole thing as substring 0 before returning. If
4047 there were too many extractions, set the return code to zero. */
4048
4049 if (match(start_match, re->code, 2, &match_block))
4050 {
4051 int rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
4052 if (match_block.offset_end < 2) rc = 0; else
4053 {
4054 offsets[0] = start_match - match_block.start_subject;
4055 offsets[1] = match_block.end_match_ptr - match_block.start_subject;
4056 }
4057 #ifdef DEBUG
4058 printf(">>>> returning %d\n", rc);
4059 #endif
4060 free_stack(&match_block);
4061 return rc;
4062 }
4063 }
4064while (!anchored &&
4065 match_block.errorcode == PCRE_ERROR_NOMATCH &&
4066 start_match++ < end_subject);
4067
4068#ifdef DEBUG
4069printf(">>>> returning %d\n", match_block.errorcode);
4070#endif
4071free_stack(&match_block);
4072return match_block.errorcode;
4073}
4074
4075/* End of pcre.c */