| |
| /************************************************* |
| * Perl-Compatible Regular Expressions * |
| *************************************************/ |
| |
| /* DO NOT EDIT THIS FILE! */ |
| |
| /* This file is automatically written by the merge-files.py script |
| included with the PCRE distribution for Python; it's produced from |
| several C files, and code is removed in the process. If you want to |
| modify the code or track down bugs, it will be much easier to work |
| with the code in its original, multiple-file form. Don't edit this |
| file by hand, or submit patches to it. |
| |
| The Python-specific PCRE distribution can be retrieved from |
| http://starship.skyport.net/crew/amk/regex/ |
| |
| The unmodified original PCRE distribution doesn't have a fixed URL |
| yet; write Philip Hazel <ph10@cam.ac.uk> for the latest version. |
| |
| Written by: Philip Hazel <ph10@cam.ac.uk> |
| |
| Extensively modified by the Python String-SIG: <string-sig@python.org> |
| Send bug reports to: <string-sig@python.org> |
| (They'll figure out if it's a bug in PCRE or in the Python-specific |
| changes.) |
| |
| Copyright (c) 1997 University of Cambridge |
| |
| ----------------------------------------------------------------------------- |
| Permission is granted to anyone to use this software for any purpose on any |
| computer system, and to redistribute it freely, subject to the following |
| restrictions: |
| |
| 1. This software is distributed in the hope that it will be useful, |
| but WITHOUT ANY WARRANTY; without even the implied warranty of |
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| |
| 2. The origin of this software must not be misrepresented, either by |
| explicit claim or by omission. |
| |
| 3. Altered versions must be plainly marked as such, and must not be |
| misrepresented as being the original software. |
| ----------------------------------------------------------------------------- |
| */ |
| |
| |
| #define FOR_PYTHON |
| #include "pcre-internal.h" |
| #include "Python.h" |
| #include "graminit.h" |
| |
| /************************************************* |
| * Perl-Compatible Regular Expressions * |
| *************************************************/ |
| |
| /* This file is automatically written by the makechartables auxiliary |
| program. If you edit it by hand, you might like to edit the Makefile to |
| prevent its ever being regenerated. */ |
| |
| /* This table is a lower casing table. */ |
| |
| unsigned char pcre_lcc[] = { |
| 0, 1, 2, 3, 4, 5, 6, 7, |
| 8, 9, 10, 11, 12, 13, 14, 15, |
| 16, 17, 18, 19, 20, 21, 22, 23, |
| 24, 25, 26, 27, 28, 29, 30, 31, |
| 32, 33, 34, 35, 36, 37, 38, 39, |
| 40, 41, 42, 43, 44, 45, 46, 47, |
| 48, 49, 50, 51, 52, 53, 54, 55, |
| 56, 57, 58, 59, 60, 61, 62, 63, |
| 64, 97, 98, 99,100,101,102,103, |
| 104,105,106,107,108,109,110,111, |
| 112,113,114,115,116,117,118,119, |
| 120,121,122, 91, 92, 93, 94, 95, |
| 96, 97, 98, 99,100,101,102,103, |
| 104,105,106,107,108,109,110,111, |
| 112,113,114,115,116,117,118,119, |
| 120,121,122,123,124,125,126,127, |
| 128,129,130,131,132,133,134,135, |
| 136,137,138,139,140,141,142,143, |
| 144,145,146,147,148,149,150,151, |
| 152,153,154,155,156,157,158,159, |
| 160,161,162,163,164,165,166,167, |
| 168,169,170,171,172,173,174,175, |
| 176,177,178,179,180,181,182,183, |
| 184,185,186,187,188,189,190,191, |
| 192,193,194,195,196,197,198,199, |
| 200,201,202,203,204,205,206,207, |
| 208,209,210,211,212,213,214,215, |
| 216,217,218,219,220,221,222,223, |
| 224,225,226,227,228,229,230,231, |
| 232,233,234,235,236,237,238,239, |
| 240,241,242,243,244,245,246,247, |
| 248,249,250,251,252,253,254,255 }; |
| |
| /* This table is an upper casing table. */ |
| |
| unsigned char pcre_ucc[] = { |
| 0, 1, 2, 3, 4, 5, 6, 7, |
| 8, 9, 10, 11, 12, 13, 14, 15, |
| 16, 17, 18, 19, 20, 21, 22, 23, |
| 24, 25, 26, 27, 28, 29, 30, 31, |
| 32, 33, 34, 35, 36, 37, 38, 39, |
| 40, 41, 42, 43, 44, 45, 46, 47, |
| 48, 49, 50, 51, 52, 53, 54, 55, |
| 56, 57, 58, 59, 60, 61, 62, 63, |
| 64, 65, 66, 67, 68, 69, 70, 71, |
| 72, 73, 74, 75, 76, 77, 78, 79, |
| 80, 81, 82, 83, 84, 85, 86, 87, |
| 88, 89, 90, 91, 92, 93, 94, 95, |
| 96, 65, 66, 67, 68, 69, 70, 71, |
| 72, 73, 74, 75, 76, 77, 78, 79, |
| 80, 81, 82, 83, 84, 85, 86, 87, |
| 88, 89, 90,123,124,125,126,127, |
| 128,129,130,131,132,133,134,135, |
| 136,137,138,139,140,141,142,143, |
| 144,145,146,147,148,149,150,151, |
| 152,153,154,155,156,157,158,159, |
| 160,161,162,163,164,165,166,167, |
| 168,169,170,171,172,173,174,175, |
| 176,177,178,179,180,181,182,183, |
| 184,185,186,187,188,189,190,191, |
| 192,193,194,195,196,197,198,199, |
| 200,201,202,203,204,205,206,207, |
| 208,209,210,211,212,213,214,215, |
| 216,217,218,219,220,221,222,223, |
| 224,225,226,227,228,229,230,231, |
| 232,233,234,235,236,237,238,239, |
| 240,241,242,243,244,245,246,247, |
| 248,249,250,251,252,253,254,255 }; |
| |
| /* This table identifies various classes of character by individual bits: |
| 1 white space character |
| 2 decimal digit |
| 4 hexadecimal digit |
| 8 alphanumeric or '_' |
| 16 octal digit |
| 128 regular expression metacharacter or binary zero |
| */ |
| |
| unsigned char pcre_ctypes[] = { |
| 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */ |
| 0x00,0x01,0x01,0x01,0x01,0x01,0x00,0x00, /* 8- 15 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */ |
| 0x01,0x00,0x00,0x00,0x80,0x00,0x00,0x00, /* - ' */ |
| 0x80,0x80,0x80,0x80,0x00,0x00,0x80,0x00, /* ( - / */ |
| 0x1e,0x1e,0x1e,0x1e,0x1e,0x1e,0x1e,0x1e, /* 0 - 7 */ |
| 0x0e,0x0e,0x00,0x00,0x00,0x00,0x00,0x80, /* 8 - ? */ |
| 0x00,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x08, /* @ - G */ |
| 0x08,0x08,0x08,0x08,0x08,0x08,0x08,0x08, /* H - O */ |
| 0x08,0x08,0x08,0x08,0x08,0x08,0x08,0x08, /* P - W */ |
| 0x08,0x08,0x08,0x80,0x00,0x00,0x80,0x08, /* X - _ */ |
| 0x00,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x08, /* ` - g */ |
| 0x08,0x08,0x08,0x08,0x08,0x08,0x08,0x08, /* h - o */ |
| 0x08,0x08,0x08,0x08,0x08,0x08,0x08,0x08, /* p - w */ |
| 0x08,0x08,0x08,0x80,0x80,0x00,0x00,0x00, /* x -127 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */ |
| |
| /* End of pcre-chartables.c */ |
| /************************************************* |
| * Perl-Compatible Regular Expressions * |
| *************************************************/ |
| |
| /* |
| This is a library of functions to support regular expressions whose syntax |
| and semantics are as close as possible to those of the Perl 5 language. |
| |
| Written by: Philip Hazel <ph10@cam.ac.uk> |
| |
| Copyright (c) 1997 University of Cambridge |
| |
| ----------------------------------------------------------------------------- |
| Permission is granted to anyone to use this software for any purpose on any |
| computer system, and to redistribute it freely, subject to the following |
| restrictions: |
| |
| 1. This software is distributed in the hope that it will be useful, |
| but WITHOUT ANY WARRANTY; without even the implied warranty of |
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| |
| 2. The origin of this software must not be misrepresented, either by |
| explicit claim or by omission. |
| |
| 3. Altered versions must be plainly marked as such, and must not be |
| misrepresented as being the original software. |
| ----------------------------------------------------------------------------- |
| |
| See the file Tech.Notes for some information on the internals. |
| */ |
| |
| /* This module contains the actual definition of global variables that are |
| shared between the different modules. In fact, these are limited to the |
| indirections for memory management functions. */ |
| |
| /* Include the internals header, which itself includes Standard C headers plus |
| the external pcre header. */ |
| |
| |
| /* Store get and free functions. */ |
| |
| void *(*pcre_malloc)(size_t) = malloc; |
| void (*pcre_free)(void *) = free; |
| |
| /* End of pcre-globals.c */ |
| /************************************************* |
| * Perl-Compatible Regular Expressions * |
| *************************************************/ |
| |
| /* |
| This is a library of functions to support regular expressions whose syntax |
| and semantics are as close as possible to those of the Perl 5 language. See |
| the file Tech.Notes for some information on the internals. |
| |
| Written by: Philip Hazel <ph10@cam.ac.uk> |
| |
| Copyright (c) 1997 University of Cambridge |
| |
| ----------------------------------------------------------------------------- |
| Permission is granted to anyone to use this software for any purpose on any |
| computer system, and to redistribute it freely, subject to the following |
| restrictions: |
| |
| 1. This software is distributed in the hope that it will be useful, |
| but WITHOUT ANY WARRANTY; without even the implied warranty of |
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| |
| 2. The origin of this software must not be misrepresented, either by |
| explicit claim or by omission. |
| |
| 3. Altered versions must be plainly marked as such, and must not be |
| misrepresented as being the original software. |
| ----------------------------------------------------------------------------- |
| */ |
| |
| |
| /* Include the internals header, which itself includes Standard C headers plus |
| the external pcre header. */ |
| |
| |
| |
| /* Character types for class type bits */ |
| |
| static char class_types[] = { ctype_digit, ctype_space, ctype_word }; |
| |
| |
| |
| /************************************************* |
| * Set a range of bits in the map * |
| *************************************************/ |
| |
| /* This function is called for character types. |
| |
| Arguments: |
| start_bits points to the bit map |
| type a character type bit |
| include TRUE to include the type; |
| FALSE to include all but the type |
| |
| Returns: nothing |
| */ |
| |
| static void |
| set_type_bits(uschar *start_bits, int type, BOOL include) |
| { |
| int i; |
| for (i = 0; i < 256; i++) |
| if (((pcre_ctypes[i] & type) != 0) == include) start_bits[i/8] |= (1<<(i%8)); |
| } |
| |
| |
| |
| /************************************************* |
| * Set one bit in the map * |
| *************************************************/ |
| |
| /* This function is called to set a bit in the map for a given character, |
| or both cases of a letter if caseless. It could be replaced by a macro if |
| better performance is wanted. |
| |
| Arguments: |
| start_bits points to 32-byte table |
| c the character |
| caseless TRUE if caseless |
| |
| Returns: nothing |
| */ |
| |
| static void |
| set_bit(uschar *start_bits, int c, BOOL caseless) |
| { |
| if (caseless) |
| { |
| int d = pcre_ucc[c]; |
| start_bits[d/8] |= (1<<(d%8)); |
| c = pcre_lcc[c]; |
| } |
| start_bits[c/8] |= (1<<(c%8)); |
| } |
| |
| |
| |
| /************************************************* |
| * Create bitmap of starting chars * |
| *************************************************/ |
| |
| /* This function scans a compiled unanchored expression and attempts to build a |
| bitmap of the set of initial characters. If it can't, it returns FALSE. As time |
| goes by, we may be able to get more clever at doing this. |
| |
| Arguments: |
| code points to an expression |
| start_bits points to a 32-byte table, initialized to 0 |
| caseless TRUE if caseless |
| |
| Returns: TRUE if table built, FALSE otherwise |
| */ |
| |
| static BOOL |
| set_start_bits(uschar *code, uschar *start_bits, BOOL caseless) |
| { |
| do |
| { |
| uschar *tcode = code + 3; |
| BOOL try_next = TRUE; |
| |
| while (try_next) |
| { |
| try_next = FALSE; |
| |
| if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT) |
| { |
| if (!set_start_bits(tcode, start_bits, caseless)) return FALSE; |
| } |
| |
| else switch(*tcode) |
| { |
| default: |
| return FALSE; |
| |
| /* BRAZERO does the bracket, but carries on. */ |
| |
| case OP_BRAZERO: |
| case OP_BRAMINZERO: |
| if (!set_start_bits(++tcode, start_bits, caseless)) return FALSE; |
| do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT); |
| tcode += 3; |
| try_next = TRUE; |
| break; |
| |
| /* Single-char * or ? sets the bit and tries the next item */ |
| |
| case OP_STAR: |
| case OP_MINSTAR: |
| case OP_QUERY: |
| case OP_MINQUERY: |
| set_bit(start_bits, tcode[1], caseless); |
| tcode += 2; |
| try_next = TRUE; |
| break; |
| |
| /* Single-char upto sets the bit and tries the next */ |
| |
| case OP_UPTO: |
| case OP_MINUPTO: |
| set_bit(start_bits, tcode[3], caseless); |
| tcode += 4; |
| try_next = TRUE; |
| break; |
| |
| /* At least one single char sets the bit and stops */ |
| |
| case OP_EXACT: /* Fall through */ |
| tcode++; |
| |
| case OP_CHARS: /* Fall through */ |
| tcode++; |
| |
| case OP_PLUS: |
| case OP_MINPLUS: |
| set_bit(start_bits, tcode[1], caseless); |
| break; |
| |
| /* Single character type sets the bits and stops */ |
| |
| case OP_NOT_DIGIT: |
| set_type_bits(start_bits, ctype_digit, FALSE); |
| break; |
| |
| case OP_DIGIT: |
| set_type_bits(start_bits, ctype_digit, TRUE); |
| break; |
| |
| case OP_NOT_WHITESPACE: |
| set_type_bits(start_bits, ctype_space, FALSE); |
| break; |
| |
| case OP_WHITESPACE: |
| set_type_bits(start_bits, ctype_space, TRUE); |
| break; |
| |
| case OP_NOT_WORDCHAR: |
| set_type_bits(start_bits, ctype_word, FALSE); |
| break; |
| |
| case OP_WORDCHAR: |
| set_type_bits(start_bits, ctype_word, TRUE); |
| break; |
| |
| /* One or more character type fudges the pointer and restarts, knowing |
| it will hit a single character type and stop there. */ |
| |
| case OP_TYPEPLUS: |
| case OP_TYPEMINPLUS: |
| tcode++; |
| try_next = TRUE; |
| break; |
| |
| case OP_TYPEEXACT: |
| tcode += 3; |
| try_next = TRUE; |
| break; |
| |
| /* Zero or more repeats of character types set the bits and then |
| try again. */ |
| |
| case OP_TYPEUPTO: |
| case OP_TYPEMINUPTO: |
| tcode += 2; /* Fall through */ |
| |
| case OP_TYPESTAR: |
| case OP_TYPEMINSTAR: |
| case OP_TYPEQUERY: |
| case OP_TYPEMINQUERY: |
| switch(tcode[1]) |
| { |
| case OP_NOT_DIGIT: |
| set_type_bits(start_bits, ctype_digit, FALSE); |
| break; |
| |
| case OP_DIGIT: |
| set_type_bits(start_bits, ctype_digit, TRUE); |
| break; |
| |
| case OP_NOT_WHITESPACE: |
| set_type_bits(start_bits, ctype_space, FALSE); |
| break; |
| |
| case OP_WHITESPACE: |
| set_type_bits(start_bits, ctype_space, TRUE); |
| break; |
| |
| case OP_NOT_WORDCHAR: |
| set_type_bits(start_bits, ctype_word, FALSE); |
| break; |
| |
| case OP_WORDCHAR: |
| set_type_bits(start_bits, ctype_word, TRUE); |
| break; |
| } |
| |
| tcode += 2; |
| try_next = TRUE; |
| break; |
| |
| /* Character class: set the bits and either carry on or not, |
| according to the repeat count. */ |
| |
| case OP_CLASS: |
| case OP_NEGCLASS: |
| { |
| uschar *base = tcode; |
| uschar *data, *end; |
| uschar *bitmap = start_bits; |
| uschar local[32]; |
| int flags = base[1]; |
| int i; |
| |
| tcode += 4 + 2 * tcode[2] + tcode[3]; /* Advance past the item */ |
| switch (*tcode) |
| { |
| case OP_CRSTAR: |
| case OP_CRMINSTAR: |
| case OP_CRQUERY: |
| case OP_CRMINQUERY: |
| tcode++; |
| try_next = TRUE; |
| break; |
| |
| case OP_CRRANGE: |
| case OP_CRMINRANGE: |
| if (((tcode[1] << 8) + tcode[2]) == 0) |
| { |
| tcode += 5; |
| try_next = TRUE; |
| } |
| break; |
| } |
| |
| /* For a negated class, we have to build a separate table of all |
| the bits in the class, and then turn all other bits on in the main |
| table. Otherwise there are problems with things like [^\da]. */ |
| |
| if (*base == OP_NEGCLASS) |
| { |
| memset(local, 0, 32); |
| bitmap = local; |
| } |
| |
| /* Character types */ |
| |
| for (i = 0; flags != 0; i++) |
| { |
| if ((flags & 1) != 0) |
| set_type_bits(bitmap, class_types[i/2], (i & 1) == 0); |
| flags >>= 1; |
| } |
| |
| /* Ranges and individual characters */ |
| |
| data = base + 4; |
| end = data + base[2] * 2; |
| while (data < end) |
| { |
| for (i = *data; i <= data[1]; i++) set_bit(bitmap, i, caseless); |
| data += 2; |
| } |
| |
| end += base[3]; |
| while (data < end) set_bit(bitmap, *data++, caseless); |
| |
| /* If a negated class, transfer data from local map to the main one */ |
| |
| if (bitmap != start_bits) |
| for (i = 0; i < 32; i++) start_bits[i] |= ~local[i]; |
| } |
| break; /* End of class handling */ |
| |
| } /* End of switch */ |
| } /* End of try_next loop */ |
| |
| code += (code[1] << 8) + code[2]; /* Advance to next branch */ |
| } |
| while (*code == OP_ALT); |
| return TRUE; |
| } |
| |
| |
| |
| /************************************************* |
| * Study a compiled expression * |
| *************************************************/ |
| |
| /* This function is handed a compiled expression that it must study to produce |
| information that will speed up the matching. It returns a pcre_extra block |
| which then gets handed back to pcre_exec(). |
| |
| Arguments: |
| re points to the compiled expression |
| options contains option bits |
| errorptr points to where to place error messages; |
| set NULL unless error |
| |
| Returns: pointer to a pcre_extra block, |
| NULL on error or if no optimization possible |
| */ |
| |
| pcre_extra * |
| pcre_study(pcre *external_re, int options, char **errorptr) |
| { |
| BOOL caseless; |
| uschar start_bits[32]; |
| real_pcre_extra *extra; |
| real_pcre *re = (real_pcre *)external_re; |
| |
| *errorptr = NULL; |
| |
| if (re == NULL || re->magic_number != MAGIC_NUMBER) |
| { |
| *errorptr = "argument is not a compiled regular expression"; |
| return NULL; |
| } |
| |
| if ((options & ~PUBLIC_STUDY_OPTIONS) != 0) |
| { |
| *errorptr = "unknown or incorrect option bit(s) set"; |
| return NULL; |
| } |
| |
| /* For an anchored pattern, or an unchored pattern that has a first char, or a |
| multiline pattern that matches only at "line starts", no further processing at |
| present. */ |
| |
| if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0) |
| return NULL; |
| |
| /* Determine the caseless state from the compiled pattern and the current |
| options. */ |
| |
| caseless = ((re->options | options) & PCRE_CASELESS) != 0; |
| |
| /* See if we can find a fixed set of initial characters for the pattern. */ |
| |
| memset(start_bits, 0, 32); |
| if (!set_start_bits(re->code, start_bits, caseless)) return NULL; |
| |
| /* Get an "extra" block and put the information therein. */ |
| |
| extra = (real_pcre_extra *)(pcre_malloc)(sizeof(real_pcre_extra)); |
| |
| if (extra == NULL) |
| { |
| *errorptr = "failed to get memory"; |
| return NULL; |
| } |
| extra->options = PCRE_STUDY_MAPPED | (caseless? PCRE_STUDY_CASELESS : 0); |
| memcpy(extra->start_bits, start_bits, 32); |
| |
| return (pcre_extra *)extra; |
| } |
| |
| /* End of pcre-study.c */ |
| /************************************************* |
| * Perl-Compatible Regular Expressions * |
| *************************************************/ |
| |
| /* |
| This is a library of functions to support regular expressions whose syntax |
| and semantics are as close as possible to those of the Perl 5 language. See |
| the file Tech.Notes for some information on the internals. |
| |
| Written by: Philip Hazel <ph10@cam.ac.uk> |
| |
| Copyright (c) 1997 University of Cambridge |
| |
| ----------------------------------------------------------------------------- |
| Permission is granted to anyone to use this software for any purpose on any |
| computer system, and to redistribute it freely, subject to the following |
| restrictions: |
| |
| 1. This software is distributed in the hope that it will be useful, |
| but WITHOUT ANY WARRANTY; without even the implied warranty of |
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| |
| 2. The origin of this software must not be misrepresented, either by |
| explicit claim or by omission. |
| |
| 3. Altered versions must be plainly marked as such, and must not be |
| misrepresented as being the original software. |
| ----------------------------------------------------------------------------- |
| */ |
| |
| |
| /* Define DEBUG to get debugging output on stdout. */ |
| |
| /* #define DEBUG */ |
| |
| |
| /* Include the internals header, which itself includes Standard C headers plus |
| the external pcre header. */ |
| |
| |
| #ifndef Py_eval_input |
| /* For Python 1.4, graminit.h has to be explicitly included */ |
| #define Py_eval_input eval_input |
| #endif |
| |
| /* Min and max values for the common repeats; for the maxima, 0 => infinity */ |
| |
| static char rep_min[] = { 0, 0, 1, 1, 0, 0 }; |
| static char rep_max[] = { 0, 0, 0, 0, 1, 1 }; |
| |
| /* Text forms of OP_ values and things, for debugging */ |
| |
| #ifdef DEBUG |
| static char *OP_names[] = { "End", "\\A", "\\B", "\\b", "\\D", "\\d", |
| "\\S", "\\s", "\\W", "\\w", "\\Z", "^", "$", "Any", "chars", |
| "*", "*?", "+", "+?", "?", "??", "{", "{", "{", |
| "*", "*?", "+", "+?", "?", "??", "{", "{", "{", |
| "*", "*?", "+", "+?", "?", "??", "{", "{", |
| "class", "negclass", "Ref", |
| "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", |
| "Brazero", "Braminzero", "Bra" |
| }; |
| |
| static char *class_names[] = { "\\d", "\\D", "\\s", "\\S", "\\w", "\\W" }; |
| #endif |
| |
| /* Table of character type operators that correspond to the bits in the |
| character class flags, starting at the least significant end. */ |
| |
| static char class_ops[] = { |
| OP_DIGIT, OP_NOT_DIGIT, |
| OP_WHITESPACE, OP_NOT_WHITESPACE, |
| OP_WORDCHAR, OP_NOT_WORDCHAR }; |
| |
| /* Table for handling escaped characters in the range '0'-'z'. Positive returns |
| are simple data values; negative values are for special things like \d and so |
| on. Zero means further processing is needed (for things like \x), or the escape |
| is invalid. */ |
| |
| /* PYTHON: Python doesn't support \e, but does support \v */ |
| |
| static short int escapes[] = { |
| 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */ |
| 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */ |
| '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */ |
| 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */ |
| 0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */ |
| 0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */ |
| '`', 7, -ESC_b, 0, -ESC_d, 0, '\f', 0, /* ` - g */ |
| 0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */ |
| 0, 0, '\r', -ESC_s, '\t', 0, '\v', -ESC_w, /* p - w */ |
| 0, 0, 0 /* x - z */ |
| }; |
| |
| |
| /* Definition to allow mutual recursion */ |
| |
| static BOOL compile_regexp(BOOL, int *, uschar **, uschar **, |
| char **, PyObject *); |
| |
| /* Structure for passing "static" information around between the functions |
| doing the matching, so that they are thread-safe. */ |
| |
| typedef struct match_data { |
| int errorcode; /* As it says */ |
| int *offset_vector; /* Offset vector */ |
| int offset_end; /* One past the end */ |
| BOOL offset_overflow; /* Set if too many extractions */ |
| BOOL caseless; /* Case-independent flag */ |
| BOOL multiline; /* Multiline flag */ |
| uschar *start_subject; /* Start of the subject string */ |
| uschar *end_subject; /* End of the subject string */ |
| |
| uschar *end_match_ptr; /* Subject position at end match */ |
| int end_offset_top; /* Highwater mark at end of match */ |
| BOOL dotall; /* Dotall flag */ |
| int length; /* Length of the allocated stacks */ |
| int point; /* Point to add next item pushed onto stacks */ |
| /* Pointers to the 6 stacks */ |
| int *off_num, *offset_top, *r1, *r2; |
| uschar **eptr, **ecode; |
| } match_data; |
| |
| |
| |
| /************************************************* |
| * Return version string * |
| *************************************************/ |
| |
| char * |
| pcre_version(void) |
| { |
| return PCRE_VERSION; |
| } |
| |
| |
| |
| |
| /************************************************* |
| * Return info about a compiled pattern * |
| *************************************************/ |
| |
| /* This function picks potentially useful data out of the private |
| structure. |
| |
| Arguments: |
| external_re points to compiled code |
| optptr where to pass back the options |
| first_char where to pass back the first character, |
| or -1 if multiline and all branches start ^, |
| or -2 otherwise |
| |
| Returns: number of identifying extraction brackets |
| or negative values on error |
| */ |
| |
| int |
| pcre_info(pcre *external_re, int *optptr, int *first_char) |
| { |
| real_pcre *re = (real_pcre *)external_re; |
| if (re == NULL) return PCRE_ERROR_NULL; |
| if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC; |
| if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS); |
| if (first_char != NULL) |
| *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char : |
| ((re->options & PCRE_STARTLINE) != 0)? -1 : -2; |
| return re->top_bracket; |
| } |
| |
| |
| |
| |
| #ifdef DEBUG |
| /************************************************* |
| * Debugging function to print chars * |
| *************************************************/ |
| |
| /* Print a sequence of chars in printable format, stopping at the end of the |
| subject if the requested. |
| |
| Arguments: |
| p points to characters |
| length number to print |
| is_subject TRUE if printing from within md->start_subject |
| md pointer to matching data block, if is_subject is TRUE |
| |
| Returns: nothing |
| */ |
| |
| static pchars(uschar *p, int length, BOOL is_subject, match_data *md) |
| { |
| int c; |
| if (is_subject && length > md->end_subject - p) length = md->end_subject - p; |
| while (length-- > 0) |
| if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c); |
| } |
| #endif |
| |
| |
| |
| |
| /************************************************* |
| * Check subpattern for empty operand * |
| *************************************************/ |
| |
| /* This function checks a bracketed subpattern to see if any of the paths |
| through it could match an empty string. This is used to diagnose an error if |
| such a subpattern is followed by a quantifier with an unlimited upper bound. |
| |
| Argument: |
| code points to the opening bracket |
| |
| Returns: TRUE or FALSE |
| */ |
| |
| static BOOL |
| could_be_empty(uschar *code) |
| { |
| do { |
| uschar *cc = code + 3; |
| |
| /* Scan along the opcodes for this branch; as soon as we find something |
| that matches a non-empty string, break out and advance to test the next |
| branch. If we get to the end of the branch, return TRUE for the whole |
| sub-expression. */ |
| |
| for (;;) |
| { |
| /* Test an embedded subpattern; if it could not be empty, break the |
| loop. Otherwise carry on in the branch. */ |
| |
| if ((int)(*cc) >= OP_BRA) |
| { |
| if (!could_be_empty(cc)) break; |
| do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT); |
| cc += 3; |
| } |
| |
| else switch (*cc) |
| { |
| /* Reached end of a branch: the subpattern may match the empty string */ |
| |
| case OP_ALT: |
| case OP_KET: |
| case OP_KETRMAX: |
| case OP_KETRMIN: |
| return TRUE; |
| |
| /* Skip over assertive subpatterns */ |
| |
| case OP_ASSERT: |
| case OP_ASSERT_NOT: |
| do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT); |
| cc += 3; |
| break; |
| |
| /* Skip over things that don't match chars */ |
| |
| case OP_SOD: |
| case OP_EOD: |
| case OP_CIRC: |
| case OP_DOLL: |
| case OP_BRAZERO: |
| case OP_BRAMINZERO: |
| case OP_NOT_WORD_BOUNDARY: |
| case OP_WORD_BOUNDARY: |
| cc++; |
| break; |
| |
| /* Skip over simple repeats with zero lower bound */ |
| |
| case OP_STAR: |
| case OP_MINSTAR: |
| case OP_QUERY: |
| case OP_MINQUERY: |
| case OP_TYPESTAR: |
| case OP_TYPEMINSTAR: |
| case OP_TYPEQUERY: |
| case OP_TYPEMINQUERY: |
| cc += 2; |
| break; |
| |
| /* Skip over UPTOs (lower bound is zero) */ |
| |
| case OP_UPTO: |
| case OP_MINUPTO: |
| case OP_TYPEUPTO: |
| case OP_TYPEMINUPTO: |
| cc += 4; |
| break; |
| |
| /* Check a class or a back reference for a zero minimum */ |
| |
| case OP_CLASS: |
| case OP_NEGCLASS: |
| case OP_REF: |
| cc += (*cc == OP_REF)? 2 : 4 + 2 * cc[2] + cc[3]; |
| |
| switch (*cc) |
| { |
| case OP_CRSTAR: |
| case OP_CRMINSTAR: |
| case OP_CRQUERY: |
| case OP_CRMINQUERY: |
| cc++; |
| break; |
| |
| case OP_CRRANGE: |
| case OP_CRMINRANGE: |
| if ((cc[1] << 8) + cc[2] != 0) goto NEXT_BRANCH; |
| cc += 3; |
| break; |
| |
| default: |
| goto NEXT_BRANCH; |
| } |
| break; |
| |
| /* Anything else matches at least one character */ |
| |
| default: |
| goto NEXT_BRANCH; |
| } |
| } |
| |
| NEXT_BRANCH: |
| code += (code[1] << 8) + code[2]; |
| } |
| while (*code == OP_ALT); |
| |
| /* No branches match the empty string */ |
| |
| return FALSE; |
| } |
| |
| |
| /* Determine the length of a group ID in an expression like |
| (?P<foo_123>...) |
| Arguments: |
| ptr pattern position pointer (say that 3 times fast) |
| finalchar the character that will mark the end of the ID |
| errorptr points to the pointer to the error message |
| */ |
| |
| static int |
| get_group_id(uschar *ptr, char finalchar, char **errorptr) |
| { |
| uschar *start = ptr; |
| |
| /* If the first character is not in \w, or is in \w but is a digit, |
| report an error */ |
| if (!(pcre_ctypes[*ptr] & ctype_word) || |
| (pcre_ctypes[*ptr++] & ctype_digit)) |
| { |
| *errorptr = "(?P identifier must start with a letter or underscore"; |
| return 0; |
| } |
| |
| /* Increment ptr until we either hit a null byte, the desired |
| final character, or a non-word character */ |
| for(; (*ptr != 0) && (*ptr != finalchar) && |
| (pcre_ctypes[*ptr] & ctype_word); ptr++) |
| { |
| /* Empty loop body */ |
| } |
| if (*ptr==finalchar) |
| return ptr-start; |
| if (*ptr==0) |
| { |
| *errorptr = "unterminated (?P identifier"; |
| return 0; |
| } |
| *errorptr = "illegal character in (?P identifier"; |
| return 0; |
| } |
| |
| /************************************************* |
| * Handle escapes * |
| *************************************************/ |
| |
| /* This function is called when a \ has been encountered. It either returns a |
| positive value for a simple escape such as \n, or a negative value which |
| encodes one of the more complicated things such as \d. On entry, ptr is |
| pointing at the \. On exit, it is on the final character of the escape |
| sequence. |
| |
| Arguments: |
| ptrptr points to the pattern position pointer |
| errorptr points to the pointer to the error message |
| |
| Returns: zero or positive => a data character |
| negative => a special escape sequence |
| on error, errorptr is set |
| */ |
| |
| static int |
| check_escape(uschar **ptrptr, char **errorptr) |
| { |
| uschar *ptr = *ptrptr; |
| int c = *(++ptr) & 255; /* Ensure > 0 on signed-char systems */ |
| int i; |
| |
| if (c == 0) *errorptr = "\\ at end of pattern"; |
| |
| /* Digits or letters may have special meaning; all others are literals. */ |
| |
| else if (c < '0' || c > 'z') {} |
| |
| /* Do an initial lookup in a table. A non-zero result is something that can be |
| returned immediately. Otherwise further processing may be required. */ |
| |
| else if ((i = escapes[c - '0']) != 0) c = i; |
| |
| /* Escapes that need further processing, or are illegal. */ |
| |
| else switch (c) |
| { |
| case '0': |
| c = 0; |
| while(i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_odigit) != 0 ) |
| c = c * 8 + *(++ptr) - '0'; |
| break; |
| |
| case '1': case '2': case '3': case '4': case '5': |
| case '6': case '7': case '8': case '9': |
| { |
| /* PYTHON: Try to compute an octal value for a character */ |
| for(c=0, i=0; c!=-1 && ptr[i]!=0 && i<3; i++) |
| { |
| if (( pcre_ctypes[ ptr[i] ] & ctype_odigit) != 0) |
| c = c * 8 + ptr[i]-'0'; |
| else |
| c = -1; /* Non-octal character */ |
| } |
| /* Aha! There were 3 octal digits, so it must be a character */ |
| if (c != -1 && i == 3) |
| { |
| ptr += i-1; |
| break; |
| } |
| c = ptr[0]; /* Restore the first character after the \ */ |
| c -= '0'; i = 1; |
| while (i<2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0) |
| { |
| c = c * 10 + ptr[1] - '0'; |
| ptr++; i++; |
| } |
| if (c > 255 - ESC_REF) *errorptr = "back reference too big"; |
| c = -(ESC_REF + c); |
| } |
| break; |
| |
| case 'x': |
| { |
| int length; |
| char *string; |
| PyObject *result; |
| |
| i=1; |
| while (ptr[i]!=0 && |
| ( pcre_ctypes[ptr[i]] & ctype_xdigit) != 0) |
| i++; |
| if (i==1) |
| { |
| *errorptr="\\x must be followed by hex digits"; |
| break; |
| } |
| length=i-1; |
| string=malloc(length+4+1); |
| if (string==NULL) |
| { |
| *errorptr="can't allocate memory for \\x string"; |
| break; |
| } |
| /* Create a string containing "\x<hexdigits>", which will be |
| passed to eval() */ |
| string[0]=string[length+3]='"'; |
| string[1]='\\'; |
| string[length+4]='\0'; |
| memcpy(string+2, ptr, length+1); |
| ptr += length; |
| result=PyRun_String((char *)string, Py_eval_input, |
| PyEval_GetGlobals(), PyEval_GetLocals()); |
| free(string); |
| /* The evaluation raised an exception */ |
| if (result==NULL) |
| { |
| *errorptr="exception occurred during evaluation of \\x"; |
| break; |
| } |
| if (PyString_Size(result)!=1) |
| { |
| Py_DECREF(result); |
| *errorptr="\\x string is not one byte in length"; |
| break; |
| } |
| c=*(unsigned char *)PyString_AsString(result); |
| Py_DECREF(result); |
| break; |
| } |
| break; |
| |
| |
| case 'l': |
| case 'L': |
| case 'u': |
| case 'U': |
| case 'Q': |
| case 'E': |
| *errorptr = "the Perl escapes \\u, \\U, \\l, \\L, \\Q, \\E are not valid"; |
| break; |
| |
| default: |
| /* In Python, an unrecognized escape will simply return the character |
| after the backslash, so do nothing */ |
| break; |
| } |
| |
| *ptrptr = ptr; |
| return c; |
| } |
| |
| |
| |
| /************************************************* |
| * Read repeat counts * |
| *************************************************/ |
| |
| /* Read an item of the form {n,m} and return the values. |
| |
| Arguments: |
| p pointer to first char after '{' |
| minp pointer to int for min |
| maxp pointer to int for max |
| returned as -1 if no max |
| errorptr points to pointer to error message |
| |
| Returns: pointer to '}' on success; |
| current ptr on error, with errorptr set |
| */ |
| |
| static uschar * |
| read_repeat_counts(uschar *p, int *minp, int *maxp, char **errorptr) |
| { |
| int min = 0; |
| int max = -1; |
| |
| if ((pcre_ctypes[*p] & ctype_digit) == 0) |
| { |
| *errorptr = "number expected after {"; |
| return p; |
| } |
| |
| while ((pcre_ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0'; |
| |
| if (*p == '}') max = min; else |
| { |
| if (*p++ != ',') |
| { |
| *errorptr = "comma expected"; |
| return p-1; |
| } |
| if (*p != '}') |
| { |
| max = 0; |
| while((pcre_ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0'; |
| if (*p != '}') |
| { |
| *errorptr = "} expected"; |
| return p; |
| } |
| if (max < min) |
| { |
| *errorptr = "numbers out of order"; |
| return p; |
| } |
| } |
| } |
| |
| /* Do paranoid checks, then fill in the required variables, and pass back the |
| pointer to the terminating '}'. */ |
| |
| if (max == 0) *errorptr = "zero maximum not allowed"; |
| else if (min > 65535 || max > 65535) *errorptr = "number too big"; |
| else |
| { |
| *minp = min; |
| *maxp = max; |
| } |
| return p; |
| } |
| |
| |
| |
| /************************************************* |
| * Compile one branch * |
| *************************************************/ |
| |
| /* Scan the pattern, compiling it into the code vector. |
| |
| Arguments: |
| extended TRUE if the PCRE_EXTENDED option was set |
| brackets points to 2-element bracket vector |
| code points to the pointer to the current code point |
| ptrptr points to the current pattern pointer |
| errorptr points to pointer to error message |
| |
| Returns: TRUE on success |
| FALSE, with *errorptr set on error |
| */ |
| |
| static BOOL |
| compile_branch(BOOL extended, int *brackets, uschar **codeptr, |
| uschar **ptrptr, char **errorptr, PyObject *dictionary) |
| { |
| int repeat_type, op_type; |
| int repeat_min, repeat_max; |
| int bravalue, length; |
| register int c; |
| register uschar *code = *codeptr; |
| uschar *ptr = *ptrptr; |
| uschar *previous = NULL; |
| uschar *oldptr; |
| |
| /* Switch on next character until the end of the branch */ |
| |
| for (;; ptr++) |
| { |
| c = *ptr; |
| if (extended) |
| { |
| if ((pcre_ctypes[c] & ctype_space) != 0) continue; |
| if (c == '#') |
| { |
| while ((c = *(++ptr)) != 0 && c != '\n'); |
| continue; |
| } |
| } |
| |
| switch(c) |
| { |
| /* The branch terminates at end of string, |, or ). */ |
| |
| case 0: |
| case '|': |
| case ')': |
| *codeptr = code; |
| *ptrptr = ptr; |
| return TRUE; |
| |
| /* Handle single-character metacharacters */ |
| |
| case '^': |
| previous = NULL; |
| *code++ = OP_CIRC; |
| break; |
| |
| case '$': |
| previous = NULL; |
| *code++ = OP_DOLL; |
| break; |
| |
| case '.': |
| previous = code; |
| *code++ = OP_ANY; |
| break; |
| |
| /* Character classes. We do quite a bit of munging around here. There are |
| always four initial bytes: the op_code, a flags byte for things like \d, a |
| count of pairs and a count of single characters. The pairs then follow, and |
| finally the single characters. */ |
| |
| case '[': |
| { |
| int rangecount = 0; |
| int flags = 0; |
| int singles_count = 0; |
| char singles[256]; |
| |
| previous = code; |
| |
| /* If the first character is '^', set the negation flag */ |
| |
| if ((c = *(++ptr)) == '^') { *code = OP_NEGCLASS; c = *(++ptr); } |
| else *code = OP_CLASS; |
| code += 4; |
| |
| /* Process characters until ] is reached. By writing this as a "do" it |
| means that an initial ] is taken as a data character. */ |
| |
| do |
| { |
| if (c == 0) |
| { |
| *errorptr = "] missing"; |
| goto FAILED; |
| } |
| |
| /*** Perl treats '-' here as a data character, so PCRE had better |
| do the same ... cut out this diagnosis. |
| |
| if (c == '-') |
| { |
| *errorptr = "unexpected '-' in character class"; |
| goto FAILED; |
| } |
| ... ***/ |
| |
| /* Backslash may introduce a single character, or it may introduce one |
| of the specials, which just set a flag. Escaped items are checked for |
| validity in the pre-compiling pass. The sequence \b is a special case. |
| Inside a class (and only there) it is treated as backslash. Elsewhere |
| it marks a word boundary. */ |
| |
| if (c == '\\') |
| { |
| uschar *save_ptr = ptr+1; |
| c = check_escape(&ptr, errorptr); |
| if (c < 0) |
| { |
| switch (-c) |
| { |
| case ESC_d: flags |= CLASS_DIGITS; continue; |
| case ESC_D: flags |= CLASS_NOT_DIGITS; continue; |
| case ESC_s: flags |= CLASS_WHITESPACE; continue; |
| case ESC_S: flags |= CLASS_NOT_WHITESPACE; continue; |
| case ESC_w: flags |= CLASS_WORD; continue; |
| case ESC_W: flags |= CLASS_NOT_WORD; continue; |
| default: |
| ptr = save_ptr; |
| c = *ptr; |
| break; |
| |
| case ESC_b: c = '\b'; /* Treat as single character */ |
| break; |
| } |
| } |
| /* Fall through if single character */ |
| } |
| |
| /* A single character may be followed by '-' to form a range. However, |
| Perl does not permit ']' to be the end of the range. A '-' character |
| here is treated as a literal. */ |
| |
| if (ptr[1] == '-' && ptr[2] != ']') |
| { |
| int d; |
| ptr += 2; |
| d = *ptr; |
| |
| if (d == 0) |
| { |
| *errorptr = "incomplete range"; |
| goto FAILED; |
| } |
| |
| /* The second part of a range can be a single-character escape, but |
| not any of the other escapes. */ |
| |
| if (d == '\\') |
| { |
| d = check_escape(&ptr, errorptr); |
| if (d < 0) |
| { |
| if (d == -ESC_b) d = '\b'; else |
| { |
| *errorptr = "invalid range"; |
| goto FAILED; |
| } |
| } |
| } |
| |
| if (d < c) |
| { |
| *errorptr = "range out of order"; |
| goto FAILED; |
| } |
| |
| if (rangecount >= 255) |
| { |
| *errorptr = "too many ranges inside []"; |
| goto FAILED; |
| } |
| |
| rangecount++; |
| *code++ = c; |
| *code++ = d; |
| continue; |
| } |
| |
| /* Handle a lone single character: save it up for outputting at the |
| end. Be paranoid and check that the buffer isn't going to overflow. */ |
| |
| if (singles_count >= 255) |
| { |
| *errorptr = "too many characters inside []"; |
| goto FAILED; |
| } |
| singles[singles_count++] = c; |
| } |
| |
| /* Loop until ']' reached; the check for end of string happens inside the |
| loop. This "while" is the end of the "do" above. */ |
| |
| while ((c = *(++ptr)) != ']'); |
| |
| /* Copy saved single characters, which follow the ranges in the output. */ |
| |
| c = 0; |
| while (c < singles_count) *code++ = singles[c++]; |
| |
| /* Finally fill in the flags and counts of ranges and single characters, |
| and advance the pointer past the ]. */ |
| |
| previous[1] = flags; |
| previous[2] = rangecount; |
| previous[3] = singles_count; |
| } |
| break; |
| |
| /* Various kinds of repeat */ |
| |
| case '{': |
| ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr); |
| if (*errorptr != NULL) goto FAILED; |
| goto REPEAT; |
| |
| case '*': |
| repeat_min = 0; |
| repeat_max = -1; |
| goto REPEAT; |
| |
| case '+': |
| repeat_min = 1; |
| repeat_max = -1; |
| goto REPEAT; |
| |
| case '?': |
| repeat_min = 0; |
| repeat_max = 1; |
| |
| REPEAT: |
| if (previous == NULL) |
| { |
| *errorptr = "nothing to repeat"; |
| goto FAILED; |
| } |
| |
| /* If the next character is '?' this is a minimizing repeat. Advance to the |
| next character. */ |
| |
| if (ptr[1] == '?') { repeat_type = 1; ptr++; } else repeat_type = 0; |
| |
| /* If previous was a string of characters, chop off the last one and use it |
| as the subject of the repeat. If there was only one character, we can |
| abolish the previous item altogether. */ |
| |
| if (*previous == OP_CHARS) |
| { |
| int len = previous[1]; |
| if (len == 1) |
| { |
| c = previous[2]; |
| code = previous; |
| } |
| else |
| { |
| c = previous[len+1]; |
| previous[1]--; |
| code--; |
| } |
| op_type = 0; /* Use single-char op codes */ |
| goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */ |
| } |
| |
| /* If previous was a character type match (\d or similar), abolish it and |
| create a suitable repeat item. The code is shared with single-character |
| repeats by adding a suitable offset into repeat_type. */ |
| |
| if ((int)*previous < OP_EOD || *previous == OP_ANY) |
| { |
| op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */ |
| c = *previous; |
| code = previous; |
| |
| OUTPUT_SINGLE_REPEAT: |
| repeat_type += op_type; /* Combine both values for many cases */ |
| |
| /* A minimum of zero is handled either as the special case * or ?, or as |
| an UPTO, with the maximum given. */ |
| |
| if (repeat_min == 0) |
| { |
| if (repeat_max == -1) *code++ = OP_STAR + repeat_type; |
| else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type; |
| else |
| { |
| *code++ = OP_UPTO + repeat_type; |
| *code++ = repeat_max >> 8; |
| *code++ = (repeat_max & 255); |
| } |
| } |
| |
| /* The case {1,} is handled as the special case + */ |
| |
| else if (repeat_min == 1 && repeat_max == -1) |
| *code++ = OP_PLUS + repeat_type; |
| |
| /* The case {n,n} is just an EXACT, while the general case {n,m} is |
| handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */ |
| |
| else |
| { |
| if (repeat_min != 1) |
| { |
| *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */ |
| *code++ = repeat_min >> 8; |
| *code++ = (repeat_min & 255); |
| } |
| |
| /* If the mininum is 1 and the previous item was a character string, |
| we either have to put back the item that got cancelled if the string |
| length was 1, or add the character back onto the end of a longer |
| string. For a character type nothing need be done; it will just get put |
| back naturally. */ |
| |
| else if (*previous == OP_CHARS) |
| { |
| if (code == previous) code += 2; else previous[1]++; |
| } |
| |
| /* Insert an UPTO if the max is greater than the min. */ |
| |
| if (repeat_max != repeat_min) |
| { |
| *code++ = c; |
| repeat_max -= repeat_min; |
| *code++ = OP_UPTO + repeat_type; |
| *code++ = repeat_max >> 8; |
| *code++ = (repeat_max & 255); |
| } |
| } |
| |
| /* The character or character type itself comes last in all cases. */ |
| |
| *code++ = c; |
| } |
| |
| /* If previous was a character class or a back reference, we put the repeat |
| stuff after it. */ |
| |
| else if (*previous == OP_CLASS || *previous == OP_NEGCLASS || |
| *previous == OP_REF) |
| { |
| if (repeat_min == 0 && repeat_max == -1) |
| *code++ = OP_CRSTAR + repeat_type; |
| else if (repeat_min == 1 && repeat_max == -1) |
| *code++ = OP_CRPLUS + repeat_type; |
| else if (repeat_min == 0 && repeat_max == 1) |
| *code++ = OP_CRQUERY + repeat_type; |
| else |
| { |
| *code++ = OP_CRRANGE + repeat_type; |
| *code++ = repeat_min >> 8; |
| *code++ = repeat_min & 255; |
| if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */ |
| *code++ = repeat_max >> 8; |
| *code++ = repeat_max & 255; |
| } |
| } |
| |
| /* If previous was a bracket group, we may have to replicate it in certain |
| cases. If the maximum repeat count is unlimited, check that the bracket |
| group cannot match the empty string, and diagnose an error if it can. */ |
| |
| else if ((int)*previous >= OP_BRA) |
| { |
| int i; |
| int length = code - previous; |
| |
| if (repeat_max == -1 && could_be_empty(previous)) |
| { |
| *errorptr = "operand of unlimited repeat could match the empty string"; |
| goto FAILED; |
| } |
| |
| /* If the minimum is greater than zero, and the maximum is unlimited or |
| equal to the minimum, the first copy remains where it is, and is |
| replicated up to the minimum number of times. This case includes the + |
| repeat, but of course no replication is needed in that case. */ |
| |
| if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min)) |
| { |
| for (i = 1; i < repeat_min; i++) |
| { |
| memcpy(code, previous, length); |
| code += length; |
| } |
| } |
| |
| /* If the minimum is zero, stick BRAZERO in front of the first copy. |
| Then, if there is a fixed upper limit, replicated up to that many times, |
| sticking BRAZERO in front of all the optional ones. */ |
| |
| else |
| { |
| if (repeat_min == 0) |
| { |
| memmove(previous+1, previous, length); |
| code++; |
| *previous++ = OP_BRAZERO + repeat_type; |
| } |
| |
| for (i = 1; i < repeat_min; i++) |
| { |
| memcpy(code, previous, length); |
| code += length; |
| } |
| |
| for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++) |
| { |
| *code++ = OP_BRAZERO + repeat_type; |
| memcpy(code, previous, length); |
| code += length; |
| } |
| } |
| |
| /* If the maximum is unlimited, set a repeater in the final copy. */ |
| |
| if (repeat_max == -1) code[-3] = OP_KETRMAX + repeat_type; |
| } |
| |
| /* Else there's some kind of shambles */ |
| |
| else |
| { |
| *errorptr = "internal error 1 (unexpected repeat)"; |
| goto FAILED; |
| } |
| |
| /* In all case we no longer have a previous item. */ |
| |
| previous = NULL; |
| break; |
| |
| |
| /* Start of nested bracket sub-expression, or comment or lookahead. |
| First deal with special things that can come after a bracket; all are |
| introduced by ?, and the appearance of any of them means that this is not a |
| referencing group. They were checked for validity in the first pass over |
| the string, so we don't have to check for syntax errors here. */ |
| |
| case '(': |
| previous = code; /* Only real brackets can be repeated */ |
| if (*(++ptr) == '?') |
| { |
| bravalue = OP_BRA; |
| |
| switch (*(++ptr)) |
| { |
| case '#': |
| case 'i': |
| case 'm': |
| case 's': |
| case 'x': |
| ptr++; |
| while (*ptr != ')') ptr++; |
| previous = NULL; |
| continue; |
| |
| case ':': /* Non-extracting bracket */ |
| ptr++; |
| break; |
| |
| case '=': /* Assertions can't be repeated */ |
| bravalue = OP_ASSERT; |
| ptr++; |
| previous = NULL; |
| break; |
| |
| case '!': |
| bravalue = OP_ASSERT_NOT; |
| ptr++; |
| previous = NULL; |
| break; |
| |
| case ('P'): |
| ptr++; |
| if (*ptr=='<') |
| { |
| /* (?P<groupname>...) */ |
| int idlen; |
| PyObject *string, *intobj; |
| |
| ptr++; |
| idlen = get_group_id(ptr, '>', errorptr); |
| if (*errorptr) { |
| goto FAILED; |
| } |
| string = PyString_FromStringAndSize(ptr, idlen); |
| intobj = PyInt_FromLong( brackets[0] ); |
| if (intobj == NULL || string==NULL) |
| { |
| Py_XDECREF(string); |
| Py_XDECREF(intobj); |
| *errorptr = "exception raised"; |
| goto FAILED; |
| } |
| PyDict_SetItem(dictionary, string, intobj); |
| Py_DECREF(string); Py_DECREF(intobj); |
| ptr += idlen+1; /* Point to rest of expression */ |
| goto do_grouping_bracket; |
| } |
| if (*ptr=='=') |
| { |
| /* (?P=groupname) */ |
| int idlen, refnum; |
| PyObject *string, *intobj; |
| |
| ptr++; |
| idlen = get_group_id(ptr, ')', errorptr); |
| if (*errorptr) { |
| goto FAILED; |
| } |
| string = PyString_FromStringAndSize(ptr, idlen); |
| if (string==NULL) { |
| Py_XDECREF(string); |
| *errorptr = "exception raised"; |
| goto FAILED; |
| } |
| intobj = PyDict_GetItem(dictionary, string); |
| if (intobj==NULL) { |
| Py_DECREF(string); |
| *errorptr = "?P= group identifier isn't defined"; |
| goto FAILED; |
| } |
| |
| refnum = PyInt_AsLong(intobj); |
| Py_DECREF(string); Py_DECREF(intobj); |
| *code++ = OP_REF; |
| *code++ = refnum; |
| /* The continue will cause the top-level for() loop to |
| be resumed, so ptr will be immediately incremented. |
| Therefore, the following line adds just idlen, not |
| idlen+1 */ |
| ptr += idlen; |
| continue; |
| } |
| /* The character after ?P is neither < nor =, so |
| report an error. Add more Python-extensions here. */ |
| *errorptr="unknown after (?P"; |
| goto FAILED; |
| break; |
| default: |
| *errorptr = "unknown after (?"; |
| goto FAILED; |
| } |
| } |
| |
| /* Else we have a referencing group */ |
| |
| else |
| { |
| do_grouping_bracket: |
| if (brackets[0] > EXTRACT_MAX) |
| { |
| *errorptr = "too many extraction brackets"; |
| goto FAILED; |
| } |
| brackets[1] = brackets[0]; |
| bravalue = OP_BRA + brackets[0]++; |
| } |
| |
| /* Process nested bracketed re; at end pointer is on the bracket. We copy |
| code into a non-register variable in order to be able to pass its address |
| because some compilers complain otherwise. */ |
| |
| *code = bravalue; |
| { |
| uschar *mcode = code; |
| if (!compile_regexp(extended, brackets, &mcode, &ptr, errorptr, dictionary)) |
| goto FAILED; |
| code = mcode; |
| } |
| |
| if (*ptr != ')') |
| { |
| *errorptr = "missing )"; |
| goto FAILED; |
| } |
| break; |
| |
| /* Check \ for being a real metacharacter; if not, fall through and handle |
| it as a data character at the start of a string. Escape items are checked |
| for validity in the pre-compiling pass. */ |
| |
| case '\\': |
| oldptr = ptr; |
| c = check_escape(&ptr, errorptr); |
| |
| /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values |
| are arranged to be the negation of the corresponding OP_values. For the |
| back references, the values are ESC_REF plus the reference number. Only |
| back references and those types that consume a character may be repeated. |
| We can test for values between ESC_b and ESC_Z for the latter; this may |
| have to change if any new ones are ever created. */ |
| |
| if (c < 0) |
| { |
| if (-c >= ESC_REF) |
| { |
| int refnum = -c -ESC_REF; |
| if (brackets[1] < refnum ) { |
| *errorptr = "backreference to non-existent group"; |
| goto FAILED; |
| } |
| previous = code; |
| *code++ = OP_REF; |
| *code++ = refnum; |
| } |
| else |
| { |
| previous = (-c > ESC_b && -c < ESC_Z)? code : NULL; |
| *code++ = -c; |
| } |
| continue; |
| } |
| |
| /* Reset and fall through */ |
| |
| ptr = oldptr; |
| c = '\\'; |
| |
| /* Handle a run of data characters until a metacharacter is encountered. |
| The first character is guaranteed not to be whitespace or # when the |
| extended flag is set. */ |
| |
| default: |
| previous = code; |
| *code = OP_CHARS; |
| code += 2; |
| length = 0; |
| |
| do |
| { |
| if (extended) |
| { |
| if ((pcre_ctypes[c] & ctype_space) != 0) continue; |
| if (c == '#') |
| { |
| while ((c = *(++ptr)) != 0 && c != '\n'); |
| if (c == 0) break; |
| continue; |
| } |
| } |
| |
| /* Backslash may introduce a data char or a metacharacter. Escaped items |
| are checked for validity in the pre-compiling pass. Stop the string |
| before a metaitem. */ |
| |
| if (c == '\\') |
| { |
| oldptr = ptr; |
| c = check_escape(&ptr, errorptr); |
| if (c < 0) { ptr = oldptr; break; } |
| } |
| |
| /* Ordinary character or single-char escape */ |
| |
| *code++ = c; |
| length++; |
| } |
| |
| /* This "while" is the end of the "do" above. */ |
| |
| while (length < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0); |
| |
| /* Compute the length and set it in the data vector, and advance to |
| the next state. */ |
| |
| previous[1] = length; |
| ptr--; |
| break; |
| } |
| } /* end of big loop */ |
| |
| /* Control never reaches here by falling through, only by a goto for all the |
| error states. Pass back the position in the pattern so that it can be displayed |
| to the user for diagnosing the error. */ |
| |
| FAILED: |
| *ptrptr = ptr; |
| return FALSE; |
| } |
| |
| |
| |
| |
| /************************************************* |
| * Compile sequence of alternatives * |
| *************************************************/ |
| |
| /* On entry, ptr is pointing past the bracket character, but on return |
| it points to the closing bracket, or vertical bar, or end of string. |
| The code variable is pointing at the byte into which the BRA operator has been |
| stored. |
| |
| Argument: |
| extended TRUE if PCRE_EXTENDED was set |
| brackets -> 2-element vector containing next and top bracket numbers |
| codeptr -> the address of the current code pointer |
| ptrptr -> the address of the current pattern pointer |
| errorptr -> pointer to error message |
| |
| Returns: TRUE on success |
| */ |
| |
| static BOOL |
| compile_regexp(BOOL extended, int *brackets, uschar **codeptr, |
| uschar **ptrptr, char **errorptr, PyObject *dictionary) |
| { |
| uschar *ptr = *ptrptr; |
| uschar *code = *codeptr; |
| uschar *start_bracket = code; |
| |
| for (;;) |
| { |
| int length; |
| uschar *last_branch = code; |
| |
| code += 3; |
| if (!compile_branch(extended, brackets, &code, &ptr, errorptr, dictionary)) |
| { |
| *ptrptr = ptr; |
| return FALSE; |
| } |
| |
| /* Fill in the length of the last branch */ |
| |
| length = code - last_branch; |
| last_branch[1] = length >> 8; |
| last_branch[2] = length & 255; |
| |
| /* Reached end of expression, either ')' or end of pattern. Insert a |
| terminating ket and the length of the whole bracketed item, and return, |
| leaving the pointer at the terminating char. */ |
| |
| if (*ptr != '|') |
| { |
| length = code - start_bracket; |
| *code++ = OP_KET; |
| *code++ = length >> 8; |
| *code++ = length & 255; |
| *codeptr = code; |
| *ptrptr = ptr; |
| return TRUE; |
| } |
| |
| /* Another branch follows; insert an "or" node and advance the pointer. */ |
| |
| *code = OP_ALT; |
| ptr++; |
| } |
| /* Control never reaches here */ |
| } |
| |
| |
| |
| /************************************************* |
| * Check for anchored expression * |
| *************************************************/ |
| |
| /* Try to find out if this is an anchored regular expression. Consider each |
| alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket |
| all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then |
| it's anchored. However, if this is a multiline pattern, then only OP_SOD |
| counts, since OP_CIRC can match in the middle. |
| |
| A branch is also implicitly anchored if it starts with .* because that will try |
| the rest of the pattern at all possible matching points, so there is no point |
| trying them again. |
| |
| Argument: points to start of expression (the bracket) |
| Returns: TRUE or FALSE |
| */ |
| |
| static BOOL |
| is_anchored(register uschar *code, BOOL multiline) |
| { |
| do { |
| int op = (int)code[3]; |
| if (op >= OP_BRA || op == OP_ASSERT) |
| { if (!is_anchored(code+3, multiline)) return FALSE; } |
| else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR) |
| { if (code[4] != OP_ANY) return FALSE; } |
| else if (op != OP_SOD && (multiline || op != OP_CIRC)) return FALSE; |
| code += (code[1] << 8) + code[2]; |
| } |
| while (*code == OP_ALT); |
| return TRUE; |
| } |
| |
| |
| |
| /************************************************* |
| * Check for start with \n line expression * |
| *************************************************/ |
| |
| /* This is called for multiline expressions to try to find out if every branch |
| starts with ^ so that "first char" processing can be done to speed things up. |
| |
| Argument: points to start of expression (the bracket) |
| Returns: TRUE or FALSE |
| */ |
| |
| static BOOL |
| is_startline(uschar *code) |
| { |
| do { |
| if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT) |
| { if (!is_startline(code+3)) return FALSE; } |
| else if (code[3] != OP_CIRC) return FALSE; |
| code += (code[1] << 8) + code[2]; |
| } |
| while (*code == OP_ALT); |
| return TRUE; |
| } |
| |
| |
| |
| /************************************************* |
| * Check for fixed first char * |
| *************************************************/ |
| |
| /* Try to find out if there is a fixed first character. This is called for |
| unanchored expressions, as it speeds up their processing quite considerably. |
| Consider each alternative branch. If they all start with the same char, or with |
| a bracket all of whose alternatives start with the same char (recurse ad lib), |
| then we return that char, otherwise -1. |
| |
| Argument: points to start of expression (the bracket) |
| Returns: -1 or the fixed first char |
| */ |
| |
| static int |
| find_firstchar(uschar *code) |
| { |
| register int c = -1; |
| do |
| { |
| register int charoffset = 4; |
| |
| if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT) |
| { |
| register int d; |
| if ((d = find_firstchar(code+3)) < 0) return -1; |
| if (c < 0) c = d; else if (c != d) return -1; |
| } |
| |
| else switch(code[3]) |
| { |
| default: |
| return -1; |
| |
| case OP_EXACT: /* Fall through */ |
| charoffset++; |
| |
| case OP_CHARS: /* Fall through */ |
| charoffset++; |
| |
| case OP_PLUS: |
| case OP_MINPLUS: |
| if (c < 0) c = code[charoffset]; else if (c != code[charoffset]) return -1; |
| break; |
| } |
| code += (code[1] << 8) + code[2]; |
| } |
| while (*code == OP_ALT); |
| return c; |
| } |
| |
| |
| |
| /************************************************* |
| * Compile a Regular Expression * |
| *************************************************/ |
| |
| /* This function takes a string and returns a pointer to a block of store |
| holding a compiled version of the expression. |
| |
| Arguments: |
| pattern the regular expression |
| options various option bits |
| errorptr pointer to pointer to error text |
| erroroffset ptr offset in pattern where error was detected |
| |
| Returns: pointer to compiled data block, or NULL on error, |
| with errorptr and erroroffset set |
| */ |
| |
| pcre * |
| pcre_compile(char *pattern, int options, char **errorptr, int |
| *erroroffset, PyObject *dictionary) |
| { |
| real_pcre *re; |
| int spaces = 0; |
| int length = 3; /* For initial BRA plus length */ |
| int runlength; |
| int c, size; |
| int brackets[2]; |
| int brastack[200]; |
| int brastackptr = 0; |
| BOOL extended = (options & PCRE_EXTENDED) != 0; |
| uschar *code, *ptr; |
| |
| #ifdef DEBUG |
| uschar *code_base, *code_end; |
| #endif |
| |
| /* Miscellaneous initialization; the copy the error pointers into static |
| variables so all functions can access them. */ |
| |
| brackets[0] = 1; /* Next bracket number */ |
| brackets[1] = 0; /* Highest used bracket number */ |
| |
| *errorptr = NULL; |
| *erroroffset = 0; |
| |
| if ((options & ~PUBLIC_OPTIONS) != 0) |
| { |
| *errorptr = "unknown option bit(s) set"; |
| return NULL; |
| } |
| |
| #ifdef DEBUG |
| printf("------------------------------------------------------------------\n"); |
| printf("%s\n", pattern); |
| #endif |
| |
| /* The first thing to do is to make a pass over the pattern to compute the |
| amount of store required to hold the compiled code. This does not have to be |
| perfect as long as errors are overestimates. At the same time we can detect any |
| internal flag settings. Make an attempt to correct for any counted white space |
| if an "extended" flag setting appears late in the pattern. We can't be so |
| clever for #-comments. */ |
| |
| ptr = (uschar *)(pattern - 1); |
| while ((c = *(++ptr)) != 0) |
| { |
| int min, max; |
| |
| if ((pcre_ctypes[c] & ctype_space) != 0) |
| { |
| if (extended) continue; |
| spaces++; |
| } |
| |
| if (extended && c == '#') |
| { |
| while ((c = *(++ptr)) != 0 && c != '\n'); |
| continue; |
| } |
| |
| switch(c) |
| { |
| /* A backslashed item may be an escaped "normal" character or a |
| character type. For a "normal" character, put the pointers and |
| character back so that tests for whitespace etc. in the input |
| are done correctly. */ |
| |
| case '\\': |
| { |
| uschar *save_ptr = ptr; |
| c = check_escape(&ptr, errorptr); |
| if (*errorptr != NULL) goto PCRE_ERROR_RETURN; |
| if (c >= 0) |
| { |
| ptr = save_ptr; |
| c = '\\'; |
| goto NORMAL_CHAR; |
| } |
| } |
| length++; |
| |
| /* A back reference needs an additional char, plus either one or 5 |
| bytes for a repeat. */ |
| |
| if (c <= -ESC_REF) |
| { |
| length++; /* For single back reference */ |
| if (ptr[1] == '{') |
| { |
| ptr = read_repeat_counts(ptr+2, &min, &max, errorptr); |
| if (*errorptr != NULL) goto PCRE_ERROR_RETURN; |
| if ((min == 0 && (max == 1 || max == -1)) || |
| (min == 1 && max == -1)) |
| length++; |
| else length += 5; |
| if (ptr[1] == '?') ptr++; |
| } |
| } |
| continue; |
| |
| case '^': |
| case '.': |
| case '$': |
| case '*': /* These repeats won't be after brackets; */ |
| case '+': /* those are handled separately */ |
| case '?': |
| length++; |
| continue; |
| |
| /* This covers the cases of repeats after a single char, metachar, class, |
| or back reference. */ |
| |
| case '{': |
| ptr = read_repeat_counts(ptr+1, &min, &max, errorptr); |
| if (*errorptr != NULL) goto PCRE_ERROR_RETURN; |
| if ((min == 0 && (max == 1 || max == -1)) || |
| (min == 1 && max == -1)) |
| length++; |
| else |
| { |
| length--; /* Uncount the original char or metachar */ |
| if (min == 1) length++; else if (min > 0) length += 4; |
| if (max > 0) length += 4; else length += 2; |
| } |
| if (ptr[1] == '?') ptr++; |
| continue; |
| |
| /* An alternation contains an offset to the next branch or ket. */ |
| case '|': |
| length += 3; |
| continue; |
| |
| /* A character class uses 4 characters plus the characters in it. Don't |
| worry about character types that aren't allowed in classes - they'll get |
| picked up during the compile. */ |
| |
| case '[': |
| length += 4; |
| if (ptr[1] == '^') ptr++; |
| do |
| { |
| if (*(++ptr) == '\\') |
| { |
| (void)check_escape(&ptr, errorptr); |
| if (*errorptr != NULL) goto PCRE_ERROR_RETURN; |
| } |
| length++; |
| } |
| while (*ptr != 0 && *ptr != ']'); |
| |
| /* A repeat needs either 1 or 5 bytes. */ |
| |
| if (ptr[1] == '{') |
| { |
| ptr = read_repeat_counts(ptr+2, &min, &max, errorptr); |
| if (*errorptr != NULL) goto PCRE_ERROR_RETURN; |
| if ((min == 0 && (max == 1 || max == -1)) || |
| (min == 1 && max == -1)) |
| length++; |
| else length += 5; |
| if (ptr[1] == '?') ptr++; |
| } |
| continue; |
| |
| /* Brackets may be genuine groups or special things */ |
| |
| case '(': |
| |
| /* Handle special forms of bracket, which all start (? */ |
| |
| if (ptr[1] == '?') switch (c = ptr[2]) |
| { |
| /* Skip over comments entirely */ |
| case '#': |
| ptr += 3; |
| while (*ptr != 0 && *ptr != ')') ptr++; |
| if (*ptr == 0) |
| { |
| *errorptr = "missing ) after comment"; |
| goto PCRE_ERROR_RETURN; |
| } |
| continue; |
| |
| /* Non-referencing groups and lookaheads just move the pointer on, and |
| then behave like a non-special bracket. */ |
| |
| case ':': |
| case '=': |
| case '!': |
| ptr += 2; |
| break; |
| |
| /* Else loop setting valid options until ) is met. Anything else is an |
| error. */ |
| |
| case ('P'): |
| { |
| int idlen; |
| switch (*ptr++) { |
| case ('<'): |
| idlen = get_group_id(ptr++, '>', errorptr); |
| if (*errorptr) goto PCRE_ERROR_RETURN; |
| ptr += idlen+1; |
| break; |
| case ('='): |
| idlen = get_group_id(ptr++, ')', errorptr); |
| if (*errorptr) goto PCRE_ERROR_RETURN; |
| ptr += idlen+1; |
| length++; |
| break; |
| } |
| } |
| break; |
| |
| default: |
| ptr += 2; |
| for (;; ptr++) |
| { |
| if ((c = *ptr) == 'i') |
| { |
| options |= PCRE_CASELESS; |
| continue; |
| } |
| else if ((c = *ptr) == 'm') |
| { |
| options |= PCRE_MULTILINE; |
| continue; |
| } |
| else if ((c = *ptr) == 's') |
| { |
| options |= PCRE_DOTALL; |
| continue; |
| } |
| else if (c == 'x') |
| { |
| options |= PCRE_EXTENDED; |
| extended = TRUE; |
| length -= spaces; /* Already counted spaces */ |
| continue; |
| } |
| else if (c == ')') break; |
| |
| *errorptr = "undefined after (?"; |
| goto PCRE_ERROR_RETURN; |
| } |
| continue; /* End of this bracket handling */ |
| } |
| |
| /* Non-special forms of bracket. Save length for computing whole length |
| at end if there's a repeat that requires duplication of the group. */ |
| |
| if (brastackptr >= sizeof(brastack)/sizeof(int)) |
| { |
| *errorptr = "too many brackets"; |
| goto PCRE_ERROR_RETURN; |
| } |
| |
| brastack[brastackptr++] = length; |
| length += 3; |
| continue; |
| |
| /* Handle ket. Look for subsequent max/min; for certain sets of values we |
| have to replicate this bracket up to that many times. */ |
| |
| case ')': |
| length += 3; |
| { |
| int min = 1; |
| int max = 1; |
| int duplength = length - brastack[--brastackptr]; |
| |
| /* Leave ptr at the final char; for read_repeat_counts this happens |
| automatically; for the others we need an increment. */ |
| |
| if ((c = ptr[1]) == '{') |
| { |
| ptr = read_repeat_counts(ptr+2, &min, &max, errorptr); |
| if (*errorptr != NULL) goto PCRE_ERROR_RETURN; |
| } |
| else if (c == '*') { min = 0; max = -1; ptr++; } |
| else if (c == '+') { max = -1; ptr++; } |
| else if (c == '?') { min = 0; ptr++; } |
| |
| /* If there is a minimum > 1 we have to replicate up to min-1 times; if |
| there is a limited maximum we have to replicate up to max-1 times and |
| allow for a BRAZERO item before each optional copy, as we also have to |
| do before the first copy if the minimum is zero. */ |
| |
| if (min == 0) length++; |
| else if (min > 1) length += (min - 1) * duplength; |
| if (max > min) length += (max - min) * (duplength + 1); |
| } |
| |
| continue; |
| |
| /* Non-special character. For a run of such characters the length required |
| is the number of characters + 2, except that the maximum run length is 255. |
| We won't get a skipped space or a non-data escape or the start of a # |
| comment as the first character, so the length can't be zero. */ |
| |
| NORMAL_CHAR: |
| default: |
| length += 2; |
| runlength = 0; |
| do |
| { |
| if ((pcre_ctypes[c] & ctype_space) != 0) |
| { |
| if (extended) continue; |
| spaces++; |
| } |
| |
| if (extended && c == '#') |
| { |
| while ((c = *(++ptr)) != 0 && c != '\n'); |
| continue; |
| } |
| |
| /* Backslash may introduce a data char or a metacharacter; stop the |
| string before the latter. */ |
| |
| if (c == '\\') |
| { |
| uschar *saveptr = ptr; |
| c = check_escape(&ptr, errorptr); |
| if (*errorptr != NULL) goto PCRE_ERROR_RETURN; |
| if (c < 0) { ptr = saveptr; break; } |
| } |
| |
| /* Ordinary character or single-char escape */ |
| |
| runlength++; |
| } |
| |
| /* This "while" is the end of the "do" above. */ |
| |
| while (runlength < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0); |
| |
| ptr--; |
| length += runlength; |
| continue; |
| } |
| } |
| |
| length += 4; /* For final KET and END */ |
| |
| if (length > 65539) |
| { |
| *errorptr = "regular expression too large"; |
| return NULL; |
| } |
| |
| /* Compute the size of data block needed and get it, either from malloc or |
| externally provided function. Put in the magic number and the options. */ |
| |
| size = length + sizeof(real_pcre) - sizeof(re->code); |
| re = (real_pcre *)(pcre_malloc)(size); |
| |
| if (re == NULL) |
| { |
| *errorptr = "failed to get memory"; |
| return NULL; |
| } |
| |
| re->magic_number = MAGIC_NUMBER; |
| re->options = options; |
| |
| /* Set up a starting, non-extracting bracket, then compile the expression. On |
| error, *errorptr will be set non-NULL, so we don't need to look at the result |
| of the function here. */ |
| |
| ptr = (uschar *)pattern; |
| code = re->code; |
| *code = OP_BRA; |
| (void)compile_regexp(extended, brackets, &code, &ptr, errorptr, dictionary); |
| re->top_bracket = brackets[1]; |
| |
| /* If not reached end of pattern on success, there's an excess bracket. */ |
| |
| if (*errorptr == NULL && *ptr != 0) *errorptr = "unmatched brackets"; |
| /* Fill in the terminating state and check for disastrous overflow, but |
| if debugging, leave the test till after things are printed out. */ |
| |
| *code++ = OP_END; |
| |
| #ifndef DEBUG |
| if (code - re->code > length) *errorptr = "internal error: code overflow"; |
| #endif |
| |
| /* Failed to compile */ |
| |
| if (*errorptr != NULL) |
| { |
| (pcre_free)(re); |
| PCRE_ERROR_RETURN: |
| *erroroffset = ptr - (uschar *)pattern; |
| return NULL; |
| } |
| |
| /* If the anchored option was not passed, set flag if we can determine that it |
| is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if |
| we can determine what the first character has to be, because that speeds up |
| unanchored matches no end. In the case of multiline matches, an alternative is |
| to set the PCRE_STARTLINE flag if all branches start with ^. */ |
| |
| if ((options & PCRE_ANCHORED) == 0) |
| { |
| if (is_anchored(re->code, (options & PCRE_MULTILINE) != 0)) |
| re->options |= PCRE_ANCHORED; |
| else |
| { |
| int c = find_firstchar(re->code); |
| if (c >= 0) |
| { |
| re->first_char = c; |
| re->options |= PCRE_FIRSTSET; |
| } |
| else if (is_startline(re->code)) |
| re->options |= PCRE_STARTLINE; |
| } |
| } |
| |
| /* Print out the compiled data for debugging */ |
| |
| #ifdef DEBUG |
| |
| printf("Length = %d top_bracket = %d%s%s%s%s\n", |
| length, re->top_bracket, |
| ((re->options & PCRE_ANCHORED) != 0)? " anchored" : "", |
| ((re->options & PCRE_CASELESS) != 0)? " caseless" : "", |
| extended? " extended" : "", |
| ((re->options & PCRE_MULTILINE) != 0)? " multiline" : ""); |
| |
| if ((re->options & PCRE_FIRSTSET) != 0) |
| { |
| if (isprint(re->first_char)) printf("First char = %c\n", re->first_char); |
| else printf("First char = \\x%02x\n", re->first_char); |
| } |
| |
| code_end = code; |
| code_base = code = re->code; |
| |
| while (code < code_end) |
| { |
| int charlength; |
| |
| printf("%3d ", code - code_base); |
| |
| if (*code >= OP_BRA) |
| { |
| printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA); |
| code += 2; |
| } |
| |
| else switch(*code) |
| { |
| case OP_CHARS: |
| charlength = *(++code); |
| printf("%3d ", charlength); |
| while (charlength-- > 0) |
| if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c); |
| break; |
| |
| case OP_KETRMAX: |
| case OP_KETRMIN: |
| case OP_ALT: |
| case OP_KET: |
| case OP_ASSERT: |
| case OP_ASSERT_NOT: |
| printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]); |
| code += 2; |
| break; |
| |
| case OP_STAR: |
| case OP_MINSTAR: |
| case OP_PLUS: |
| case OP_MINPLUS: |
| case OP_QUERY: |
| case OP_MINQUERY: |
| case OP_TYPESTAR: |
| case OP_TYPEMINSTAR: |
| case OP_TYPEPLUS: |
| case OP_TYPEMINPLUS: |
| case OP_TYPEQUERY: |
| case OP_TYPEMINQUERY: |
| if (*code >= OP_TYPESTAR) |
| printf(" %s", OP_names[code[1]]); |
| else if (isprint(c = code[1])) printf(" %c", c); |
| else printf(" \\x%02x", c); |
| printf("%s", OP_names[*code++]); |
| break; |
| |
| case OP_EXACT: |
| case OP_UPTO: |
| case OP_MINUPTO: |
| if (isprint(c = code[3])) printf(" %c{", c); |
| else printf(" \\x%02x{", c); |
| if (*code != OP_EXACT) printf(","); |
| printf("%d}", (code[1] << 8) + code[2]); |
| if (*code == OP_MINUPTO) printf("?"); |
| code += 3; |
| break; |
| |
| case OP_TYPEEXACT: |
| case OP_TYPEUPTO: |
| case OP_TYPEMINUPTO: |
| printf(" %s{", OP_names[code[3]]); |
| if (*code != OP_TYPEEXACT) printf(","); |
| printf("%d}", (code[1] << 8) + code[2]); |
| if (*code == OP_TYPEMINUPTO) printf("?"); |
| code += 3; |
| break; |
| |
| case OP_REF: |
| printf(" \\%d", *(++code)); |
| break; |
| |
| case OP_CLASS: |
| case OP_NEGCLASS: |
| { |
| int i, min, max; |
| int flags = code[1]; |
| int rangecount = code[2]; |
| int charcount = code[3]; |
| |
| printf(" [%s", (*code == OP_CLASS)? "" : "^"); |
| code += 3; |
| |
| for (i = 0; i < 8; i++) |
| if ((flags & (1 << i)) != 0) printf("%s", class_names[i]); |
| |
| for (i = 0; i < rangecount; i++) |
| { |
| if (isprint(*(++code))) printf("%c-", *code); else printf("\\x%02x-", *code); |
| if (isprint(*(++code))) printf("%c", *code); else printf("\\x%02x", *code); |
| } |
| |
| for (i = 0; i < charcount; i++) |
| { |
| if (!isprint(*(++code))) printf("\\x%02x", *code); |
| else if (strchr("-\\]", *code) != NULL) printf("\\%c", *code); |
| else printf("%c", *code); |
| } |
| printf("]"); |
| |
| switch(*(++code)) |
| { |
| case OP_CRSTAR: |
| case OP_CRMINSTAR: |
| case OP_CRPLUS: |
| case OP_CRMINPLUS: |
| case OP_CRQUERY: |
| case OP_CRMINQUERY: |
| printf("%s", OP_names[*code]); |
| break; |
| |
| case OP_CRRANGE: |
| case OP_CRMINRANGE: |
| min = (code[1] << 8) + code[2]; |
| max = (code[3] << 8) + code[4]; |
| if (max == 0) printf("{%d,}", min); |
| else printf("{%d,%d}", min, max); |
| if (*code == OP_CRMINRANGE) printf("?"); |
| code += 4; |
| break; |
| |
| default: |
| code--; |
| } |
| } |
| break; |
| |
| /* Anything else is just a one-node item */ |
| |
| default: |
| printf(" %s", OP_names[*code]); |
| break; |
| } |
| |
| code++; |
| printf("\n"); |
| } |
| printf("------------------------------------------------------------------\n"); |
| |
| /* This check is done here in the debugging case so that the code that |
| was compiled can be seen. */ |
| |
| if (code - re->code > length) |
| { |
| *errorptr = "internal error: code overflow"; |
| (pcre_free)(re); |
| *erroroffset = ptr - (uschar *)pattern; |
| return NULL; |
| } |
| #endif |
| |
| return (pcre *)re; |
| } |
| |
| |
| |
| /************************************************* |
| * Match a character type * |
| *************************************************/ |
| |
| /* Not used in all the places it might be as it's sometimes faster |
| to put the code inline. |
| |
| Arguments: |
| type the character type |
| c the character |
| multiline the multiline flag |
| |
| Returns: TRUE if character is of the type |
| */ |
| |
| static BOOL |
| match_type(int type, int c, BOOL dotall) |
| { |
| |
| #ifdef DEBUG |
| if (isprint(c)) printf("matching subject %c against ", c); |
| else printf("matching subject \\x%02x against ", c); |
| printf("%s\n", OP_names[type]); |
| #endif |
| |
| switch(type) |
| { |
| case OP_ANY: return dotall || c != '\n'; |
| case OP_NOT_DIGIT: return (pcre_ctypes[c] & ctype_digit) == 0; |
| case OP_DIGIT: return (pcre_ctypes[c] & ctype_digit) != 0; |
| case OP_NOT_WHITESPACE: return (pcre_ctypes[c] & ctype_space) == 0; |
| case OP_WHITESPACE: return (pcre_ctypes[c] & ctype_space) != 0; |
| case OP_NOT_WORDCHAR: return (pcre_ctypes[c] & ctype_word) == 0; |
| case OP_WORDCHAR: return (pcre_ctypes[c] & ctype_word) != 0; |
| } |
| return FALSE; |
| } |
| |
| |
| /************************************************* |
| * Match a character class * |
| *************************************************/ |
| |
| /* Return "result" if char is in the class and "!result" otherwise. |
| |
| Arguments: |
| data points to the class item |
| c the subject character |
| result value to return if in class |
| md matching "static" data |
| |
| Returns: result or !result |
| */ |
| |
| static BOOL |
| match_class(register uschar *data, register int c, BOOL result, match_data *md) |
| { |
| int flags = data[1]; |
| int i; |
| uschar *base = data; |
| uschar *end; |
| |
| #ifdef DEBUG |
| { |
| uschar *d = base + 3; |
| |
| if (isprint(c)) |
| printf("match %c against [%s", c, result? "" : "^"); |
| else |
| printf("match \\x%02x against [%s", c, result? "" : "^"); |
| |
| for (i = 0; i < 8; i++) |
| if ((flags & (1 << i)) != 0) printf("%s", class_names[i]); |
| |
| for (i = 0; i < data[2]; i++) |
| { |
| if (isprint(*(++d))) printf("%c-", *d); else printf("\\x%02x-", *d); |
| if (isprint(*(++d))) printf("%c", *d); else printf("\\x%02x", *d); |
| } |
| |
| for (i = 0; i < data[3]; i++) |
| { |
| if (!isprint(*(++d))) printf("\\x%02x", *d); |
| else if (strchr("-\\]", *d) != NULL) printf("\\%c", *d); |
| else printf("%c", *d); |
| } |
| printf("]\n"); |
| } |
| #endif |
| |
| /* Test for any character types */ |
| |
| for (i = 0; flags != 0; i++) |
| { |
| if ((flags & 1) != 0 && match_type(class_ops[i], c, md->dotall)) |
| return result; |
| flags >>= 1; |
| } |
| |
| /* Advance pointer to the specific chars and do the caseless or caseful testing |
| of the ranges and individual characters as necessary. */ |
| |
| data += 4; |
| end = data + base[2] * 2; |
| |
| /* Caseless character ranges are slightly tricky, because of cases like [W-c]. |
| What we do is to uppercase the subject char if it is beyond the end of the |
| range, or lowercase it if it is before the start of the range and try again if |
| a caseful comparison has failed. This works because upper case letters come |
| before lower case in ASCII code. It would not work in EBCDIC, for example, |
| where they are the other way round, but then ranges like [W-c] would be illegal |
| in EBCDIC. */ |
| |
| if (md->caseless) |
| { |
| while (data < end) |
| { |
| register int d; |
| if (c >= (int)*data && c <= (int)data[1]) return result; |
| d = (c < (int)*data)? pcre_lcc[c] : pcre_ucc[c]; |
| if (d >= (int)*data && d <= (int)data[1]) return result; |
| data += 2; |
| } |
| end += base[3]; |
| c = pcre_lcc[c]; |
| while (data < end) if (c == pcre_lcc[*data++]) return result; |
| } |
| |
| /* Caseful is easy */ |
| |
| else |
| { |
| while (data < end) |
| { |
| if (c >= (int)*data && c <= (int)data[1]) return result; |
| data += 2; |
| } |
| end += base[3]; |
| while (data < end) if (c == *data++) return result; |
| } |
| |
| /* Character is not in the class */ |
| |
| return !result; |
| } |
| |
| |
| |
| /************************************************* |
| * Match a back-reference * |
| *************************************************/ |
| |
| /* If a back reference hasn't been set, the match fails. |
| |
| Arguments: |
| number reference number |
| eptr points into the subject |
| length length to be matched |
| md points to match data block |
| |
| Returns: TRUE if matched |
| */ |
| |
| static BOOL |
| match_ref(int number, register uschar *eptr, int length, match_data *md) |
| { |
| uschar *p = md->start_subject + md->offset_vector[number]; |
| |
| #ifdef DEBUG |
| if (eptr >= md->end_subject) |
| printf("matching subject <null>"); |
| else |
| { |
| printf("matching subject "); |
| pchars(eptr, length, TRUE, md); |
| } |
| printf(" against backref "); |
| pchars(p, length, FALSE, md); |
| printf("\n"); |
| #endif |
| |
| /* Always fail if not enough characters left */ |
| |
| if (length > md->end_subject - p) return FALSE; |
| |
| /* Separate the caselesss case for speed */ |
| |
| if (md->caseless) |
| { while (length-- > 0) if (pcre_lcc[*p++] != pcre_lcc[*eptr++]) return FALSE; } |
| else |
| { while (length-- > 0) if (*p++ != *eptr++) return FALSE; } |
| |
| return TRUE; |
| } |
| |
| static int free_stack(match_data *md) |
| { |
| /* Free any stack space that was allocated by the call to match(). */ |
| if (md->off_num) free(md->off_num); |
| if (md->offset_top) free(md->offset_top); |
| if (md->r1) free(md->r1); |
| if (md->r2) free(md->r2); |
| if (md->eptr) free(md->eptr); |
| if (md->ecode) free(md->ecode); |
| return 0; |
| } |
| |
| static int grow_stack(match_data *md) |
| { |
| md->length = md->length ? md->length+md->length/2 : 200; |
| md->offset_top=realloc(md->offset_top, md->length*sizeof(int)); |
| md->eptr=realloc(md->eptr, md->length*sizeof(void *)); |
| md->ecode=realloc(md->ecode, md->length*sizeof(void *)); |
| md->off_num=realloc(md->off_num, md->length*sizeof(int)); |
| md->r1=realloc(md->r1, md->length*sizeof(int)); |
| md->r2=realloc(md->r2, md->length*sizeof(int)); |
| return 0; |
| } |
| |
| /************************************************* |
| * Match from current position * |
| *************************************************/ |
| |
| /* On entry ecode points to the first opcode, and eptr to the first character. |
| |
| Arguments: |
| eptr pointer in subject |
| ecode position in code |
| offset_top current top pointer |
| md pointer to "static" info for the match |
| |
| Returns: TRUE if matched |
| */ |
| |
| static BOOL |
| match(register uschar *eptr, register uschar *ecode, int offset_top, |
| match_data *md) |
| { |
| int save_stack_position = md->point; |
| match_loop: |
| |
| #define SUCCEED goto succeed |
| #define FAIL goto fail |
| |
| for (;;) |
| { |
| int min, max, ctype; |
| register int i; |
| register int c; |
| BOOL minimize = 0; |
| |
| /* Opening bracket. Check the alternative branches in turn, failing if none |
| match. We have to set the start offset if required and there is space |
| in the offset vector so that it is available for subsequent back references |
| if the bracket matches. However, if the bracket fails, we must put back the |
| previous value of both offsets in case they were set by a previous copy of |
| the same bracket. Don't worry about setting the flag for the error case here; |
| that is handled in the code for KET. */ |
| |
| if ((int)*ecode >= OP_BRA) |
| { |
| int number = (*ecode - OP_BRA) << 1; |
| int save_offset1 = 0, save_offset2 = 0; |
| |
| #ifdef DEBUG |
| printf("start bracket %d\n", number/2); |
| #endif |
| |
| if (number > 0 && number < md->offset_end) |
| { |
| save_offset1 = md->offset_vector[number]; |
| save_offset2 = md->offset_vector[number+1]; |
| md->offset_vector[number] = eptr - md->start_subject; |
| |
| #ifdef DEBUG |
| printf("saving %d %d\n", save_offset1, save_offset2); |
| #endif |
| } |
| |
| /* Recurse for all the alternatives. */ |
| |
| do |
| { |
| if (match(eptr, ecode+3, offset_top, md)) SUCCEED; |
| ecode += (ecode[1] << 8) + ecode[2]; |
| } |
| while (*ecode == OP_ALT); |
| |
| #ifdef DEBUG |
| printf("bracket %d failed\n", number/2); |
| #endif |
| |
| if (number > 0 && number < md->offset_end) |
| { |
| md->offset_vector[number] = save_offset1; |
| md->offset_vector[number+1] = save_offset2; |
| } |
| |
| FAIL; |
| } |
| |
| /* Other types of node can be handled by a switch */ |
| |
| switch(*ecode) |
| { |
| case OP_END: |
| md->end_match_ptr = eptr; /* Record where we ended */ |
| md->end_offset_top = offset_top; /* and how many extracts were taken */ |
| SUCCEED; |
| |
| /* Assertion brackets. Check the alternative branches in turn - the |
| matching won't pass the KET for an assertion. If any one branch matches, |
| the assertion is true. */ |
| |
| case OP_ASSERT: |
| do |
| { |
| if (match(eptr, ecode+3, offset_top, md)) break; |
| ecode += (ecode[1] << 8) + ecode[2]; |
| } |
| while (*ecode == OP_ALT); |
| if (*ecode == OP_KET) FAIL; |
| |
| /* Continue from after the assertion, updating the offsets high water |
| mark, since extracts may have been taken during the assertion. */ |
| |
| do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT); |
| ecode += 3; |
| offset_top = md->end_offset_top; |
| continue; |
| |
| /* Negative assertion: all branches must fail to match */ |
| |
| case OP_ASSERT_NOT: |
| do |
| { |
| if (match(eptr, ecode+3, offset_top, md)) FAIL; |
| ecode += (ecode[1] << 8) + ecode[2]; |
| } |
| while (*ecode == OP_ALT); |
| ecode += 3; |
| continue; |
| |
| /* An alternation is the end of a branch; scan along to find the end of the |
| bracketed group and go to there. */ |
| |
| case OP_ALT: |
| do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT); |
| break; |
| |
| /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating |
| that it may occur zero times. It may repeat infinitely, or not at all - |
| i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper |
| repeat limits are compiled as a number of copies, with the optional ones |
| preceded by BRAZERO or BRAMINZERO. */ |
| |
| case OP_BRAZERO: |
| { |
| uschar *next = ecode+1; |
| if (match(eptr, next, offset_top, md)) SUCCEED; |
| do next += (next[1] << 8) + next[2]; while (*next == OP_ALT); |
| ecode = next + 3; |
| } |
| break; |
| |
| case OP_BRAMINZERO: |
| { |
| uschar *next = ecode+1; |
| do next += (next[1] << 8) + next[2]; while (*next == OP_ALT); |
| if (match(eptr, next+3, offset_top, md)) SUCCEED; |
| ecode++; |
| } |
| break;; |
| |
| /* End of a group, repeated or non-repeating. If we are at the end of |
| an assertion "group", stop matching and SUCCEED, but record the |
| current high water mark for use by positive assertions. */ |
| |
| case OP_KET: |
| case OP_KETRMIN: |
| case OP_KETRMAX: |
| { |
| int number, start, end; |
| uschar *prev = ecode - (ecode[1] << 8) - ecode[2]; |
| |
| if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT) |
| { |
| md->end_offset_top = offset_top; |
| SUCCEED; |
| } |
| |
| /* In all other cases we have to check the group number back at the |
| start and if necessary complete handling an extraction by setting the |
| final offset and bumping the high water mark. */ |
| |
| number = (*prev - OP_BRA) << 1; |
| |
| #ifdef DEBUG |
| printf("end bracket %d\n", number/2); |
| #endif |
| |
| if (number > 0) |
| { |
| if (number >= md->offset_end) md->offset_overflow = TRUE; else |
| { |
| start=md->offset_vector[number] ; end =md->offset_vector[number+1]; |
| md->offset_vector[number+1] = eptr - md->start_subject; |
| if (offset_top <= number) offset_top = number + 2; |
| } |
| } |
| |
| /* For a non-repeating ket, just advance to the next node and continue at |
| this level. */ |
| |
| if (*ecode == OP_KET) |
| { |
| ecode += 3; |
| break; |
| } |
| |
| /* The repeating kets try the rest of the pattern or restart from the |
| preceding bracket, in the appropriate order. */ |
| |
| if (*ecode == OP_KETRMIN) |
| { |
| uschar *ptr; |
| if (match(eptr, ecode+3, offset_top, md)) goto succeed; |
| /* Handle alternation inside the BRA...KET; push the additional |
| alternatives onto the stack |
| XXX this tries the alternatives backwards! */ |
| ptr=prev; |
| do { |
| ptr += (ptr[1]<<8)+ ptr[2]; |
| if (*ptr==OP_ALT) |
| { |
| if (md->length == md->point) grow_stack(md); |
| md->offset_top[md->point] = offset_top; |
| md->eptr[md->point] = eptr; |
| md->ecode[md->point] = ptr+3; |
| md->r1[md->point] = 0; |
| md->r2[md->point] = 0; |
| md->off_num[md->point] = 0; |
| md->point++; |
| } |
| } while (*ptr==OP_ALT); |
| ecode=prev+3; goto match_loop; |
| } |
| else /* OP_KETRMAX */ |
| { |
| uschar *ptr; |
| int points_pushed=0; |
| |
| /* Push one failure point, that will resume matching at the code after |
| the KETRMAX opcode. */ |
| if (md->length == md->point) grow_stack(md); |
| md->offset_top[md->point] = offset_top; |
| md->eptr[md->point] = eptr; |
| md->ecode[md->point] = ecode+3; |
| md->r1[md->point] = md->offset_vector[number]; |
| md->r2[md->point] = md->offset_vector[number+1]; |
| md->off_num[md->point] = number; |
| md->point++; |
| |
| md->offset_vector[number] = eptr - md->start_subject; |
| /* Handle alternation inside the BRA...KET; push each of the |
| additional alternatives onto the stack |
| XXX this tries the alternatives backwards! */ |
| ptr=prev; |
| do { |
| ptr += (ptr[1]<<8)+ ptr[2]; |
| if (*ptr==OP_ALT) |
| { |
| if (md->length == md->point) grow_stack(md); |
| md->offset_top[md->point] = offset_top; |
| md->eptr[md->point] = eptr; |
| md->ecode[md->point] = ptr+3; |
| md->r1[md->point] = 0; |
| md->r2[md->point] = 0; |
| md->off_num[md->point] = 0; |
| md->point++; |
| points_pushed++; |
| } |
| } while (*ptr==OP_ALT); |
| /* Jump to the first (or only) alternative and resume trying to match */ |
| ecode=prev+3; goto match_loop; |
| } |
| } |
| FAIL; |
| |
| /* Start of subject, or after internal newline if multiline */ |
| |
| case OP_CIRC: |
| if (md->multiline) |
| { |
| if (eptr != md->start_subject && eptr[-1] != '\n') FAIL; |
| ecode++; |
| break; |
| } |
| /* ... else fall through */ |
| |
| /* Start of subject assertion */ |
| |
| case OP_SOD: |
| if (eptr != md->start_subject) FAIL; |
| ecode++; |
| break; |
| |
| /* End of subject, or before internal newline if multiline */ |
| |
| case OP_DOLL: |
| if (md->multiline) |
| { |
| if (eptr < md->end_subject && *eptr != '\n') FAIL; |
| ecode++; |
| break; |
| } |
| /* ... else fall through */ |
| |
| /* End of subject assertion */ |
| |
| case OP_EOD: |
| if (eptr < md->end_subject) FAIL; |
| ecode++; |
| break; |
| |
| /* Word boundary assertions */ |
| |
| case OP_NOT_WORD_BOUNDARY: |
| case OP_WORD_BOUNDARY: |
| { |
| BOOL prev_is_word = (eptr != md->start_subject) && |
| ((pcre_ctypes[eptr[-1]] & ctype_word) != 0); |
| BOOL cur_is_word = (eptr < md->end_subject) && |
| ((pcre_ctypes[*eptr] & ctype_word) != 0); |
| if ((*ecode++ == OP_WORD_BOUNDARY)? |
| cur_is_word == prev_is_word : cur_is_word != prev_is_word) |
| FAIL; |
| } |
| break; |
| |
| /* Match a single character type; inline for speed */ |
| |
| case OP_ANY: |
| if (!md->dotall && eptr < md->end_subject && *eptr == '\n') FAIL; |
| if (eptr++ >= md->end_subject) FAIL; |
| ecode++; |
| break; |
| |
| case OP_NOT_DIGIT: |
| if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) != 0) |
| FAIL; |
| ecode++; |
| break; |
| |
| case OP_DIGIT: |
| if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) == 0) |
| FAIL; |
| ecode++; |
| break; |
| |
| case OP_NOT_WHITESPACE: |
| if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) != 0) |
| FAIL; |
| ecode++; |
| break; |
| |
| case OP_WHITESPACE: |
| if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) == 0) |
| FAIL; |
| ecode++; |
| break; |
| |
| case OP_NOT_WORDCHAR: |
| if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) != 0) |
| FAIL; |
| ecode++; |
| break; |
| |
| case OP_WORDCHAR: |
| if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) == 0) |
| FAIL; |
| ecode++; |
| break; |
| |
| /* Match a back reference, possibly repeatedly. Look past the end of the |
| item to see if there is repeat information following. The code is similar |
| to that for character classes, but repeated for efficiency. Then obey |
| similar code to character type repeats - written out again for speed. |
| However, if the referenced string is the empty string, always treat |
| it as matched, any number of times (otherwise there could be infinite |
| loops). */ |
| |
| case OP_REF: |
| { |
| int length; |
| int number = ecode[1] << 1; /* Doubled reference number */ |
| ecode += 2; /* Advance past the item */ |
| |
| if (number >= offset_top || md->offset_vector[number] < 0) |
| { |
| md->errorcode = PCRE_ERROR_BADREF; |
| FAIL; |
| } |
| |
| length = md->offset_vector[number+1] - md->offset_vector[number]; |
| |
| switch (*ecode) |
| { |
| case OP_CRSTAR: |
| case OP_CRMINSTAR: |
| case OP_CRPLUS: |
| case OP_CRMINPLUS: |
| case OP_CRQUERY: |
| case OP_CRMINQUERY: |
| c = *ecode++ - OP_CRSTAR; |
| minimize = (c & 1) != 0; |
| min = rep_min[c]; /* Pick up values from tables; */ |
| max = rep_max[c]; /* zero for max => infinity */ |
| if (max == 0) max = INT_MAX; |
| break; |
| |
| case OP_CRRANGE: |
| case OP_CRMINRANGE: |
| minimize = (*ecode == OP_CRMINRANGE); |
| min = (ecode[1] << 8) + ecode[2]; |
| max = (ecode[3] << 8) + ecode[4]; |
| if (max == 0) max = INT_MAX; |
| ecode += 5; |
| break; |
| |
| default: /* No repeat follows */ |
| if (!match_ref(number, eptr, length, md)) FAIL; |
| eptr += length; |
| continue; /* With the main loop */ |
| } |
| |
| /* If the length of the reference is zero, just continue with the |
| main loop. */ |
| |
| if (length == 0) continue; |
| |
| /* First, ensure the minimum number of matches are present. We get back |
| the length of the reference string explicitly rather than passing the |
| address of eptr, so that eptr can be a register variable. */ |
| |
| for (i = 1; i <= min; i++) |
| { |
| if (!match_ref(number, eptr, length, md)) FAIL; |
| eptr += length; |
| } |
| |
| /* If min = max, continue at the same level without recursion. |
| They are not both allowed to be zero. */ |
| |
| if (min == max) continue; |
| |
| /* If minimizing, keep trying and advancing the pointer */ |
| |
| if (minimize) |
| { |
| for (i = min;; i++) |
| { |
| if (match(eptr, ecode, offset_top, md)) SUCCEED; |
| if (i >= max || !match_ref(number, eptr, length, md)) |
| FAIL; |
| eptr += length; |
| } |
| /* Control never gets here */ |
| } |
| |
| /* If maximizing, find the longest string and work backwards */ |
| |
| else |
| { |
| uschar *pp = eptr; |
| for (i = min; i < max; i++) |
| { |
| if (!match_ref(number, eptr, length, md)) break; |
| eptr += length; |
| } |
| while (eptr >= pp) |
| { |
| if (match(eptr, ecode, offset_top, md)) SUCCEED; |
| eptr -= length; |
| } |
| FAIL; |
| } |
| } |
| /* Control never gets here */ |
| |
| /* Match a character class, possibly repeatedly. Look past the end of the |
| item to see if there is repeat information following. Then obey similar |
| code to character type repeats - written out again for speed. */ |
| |
| case OP_CLASS: |
| case OP_NEGCLASS: |
| { |
| BOOL result = *ecode == OP_CLASS; |
| uschar *data = ecode; /* Save for matching */ |
| |
| ecode += 4 + 2 * ecode[2] + ecode[3]; /* Advance past the item */ |
| |
| switch (*ecode) |
| { |
| case OP_CRSTAR: |
| case OP_CRMINSTAR: |
| case OP_CRPLUS: |
| case OP_CRMINPLUS: |
| case OP_CRQUERY: |
| case OP_CRMINQUERY: |
| c = *ecode++ - OP_CRSTAR; |
| minimize = (c & 1) != 0; |
| min = rep_min[c]; /* Pick up values from tables; */ |
| max = rep_max[c]; /* zero for max => infinity */ |
| if (max == 0) max = INT_MAX; |
| break; |
| |
| case OP_CRRANGE: |
| case OP_CRMINRANGE: |
| minimize = (*ecode == OP_CRMINRANGE); |
| min = (ecode[1] << 8) + ecode[2]; |
| max = (ecode[3] << 8) + ecode[4]; |
| if (max == 0) max = INT_MAX; |
| ecode += 5; |
| break; |
| |
| default: /* No repeat follows */ |
| if (eptr >= md->end_subject || !match_class(data, *eptr++, result, md)) |
| FAIL; |
| continue; /* With the main loop */ |
| } |
| |
| /* First, ensure the minimum number of matches are present. */ |
| |
| for (i = 1; i <= min; i++) |
| if (eptr >= md->end_subject || !match_class(data, *eptr++, result, md)) |
| FAIL; |
| |
| /* If max == min we can continue with the main loop without the |
| need to recurse. */ |
| |
| if (min == max) continue; |
| |
| /* If minimizing, keep testing the rest of the expression and advancing |
| the pointer while it matches the class. */ |
| |
| if (minimize) |
| { |
| for (i = min;; i++) |
| { |
| if (match(eptr, ecode, offset_top, md)) SUCCEED; |
| if (i >= max || eptr >= md->end_subject || |
| !match_class(data, *eptr++, result, md)) FAIL; |
| } |
| /* Control never gets here */ |
| } |
| |
| /* If maximizing, find the longest possible run, then work backwards. */ |
| |
| else |
| { |
| uschar *pp = eptr; |
| for (i = min; i < max; i++) |
| { |
| if (eptr >= md->end_subject || !match_class(data, *eptr, result, md)) |
| break; |
| eptr++; |
| } |
| while (eptr >= pp) |
| if (match(eptr--, ecode, offset_top, md)) SUCCEED; |
| FAIL; |
| } |
| } |
| /* Control never gets here */ |
| |
| /* Match a run of characters */ |
| |
| case OP_CHARS: |
| { |
| register int length = ecode[1]; |
| ecode += 2; |
| |
| #ifdef DEBUG |
| if (eptr >= md->end_subject) |
| printf("matching subject <null> against pattern "); |
| else |
| { |
| printf("matching subject "); |
| pchars(eptr, length, TRUE, md); |
| printf(" against pattern "); |
| } |
| pchars(ecode, length, FALSE, md); |
| printf("\n"); |
| #endif |
| |
| if (length > md->end_subject - eptr) FAIL; |
| if (md->caseless) |
| { |
| while (length-- > 0) if (pcre_lcc[*ecode++] != pcre_lcc[*eptr++]) FAIL; |
| } |
| else |
| { |
| while (length-- > 0) if (*ecode++ != *eptr++) FAIL; |
| } |
| } |
| break; |
| |
| /* Match a single character repeatedly; different opcodes share code. */ |
| |
| case OP_EXACT: |
| min = max = (ecode[1] << 8) + ecode[2]; |
| ecode += 3; |
| goto REPEATCHAR; |
| |
| case OP_UPTO: |
| case OP_MINUPTO: |
| min = 0; |
| max = (ecode[1] << 8) + ecode[2]; |
| minimize = *ecode == OP_MINUPTO; |
| ecode += 3; |
| goto REPEATCHAR; |
| |
| case OP_STAR: |
| case OP_MINSTAR: |
| case OP_PLUS: |
| case OP_MINPLUS: |
| case OP_QUERY: |
| case OP_MINQUERY: |
| c = *ecode++ - OP_STAR; |
| minimize = (c & 1) != 0; |
| min = rep_min[c]; /* Pick up values from tables; */ |
| max = rep_max[c]; /* zero for max => infinity */ |
| if (max == 0) max = INT_MAX; |
| |
| /* Common code for all repeated single-character matches. We can give |
| up quickly if there are fewer than the minimum number of characters left in |
| the subject. */ |
| |
| REPEATCHAR: |
| if (min > md->end_subject - eptr) FAIL; |
| c = *ecode++; |
| |
| /* The code is duplicated for the caseless and caseful cases, for speed, |
| since matching characters is likely to be quite common. First, ensure the |
| minimum number of matches are present. If min = max, continue at the same |
| level without recursing. Otherwise, if minimizing, keep trying the rest of |
| the expression and advancing one matching character if failing, up to the |
| maximum. Alternatively, if maximizing, find the maximum number of |
| characters and work backwards. */ |
| |
| #ifdef DEBUG |
| printf("matching %c{%d,%d} against subject %.*s\n", c, min, max, |
| max, eptr); |
| #endif |
| |
| if (md->caseless) |
| { |
| c = pcre_lcc[c]; |
| for (i = 1; i <= min; i++) if (c != pcre_lcc[*eptr++]) FAIL; |
| if (min == max) continue; |
| if (minimize) |
| { |
| for (i = min;; i++) |
| { |
| if (match(eptr, ecode, offset_top, md)) SUCCEED; |
| if (i >= max || eptr >= md->end_subject || c != pcre_lcc[*eptr++]) |
| FAIL; |
| } |
| /* Control never gets here */ |
| } |
| else |
| { |
| uschar *pp = eptr; |
| for (i = min; i < max; i++) |
| { |
| if (eptr >= md->end_subject || c != pcre_lcc[*eptr]) break; |
| eptr++; |
| } |
| while (eptr >= pp) |
| if (match(eptr--, ecode, offset_top, md)) SUCCEED; |
| FAIL; |
| } |
| } |
| |
| /* Caseful comparisons */ |
| |
| else |
| { |
| for (i = 1; i <= min; i++) if (c != *eptr++) FAIL; |
| if (min == max) continue; |
| if (minimize) |
| { |
| for (i = min;; i++) |
| { |
| if (match(eptr, ecode, offset_top, md)) SUCCEED; |
| if (i >= max || eptr >= md->end_subject || c != *eptr++) FAIL; |
| } |
| /* Control never gets here */ |
| } |
| else |
| { |
| uschar *pp = eptr; |
| for (i = min; i < max; i++) |
| { |
| if (eptr >= md->end_subject || c != *eptr) break; |
| eptr++; |
| } |
| while (eptr >= pp) |
| if (match(eptr--, ecode, offset_top, md)) SUCCEED; |
| FAIL; |
| } |
| } |
| /* Control never gets here */ |
| |
| /* Match a single character type repeatedly; several different opcodes |
| share code. This is very similar to the code for single characters, but we |
| repeat it in the interests of efficiency. */ |
| |
| case OP_TYPEEXACT: |
| min = max = (ecode[1] << 8) + ecode[2]; |
| minimize = TRUE; |
| ecode += 3; |
| goto REPEATTYPE; |
| |
| case OP_TYPEUPTO: |
| case OP_TYPEMINUPTO: |
| min = 0; |
| max = (ecode[1] << 8) + ecode[2]; |
| minimize = *ecode == OP_TYPEMINUPTO; |
| ecode += 3; |
| goto REPEATTYPE; |
| |
| case OP_TYPESTAR: |
| case OP_TYPEMINSTAR: |
| case OP_TYPEPLUS: |
| case OP_TYPEMINPLUS: |
| case OP_TYPEQUERY: |
| case OP_TYPEMINQUERY: |
| c = *ecode++ - OP_TYPESTAR; |
| minimize = (c & 1) != 0; |
| min = rep_min[c]; /* Pick up values from tables; */ |
| max = rep_max[c]; /* zero for max => infinity */ |
| if (max == 0) max = INT_MAX; |
| |
| /* Common code for all repeated single character type matches */ |
| |
| REPEATTYPE: |
| ctype = *ecode++; /* Code for the character type */ |
| |
| /* First, ensure the minimum number of matches are present. Use inline |
| code for maximizing the speed, and do the type test once at the start |
| (i.e. keep it out of the loop). Also test that there are at least the |
| minimum number of characters before we start. */ |
| |
| if (min > md->end_subject - eptr) FAIL; |
| if (min > 0) switch(ctype) |
| { |
| case OP_ANY: |
| if (!md->dotall) |
| { for (i = 1; i <= min; i++) if (*eptr++ == '\n') FAIL; } |
| else eptr += min; |
| break; |
| |
| case OP_NOT_DIGIT: |
| for (i = 1; i <= min; i++) |
| if ((pcre_ctypes[*eptr++] & ctype_digit) != 0) FAIL; |
| break; |
| |
| case OP_DIGIT: |
| for (i = 1; i <= min; i++) |
| if ((pcre_ctypes[*eptr++] & ctype_digit) == 0) FAIL; |
| break; |
| |
| case OP_NOT_WHITESPACE: |
| for (i = 1; i <= min; i++) |
| if ((pcre_ctypes[*eptr++] & ctype_space) != 0) FAIL; |
| break; |
| |
| case OP_WHITESPACE: |
| for (i = 1; i <= min; i++) |
| if ((pcre_ctypes[*eptr++] & ctype_space) == 0) FAIL; |
| break; |
| |
| case OP_NOT_WORDCHAR: |
| for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) != 0) |
| FAIL; |
| break; |
| |
| case OP_WORDCHAR: |
| for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) == 0) |
| FAIL; |
| break; |
| } |
| |
| /* If min = max, continue at the same level without recursing */ |
| |
| if (min == max) continue; |
| |
| /* If minimizing, we have to test the rest of the pattern before each |
| subsequent match, so inlining isn't much help; just use the function. */ |
| |
| if (minimize) |
| { |
| for (i = min;; i++) |
| { |
| if (match(eptr, ecode, offset_top, md)) SUCCEED; |
| if (i >= max || eptr >= md->end_subject || |
| !match_type(ctype, *eptr++, md->dotall)) |
| FAIL; |
| } |
| /* Control never gets here */ |
| } |
| |
| /* If maximizing it is worth using inline code for speed, doing the type |
| test once at the start (i.e. keep it out of the loop). */ |
| |
| else |
| { |
| uschar *pp = eptr; |
| switch(ctype) |
| { |
| case OP_ANY: |
| if (!md->dotall) |
| { |
| for (i = min; i < max; i++) |
| { |
| if (eptr >= md->end_subject || *eptr == '\n') break; |
| eptr++; |
| } |
| } |
| else |
| { |
| c = max - min; |
| if (c > md->end_subject - eptr) c = md->end_subject - eptr; |
| eptr += c; |
| } |
| break; |
| |
| case OP_NOT_DIGIT: |
| for (i = min; i < max; i++) |
| { |
| if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) != 0) |
| break; |
| eptr++; |
| } |
| break; |
| |
| case OP_DIGIT: |
| for (i = min; i < max; i++) |
| { |
| if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) == 0) |
| break; |
| eptr++; |
| } |
| break; |
| |
| case OP_NOT_WHITESPACE: |
| for (i = min; i < max; i++) |
| { |
| if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) != 0) |
| break; |
| eptr++; |
| } |
| break; |
| |
| case OP_WHITESPACE: |
| for (i = min; i < max; i++) |
| { |
| if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) == 0) |
| break; |
| eptr++; |
| } |
| break; |
| |
| case OP_NOT_WORDCHAR: |
| for (i = min; i < max; i++) |
| { |
| if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) != 0) |
| break; |
| eptr++; |
| } |
| break; |
| |
| case OP_WORDCHAR: |
| for (i = min; i < max; i++) |
| { |
| if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) == 0) |
| break; |
| eptr++; |
| } |
| break; |
| } |
| |
| while (eptr >= pp) |
| if (match(eptr--, ecode, offset_top, md)) SUCCEED; |
| FAIL; |
| } |
| /* Control never gets here */ |
| |
| /* There's been some horrible disaster. */ |
| |
| default: |
| #ifdef DEBUG |
| printf("Unknown opcode %d\n", *ecode); |
| #endif |
| md->errorcode = PCRE_ERROR_UNKNOWN_NODE; |
| FAIL; |
| } |
| |
| /* Do not stick any code in here without much thought; it is assumed |
| that "continue" in the code above comes out to here to repeat the main |
| loop. */ |
| |
| } /* End of main loop */ |
| /* Control never reaches here */ |
| |
| fail: |
| if (md->point > save_stack_position) |
| { |
| /* If there are still points remaining on the stack, pop the next one off */ |
| int off_num; |
| |
| md->point--; |
| offset_top = md->offset_top[md->point]; |
| eptr = md->eptr[md->point]; |
| ecode = md->ecode[md->point]; |
| off_num = md->off_num[md->point]; |
| md->offset_vector[off_num] = md->r1[md->point]; |
| md->offset_vector[off_num+1] = md->r2[md->point]; |
| goto match_loop; |
| } |
| /* Failure, and nothing left on the stack, so end this function call */ |
| |
| /* Restore the top of the stack to where it was before this function |
| call. This lets us use one stack for everything; recursive calls |
| can push and pop information, and may increase the stack. When |
| the call returns, the parent function can resume pushing and |
| popping wherever it was. */ |
| |
| md->point = save_stack_position; |
| return FALSE; |
| |
| succeed: |
| return TRUE; |
| } |
| |
| |
| /************************************************* |
| * Execute a Regular Expression * |
| *************************************************/ |
| |
| /* This function applies a compiled re to a subject string and picks out |
| portions of the string if it matches. Two elements in the vector are set for |
| each substring: the offsets to the start and end of the substring. |
| |
| Arguments: |
| re points to the compiled expression |
| extra points to "hints" from pcre_study() or is NULL |
| subject points to the subject string |
| length length of subject string (may contain binary zeros) |
| options option bits |
| offsets points to a vector of ints to be filled in with offsets |
| offsetcount the number of elements in the vector |
| |
| Returns: > 0 => success; value is the number of elements filled in |
| = 0 => success, but offsets is not big enough |
| -1 => failed to match |
| < -1 => some kind of unexpected problem |
| */ |
| |
| int |
| pcre_exec(pcre *external_re, pcre_extra *external_extra, char *subject, |
| int length, int options, int *offsets, int offsetcount) |
| { |
| int resetcount; |
| int first_char = -1; |
| match_data match_block; |
| uschar *start_bits = NULL; |
| uschar *start_match = (uschar *)subject; |
| uschar *end_subject; |
| real_pcre *re = (real_pcre *)external_re; |
| real_pcre_extra *extra = (real_pcre_extra *)external_extra; |
| BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0; |
| BOOL startline = (re->options & PCRE_STARTLINE) != 0; |
| |
| if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION; |
| |
| if (re == NULL || subject == NULL || |
| (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL; |
| if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC; |
| |
| match_block.start_subject = (uschar *)subject; |
| match_block.end_subject = match_block.start_subject + length; |
| end_subject = match_block.end_subject; |
| |
| match_block.caseless = ((re->options | options) & PCRE_CASELESS) != 0; |
| match_block.multiline = ((re->options |options) & PCRE_MULTILINE) != 0; |
| match_block.dotall = ((re->options |options) & PCRE_DOTALL) != 0; |
| |
| match_block.offset_vector = offsets; /* Where offsets go */ |
| match_block.offset_end = (offsetcount & (-2)); /* Past max permitted (even) */ |
| match_block.offset_overflow = FALSE; |
| |
| match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */ |
| |
| /* Set the stack state to empty */ |
| match_block.off_num = match_block.offset_top = NULL; |
| match_block.r1 = match_block.r2 = NULL; |
| match_block.eptr = match_block.ecode = NULL; |
| match_block.point = match_block.length = 0; |
| |
| /* Compute the minimum number of offsets that we need to reset each time. Doing |
| this makes a huge difference to execution time when there aren't many brackets |
| in the pattern. */ |
| |
| resetcount = 2 + re->top_bracket * 2; |
| if (resetcount > offsetcount) resetcount = offsetcount; |
| |
| /* If MULTILINE is set at exec time but was not set at compile time, and the |
| anchored flag is set, we must re-check because a setting provoked by ^ in the |
| pattern is not right in multi-line mode. Calling is_anchored() again here does |
| the right check, because multiline is now set. If it now yields FALSE, the |
| expression must have had ^ starting some of its branches. Check to see if |
| that is true for *all* branches, and if so, set the startline flag. */ |
| |
| if (match_block. multiline && anchored && (re->options & PCRE_MULTILINE) == 0 && |
| !is_anchored(re->code, match_block.multiline)) |
| { |
| anchored = FALSE; |
| if (is_startline(re->code)) startline = TRUE; |
| } |
| |
| /* Set up the first character to match, if available. The first_char value is |
| never set for an anchored regular expression, but the anchoring may be forced |
| at run time, so we have to test for anchoring. The first char may be unset for |
| an unanchored pattern, of course. If there's no first char and the pattern was |
| studied, the may be a bitmap of possible first characters. However, we can |
| use this only if the caseless state of the studying was correct. */ |
| |
| if (!anchored) |
| { |
| if ((re->options & PCRE_FIRSTSET) != 0) |
| { |
| first_char = re->first_char; |
| if (match_block.caseless) first_char = pcre_lcc[first_char]; |
| } |
| else |
| if (!startline && extra != NULL && |
| (extra->options & PCRE_STUDY_MAPPED) != 0 && |
| ((extra->options & PCRE_STUDY_CASELESS) != 0) == match_block.caseless) |
| start_bits = extra->start_bits; |
| } |
| |
| /* Loop for unanchored matches; for anchored regexps the loop runs just once. */ |
| |
| do |
| { |
| register int *iptr = offsets; |
| register int *iend = offsets + resetcount; |
| |
| /* Reset the maximum number of extractions we might see. */ |
| |
| while (iptr < iend) *iptr++ = -1; |
| |
| /* Advance to a unique first char if possible */ |
| |
| if (first_char >= 0) |
| { |
| if (match_block.caseless) |
| while (start_match < end_subject && pcre_lcc[*start_match] != first_char) |
| start_match++; |
| else |
| while (start_match < end_subject && *start_match != first_char) |
| start_match++; |
| } |
| |
| /* Or to just after \n for a multiline match if possible */ |
| |
| else if (startline) |
| { |
| if (start_match > match_block.start_subject) |
| { |
| while (start_match < end_subject && start_match[-1] != '\n') |
| start_match++; |
| } |
| } |
| |
| /* Or to a non-unique first char */ |
| |
| else if (start_bits != NULL) |
| { |
| while (start_match < end_subject) |
| { |
| register int c = *start_match; |
| if ((start_bits[c/8] & (1<<(c%8))) == 0) start_match++; else break; |
| } |
| } |
| |
| #ifdef DEBUG |
| printf(">>>> Match against: "); |
| pchars(start_match, end_subject - start_match, TRUE, &match_block); |
| printf("\n"); |
| #endif |
| |
| /* When a match occurs, substrings will be set for all internal extractions; |
| we just need to set up the whole thing as substring 0 before returning. If |
| there were too many extractions, set the return code to zero. */ |
| |
| if (match(start_match, re->code, 2, &match_block)) |
| { |
| int rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2; |
| if (match_block.offset_end < 2) rc = 0; else |
| { |
| offsets[0] = start_match - match_block.start_subject; |
| offsets[1] = match_block.end_match_ptr - match_block.start_subject; |
| } |
| #ifdef DEBUG |
| printf(">>>> returning %d\n", rc); |
| #endif |
| free_stack(&match_block); |
| return rc; |
| } |
| } |
| while (!anchored && |
| match_block.errorcode == PCRE_ERROR_NOMATCH && |
| start_match++ < end_subject); |
| |
| #ifdef DEBUG |
| printf(">>>> returning %d\n", match_block.errorcode); |
| #endif |
| free_stack(&match_block); |
| return match_block.errorcode; |
| } |
| |
| /* End of pcre.c */ |