Guido van Rossum | 51b3aa3 | 1997-10-06 14:43:11 +0000 | [diff] [blame] | 1 | /************************************************* |
| 2 | * Perl-Compatible Regular Expressions * |
| 3 | *************************************************/ |
| 4 | |
| 5 | |
| 6 | #define PCRE_VERSION "0.95 23-Sep-1997" |
| 7 | |
| 8 | |
| 9 | /* This is a library of functions to support regular expressions whose syntax |
| 10 | and semantics are as close as possible to those of the Perl 5 language. See |
| 11 | the file Tech.Notes for some information on the internals. |
| 12 | |
| 13 | Written by: Philip Hazel <ph10@cam.ac.uk> |
| 14 | |
| 15 | Copyright (c) 1997 University of Cambridge |
| 16 | |
| 17 | ----------------------------------------------------------------------------- |
| 18 | Permission is granted to anyone to use this software for any purpose on any |
| 19 | computer system, and to redistribute it freely, subject to the following |
| 20 | restrictions: |
| 21 | |
| 22 | 1. This software is distributed in the hope that it will be useful, |
| 23 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 24 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
| 25 | |
| 26 | 2. The origin of this software must not be misrepresented, either by |
| 27 | explicit claim or by omission. |
| 28 | |
| 29 | 3. Altered versions must be plainly marked as such, and must not be |
| 30 | misrepresented as being the original software. |
| 31 | ----------------------------------------------------------------------------- |
| 32 | */ |
| 33 | |
| 34 | /* This header contains definitions that are shared between the different |
| 35 | modules, but which are not relevant to the outside. */ |
| 36 | |
| 37 | /* Standard C headers plus the external interface definition */ |
| 38 | |
| 39 | #include <ctype.h> |
| 40 | #include <limits.h> |
| 41 | #include <stdio.h> |
| 42 | #include <stdlib.h> |
| 43 | #include <string.h> |
| 44 | #include "pcre.h" |
| 45 | |
| 46 | /* Private option flags start at the most significant end of the byte. The |
| 47 | public options defined in pcre.h start at the least significant end. Make sure |
| 48 | they don't overlap! */ |
| 49 | |
| 50 | #define PCRE_FIRSTSET 0x80 /* first_char (see struct real_pcre) is set */ |
| 51 | #define PCRE_STARTLINE 0x40 /* start after \n for multiline */ |
| 52 | |
| 53 | /* Option bits for the "extra" block produced by pcre_study(); see the options field of struct real_pcre_extra. NOTE(review): the two values are not adjacent bits (0x01, then 0x20) - confirm that 0x20 is intended. */ |
| 54 | |
| 55 | #define PCRE_STUDY_CASELESS 0x01 /* study was caseless */ |
| 56 | #define PCRE_STUDY_MAPPED 0x20 /* a map of starting chars exists (presumably start_bits in real_pcre_extra - confirm) */ |
| 57 | |
| 58 | /* Masks for identifying the public options. PUBLIC_OPTIONS is everything permitted at compile time (the FOR_PYTHON build additionally permits PCRE_DOTALL); |
| 59 | PUBLIC_EXEC_OPTIONS and PUBLIC_STUDY_OPTIONS are the subsets permitted at run time and at study time respectively. */ |
| 60 | |
| 61 | #ifdef FOR_PYTHON |
| 62 | #define PUBLIC_OPTIONS \ |
| 63 | (PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE|PCRE_DOTALL) |
| 64 | #else |
| 65 | #define PUBLIC_OPTIONS \ |
| 66 | (PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE) |
| 67 | #endif /* FOR_PYTHON */ |
| 68 | #define PUBLIC_EXEC_OPTIONS (PCRE_CASELESS|PCRE_ANCHORED|PCRE_MULTILINE) |
| 69 | #define PUBLIC_STUDY_OPTIONS (PCRE_CASELESS) |
| 70 | |
| 71 | /* Magic number to provide a small check against being handed junk. Presumably this is the value held in the magic_number field of struct real_pcre - confirm against the compile and exec code. */ |
| 72 | |
| 73 | #define MAGIC_NUMBER 0x50435245 /* the ASCII codes for 'P', 'C', 'R', 'E' */ |
| 74 | |
| 75 | /* Miscellaneous definitions: an int-based boolean type and its two values. NOTE(review): FALSE and TRUE are defined here without #ifndef guards. */ |
| 76 | |
| 77 | typedef int BOOL; |
| 78 | |
| 79 | #define FALSE 0 |
| 80 | #define TRUE 1 |
| 81 | |
| 82 | /* Flags for character classes: one bit each, arranged as pairs of a character type and its negation. See also the class_ops table, which is defined elsewhere (it is not in this header). */ |
| 83 | |
| 84 | #define CLASS_DIGITS 0x01 |
| 85 | #define CLASS_NOT_DIGITS 0x02 |
| 86 | #define CLASS_WHITESPACE 0x04 |
| 87 | #define CLASS_NOT_WHITESPACE 0x08 |
| 88 | #define CLASS_WORD 0x10 |
| 89 | #define CLASS_NOT_WORD 0x20 |
| 90 | |
| 91 | /* These are escaped items that aren't just an encoding of a particular data |
| 92 | value such as \n. They must have non-zero values, as check_escape() returns |
| 93 | their negation. Also, they must appear in the same order as in the opcode |
| 94 | definitions below, up to ESC_Z: ESC_A (1) pairs with OP_SOD (1), and so on up to ESC_Z (10) and OP_EOD (10). The final one must be ESC_REF as subsequent |
| 95 | values are used for \1, \2, \3, etc. There is a test in the code for an escape |
| 96 | greater than ESC_b and less than ESC_Z to detect the types that may be |
| 97 | repeated. If any new escapes are put in-between that don't consume a character, |
| 98 | that code will have to change. */ |
| 99 | |
| 100 | enum { ESC_A = 1, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s, ESC_W, ESC_w, |
| 101 | ESC_Z, ESC_REF }; |
| 102 | |
| 103 | /* Opcode table: OP_BRA must be last, as all values >= it are used for brackets |
| 104 | that extract substrings. Starting from 1 (i.e. after OP_END), the values up to |
| 105 | OP_EOD must correspond in order to the list of escapes immediately above. */ |
| 106 | |
| 107 | enum { |
| 108 | OP_END, /* End of pattern */ |
| 109 | |
| 110 | /* Values corresponding to backslashed metacharacters */ |
| 111 | |
| 112 | OP_SOD, /* Start of data: \A */ |
| 113 | OP_NOT_WORD_BOUNDARY, /* \B */ |
| 114 | OP_WORD_BOUNDARY, /* \b */ |
| 115 | OP_NOT_DIGIT, /* \D */ |
| 116 | OP_DIGIT, /* \d */ |
| 117 | OP_NOT_WHITESPACE, /* \S */ |
| 118 | OP_WHITESPACE, /* \s */ |
| 119 | OP_NOT_WORDCHAR, /* \W */ |
| 120 | OP_WORDCHAR, /* \w */ |
| 121 | OP_EOD, /* End of data: \Z. This must always be the last |
| 122 | of the backslashed meta values. */ |
| 123 | |
| 124 | OP_CIRC, /* Start of line - varies with multiline switch */ |
| 125 | OP_DOLL, /* End of line - varies with multiline switch */ |
| 126 | OP_ANY, /* Match any character */ |
| 127 | OP_CHARS, /* Match string of characters */ |
| 128 | |
| 129 | OP_STAR, /* The maximizing and minimizing versions of */ |
| 130 | OP_MINSTAR, /* all these opcodes must come in pairs, with */ |
| 131 | OP_PLUS, /* the minimizing one second. */ |
| 132 | OP_MINPLUS, /* This first set applies to single characters */ |
| 133 | OP_QUERY, |
| 134 | OP_MINQUERY, |
| 135 | OP_UPTO, /* From 0 to n matches. */ |
| 136 | OP_MINUPTO, |
| 137 | OP_EXACT, /* Exactly n matches. */ |
| 138 | |
| 139 | OP_TYPESTAR, /* The maximizing and minimizing versions of */ |
| 140 | OP_TYPEMINSTAR, /* all these opcodes must come in pairs, with */ |
| 141 | OP_TYPEPLUS, /* the minimizing one second. These codes must */ |
| 142 | OP_TYPEMINPLUS, /* be in exactly the same order as those above. */ |
| 143 | OP_TYPEQUERY, /* This set applies to character types such as \d */ |
| 144 | OP_TYPEMINQUERY, |
| 145 | OP_TYPEUPTO, |
| 146 | OP_TYPEMINUPTO, |
| 147 | OP_TYPEEXACT, |
| 148 | |
| 149 | OP_CRSTAR, /* The maximizing and minimizing versions of */ |
| 150 | OP_CRMINSTAR, /* all these opcodes must come in pairs, with */ |
| 151 | OP_CRPLUS, /* the minimizing one second. These codes must */ |
| 152 | OP_CRMINPLUS, /* be in exactly the same order as those above. */ |
| 153 | OP_CRQUERY, /* These are for character classes and back refs */ |
| 154 | OP_CRMINQUERY, |
| 155 | OP_CRRANGE, /* These are different to the two sets above. */ |
| 156 | OP_CRMINRANGE, |
| 157 | |
| 158 | OP_CLASS, /* Match a character class */ |
| 159 | OP_NEGCLASS, /* Don't match a character class */ |
| 160 | OP_REF, /* Match a back reference */ |
| 161 | |
| 162 | OP_ALT, /* Start of alternation */ |
| 163 | OP_KET, /* End of group that doesn't have an unbounded repeat */ |
| 164 | OP_KETRMAX, /* These two must remain together and in this */ |
| 165 | OP_KETRMIN, /* order. They are for groups that repeat for ever. */ |
| 166 | |
| 167 | OP_ASSERT, /* Assertion group (presumably lookahead - confirm) */ |
| 168 | OP_ASSERT_NOT, /* Negated assertion group */ |
| 169 | |
| 170 | OP_BRAZERO, /* These two must remain together and in this */ |
| 171 | OP_BRAMINZERO, /* order. */ |
| 172 | |
| 173 | OP_BRA /* This and greater values are used for brackets that |
| 174 | extract substrings. */ |
| 175 | }; |
| 176 | |
| 177 | /* The highest extraction number. This is limited by the number of opcodes |
| 178 | left after OP_BRA, i.e. 255 - OP_BRA, because the code vector is made of unsigned chars. We actually set it somewhat lower. */ |
| 179 | |
| 180 | #define EXTRACT_MAX 99 |
| 181 | |
| 182 | /* All character handling must be done as unsigned characters. Otherwise there |
| 183 | are problems with top-bit-set characters and functions such as isspace(). |
| 184 | However, we leave the interface to the outside world as char *, because that |
| 185 | should make things easier for callers. We define a short type for unsigned char |
| 186 | to save lots of typing. I tried "uchar", but it causes problems on Digital |
| 187 | Unix, where it is defined in <sys/types.h>, so use "uschar" instead. */ |
| 188 | |
| 189 | typedef unsigned char uschar; |
| 190 | |
| 191 | /* The real format of the start of the pcre block; the actual code vector |
| 192 | runs on as long as necessary after the end. */ |
| 193 | |
| 194 | typedef struct real_pcre { |
| 195 | unsigned int magic_number; /* presumably MAGIC_NUMBER for a genuine block - confirm */ |
| 196 | unsigned char options; /* option bits; presumably the public options plus the private PCRE_FIRSTSET and PCRE_STARTLINE flags - confirm */ |
| 197 | unsigned char top_bracket; /* NOTE(review): looks like the highest extracting bracket number used - confirm */ |
| 198 | unsigned char first_char; /* presumably valid only when PCRE_FIRSTSET is set in options - confirm */ |
| 199 | unsigned char code[1]; /* first byte of the code vector, which runs on past the end of the struct */ |
| 200 | } real_pcre; |
| 201 | |
| 202 | /* The real format of the extra block returned by pcre_study(). */ |
| 203 | |
| 204 | typedef struct real_pcre_extra { |
| 205 | unsigned char options; /* PCRE_STUDY_* option bits */ |
| 206 | unsigned char start_bits[32]; /* 32 bytes = 256 bits; presumably one bit per possible starting character, meaningful when PCRE_STUDY_MAPPED is set - confirm */ |
| 207 | } real_pcre_extra; |
| 208 | |
| 209 | /* Global tables from pcre-chartables.c. The array sizes are not given here; presumably each is indexed by an unsigned character value - confirm. */ |
| 210 | |
| 211 | extern uschar pcre_lcc[]; /* presumably a lower-casing table - confirm */ |
| 212 | extern uschar pcre_ucc[]; /* presumably an upper-casing table - confirm */ |
| 213 | extern uschar pcre_ctypes[]; /* entries are built from the ctype_* bits */ |
| 214 | |
| 215 | /* Bit definitions for entries in pcre_ctypes[]. Bits 0x20 and 0x40 are not assigned here, and 0x10 is assigned only in the FOR_PYTHON build. */ |
| 216 | |
| 217 | #define ctype_space 0x01 |
| 218 | #define ctype_digit 0x02 |
| 219 | #define ctype_xdigit 0x04 |
| 220 | #define ctype_word 0x08 /* alphameric or '_' */ |
| 221 | #ifdef FOR_PYTHON |
| 222 | #define ctype_odigit 0x10 /* Octal digits */ |
| 223 | #endif /* FOR_PYTHON */ |
| 224 | #define ctype_meta 0x80 /* regexp meta char or zero (end pattern) */ |
| 225 | |
| 226 | /* End of pcre-internal.h */ |