blob: bec9197e6fdd279f9248512ac101a93f4e49e943 [file] [log] [blame]
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001
2/*************************************************
3* Perl-Compatible Regular Expressions *
4*************************************************/
5
6/* DO NOT EDIT THIS FILE! */
7
8/* This file is automatically written by the merge-files.py script
9included with the PCRE distribution for Python; it's produced from
10several C files, and code is removed in the process. If you want to
11modify the code or track down bugs, it will be much easier to work
12with the code in its original, multiple-file form. Don't edit this
13file by hand, or submit patches to it.
14
15The Python-specific PCRE distribution can be retrieved from
16 http://starship.skyport.net/crew/amk/regex/
17
Guido van Rossum58132c61997-12-17 00:24:13 +000018The unmodified original PCRE distribution is available at
19ftp://ftp.cus.cam.ac.uk/pub/software/programs/pcre/, and is originally
20written by: Philip Hazel <ph10@cam.ac.uk>
Guido van Rossum51b3aa31997-10-06 14:43:11 +000021
22Extensively modified by the Python String-SIG: <string-sig@python.org>
23Send bug reports to: <string-sig@python.org>
24(They'll figure out if it's a bug in PCRE or in the Python-specific
25changes.)
26
27 Copyright (c) 1997 University of Cambridge
28
29-----------------------------------------------------------------------------
30Permission is granted to anyone to use this software for any purpose on any
31computer system, and to redistribute it freely, subject to the following
32restrictions:
33
341. This software is distributed in the hope that it will be useful,
35 but WITHOUT ANY WARRANTY; without even the implied warranty of
36 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
37
382. The origin of this software must not be misrepresented, either by
39 explicit claim or by omission.
40
413. Altered versions must be plainly marked as such, and must not be
42 misrepresented as being the original software.
43-----------------------------------------------------------------------------
44*/
45
46
47#define FOR_PYTHON
Guido van Rossum58132c61997-12-17 00:24:13 +000048#include "pcre-int.h"
Guido van Rossum51b3aa31997-10-06 14:43:11 +000049#include "Python.h"
Guido van Rossum50700601997-12-08 17:15:20 +000050#include "mymalloc.h"
51#include <ctype.h>
Guido van Rossum51b3aa31997-10-06 14:43:11 +000052#include "graminit.h"
53
54/*************************************************
55* Perl-Compatible Regular Expressions *
56*************************************************/
57
58/* This file is automatically written by the makechartables auxiliary
59program. If you edit it by hand, you might like to edit the Makefile to
60prevent its ever being regenerated. */
61
62/* This table is a lower casing table. */
63
64unsigned char pcre_lcc[] = {
65 0, 1, 2, 3, 4, 5, 6, 7,
66 8, 9, 10, 11, 12, 13, 14, 15,
67 16, 17, 18, 19, 20, 21, 22, 23,
68 24, 25, 26, 27, 28, 29, 30, 31,
69 32, 33, 34, 35, 36, 37, 38, 39,
70 40, 41, 42, 43, 44, 45, 46, 47,
71 48, 49, 50, 51, 52, 53, 54, 55,
72 56, 57, 58, 59, 60, 61, 62, 63,
73 64, 97, 98, 99,100,101,102,103,
74 104,105,106,107,108,109,110,111,
75 112,113,114,115,116,117,118,119,
76 120,121,122, 91, 92, 93, 94, 95,
77 96, 97, 98, 99,100,101,102,103,
78 104,105,106,107,108,109,110,111,
79 112,113,114,115,116,117,118,119,
80 120,121,122,123,124,125,126,127,
81 128,129,130,131,132,133,134,135,
82 136,137,138,139,140,141,142,143,
83 144,145,146,147,148,149,150,151,
84 152,153,154,155,156,157,158,159,
85 160,161,162,163,164,165,166,167,
86 168,169,170,171,172,173,174,175,
87 176,177,178,179,180,181,182,183,
88 184,185,186,187,188,189,190,191,
89 192,193,194,195,196,197,198,199,
90 200,201,202,203,204,205,206,207,
91 208,209,210,211,212,213,214,215,
92 216,217,218,219,220,221,222,223,
93 224,225,226,227,228,229,230,231,
94 232,233,234,235,236,237,238,239,
95 240,241,242,243,244,245,246,247,
96 248,249,250,251,252,253,254,255 };
97
Guido van Rossum50700601997-12-08 17:15:20 +000098/* This table is a case flipping table. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +000099
Guido van Rossum50700601997-12-08 17:15:20 +0000100unsigned char pcre_fcc[] = {
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000101 0, 1, 2, 3, 4, 5, 6, 7,
102 8, 9, 10, 11, 12, 13, 14, 15,
103 16, 17, 18, 19, 20, 21, 22, 23,
104 24, 25, 26, 27, 28, 29, 30, 31,
105 32, 33, 34, 35, 36, 37, 38, 39,
106 40, 41, 42, 43, 44, 45, 46, 47,
107 48, 49, 50, 51, 52, 53, 54, 55,
108 56, 57, 58, 59, 60, 61, 62, 63,
Guido van Rossum50700601997-12-08 17:15:20 +0000109 64, 97, 98, 99,100,101,102,103,
110 104,105,106,107,108,109,110,111,
111 112,113,114,115,116,117,118,119,
112 120,121,122, 91, 92, 93, 94, 95,
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000113 96, 65, 66, 67, 68, 69, 70, 71,
114 72, 73, 74, 75, 76, 77, 78, 79,
115 80, 81, 82, 83, 84, 85, 86, 87,
116 88, 89, 90,123,124,125,126,127,
117 128,129,130,131,132,133,134,135,
118 136,137,138,139,140,141,142,143,
119 144,145,146,147,148,149,150,151,
120 152,153,154,155,156,157,158,159,
121 160,161,162,163,164,165,166,167,
122 168,169,170,171,172,173,174,175,
123 176,177,178,179,180,181,182,183,
124 184,185,186,187,188,189,190,191,
125 192,193,194,195,196,197,198,199,
126 200,201,202,203,204,205,206,207,
127 208,209,210,211,212,213,214,215,
128 216,217,218,219,220,221,222,223,
129 224,225,226,227,228,229,230,231,
130 232,233,234,235,236,237,238,239,
131 240,241,242,243,244,245,246,247,
132 248,249,250,251,252,253,254,255 };
133
Guido van Rossum50700601997-12-08 17:15:20 +0000134/* This table contains bit maps for digits, letters, 'word' chars, and
135white space. Each map is 32 bytes long and the bits run from the least
136significant end of each byte. */
137
138unsigned char pcre_cbits[] = {
139 0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
140 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
141 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
142 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
143
144 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
145 0xfe,0xff,0xff,0x07,0xfe,0xff,0xff,0x07,
146 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
147 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
148
149 0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
150 0xfe,0xff,0xff,0x87,0xfe,0xff,0xff,0x07,
151 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
152 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
153
154 0x00,0x3e,0x00,0x00,0x01,0x00,0x00,0x00,
155 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
156 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
157 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 };
158
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000159/* This table identifies various classes of character by individual bits:
Guido van Rossum50700601997-12-08 17:15:20 +0000160 0x01 white space character
161 0x02 letter
162 0x04 decimal digit
163 0x08 hexadecimal digit
164 0x10 alphanumeric or '_'
165 0x80 regular expression metacharacter or binary zero
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000166*/
167
168unsigned char pcre_ctypes[] = {
169 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
170 0x00,0x01,0x01,0x01,0x01,0x01,0x00,0x00, /* 8- 15 */
171 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
172 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
173 0x01,0x00,0x00,0x00,0x80,0x00,0x00,0x00, /* - ' */
174 0x80,0x80,0x80,0x80,0x00,0x00,0x80,0x00, /* ( - / */
Guido van Rossum50700601997-12-08 17:15:20 +0000175 0x3c,0x3c,0x3c,0x3c,0x3c,0x3c,0x3c,0x3c, /* 0 - 7 */
176 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x80, /* 8 - ? */
177 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* @ - G */
178 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* H - O */
179 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* P - W */
180 0x12,0x12,0x12,0x80,0x00,0x00,0x80,0x10, /* X - _ */
181 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* ` - g */
182 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* h - o */
183 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* p - w */
184 0x12,0x12,0x12,0x80,0x80,0x00,0x00,0x00, /* x -127 */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000185 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
186 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
187 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
188 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
189 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
190 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
191 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
192 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
193 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
194 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
195 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
196 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
197 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
198 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
199 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
200 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
201
Guido van Rossum50700601997-12-08 17:15:20 +0000202/* End of chartables.c */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000203/*************************************************
204* Perl-Compatible Regular Expressions *
205*************************************************/
206
207/*
208This is a library of functions to support regular expressions whose syntax
209and semantics are as close as possible to those of the Perl 5 language. See
210the file Tech.Notes for some information on the internals.
211
212Written by: Philip Hazel <ph10@cam.ac.uk>
213
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000214 Copyright (c) 1998 University of Cambridge
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000215
216-----------------------------------------------------------------------------
217Permission is granted to anyone to use this software for any purpose on any
218computer system, and to redistribute it freely, subject to the following
219restrictions:
220
2211. This software is distributed in the hope that it will be useful,
222 but WITHOUT ANY WARRANTY; without even the implied warranty of
223 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
224
2252. The origin of this software must not be misrepresented, either by
226 explicit claim or by omission.
227
2283. Altered versions must be plainly marked as such, and must not be
229 misrepresented as being the original software.
230-----------------------------------------------------------------------------
231*/
232
233
234/* Include the internals header, which itself includes Standard C headers plus
235the external pcre header. */
236
237
238
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000239
240/*************************************************
241* Create bitmap of starting chars *
242*************************************************/
243
244/* This function scans a compiled unanchored expression and attempts to build a
245bitmap of the set of initial characters. If it can't, it returns FALSE. As time
246goes by, we may be able to get more clever at doing this.
247
248Arguments:
249 code points to an expression
250 start_bits points to a 32-byte table, initialized to 0
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000251
252Returns: TRUE if table built, FALSE otherwise
253*/
254
255static BOOL
Guido van Rossum58132c61997-12-17 00:24:13 +0000256set_start_bits(const uschar *code, uschar *start_bits)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000257{
Guido van Rossum50700601997-12-08 17:15:20 +0000258register int c;
259
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000260do
261 {
Guido van Rossum58132c61997-12-17 00:24:13 +0000262 const uschar *tcode = code + 3;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000263 BOOL try_next = TRUE;
264
265 while (try_next)
266 {
267 try_next = FALSE;
268
269 if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
270 {
Guido van Rossum50700601997-12-08 17:15:20 +0000271 if (!set_start_bits(tcode, start_bits)) return FALSE;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000272 }
273
274 else switch(*tcode)
275 {
276 default:
277 return FALSE;
278
279 /* BRAZERO does the bracket, but carries on. */
280
281 case OP_BRAZERO:
282 case OP_BRAMINZERO:
Guido van Rossum50700601997-12-08 17:15:20 +0000283 if (!set_start_bits(++tcode, start_bits)) return FALSE;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000284 do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
285 tcode += 3;
286 try_next = TRUE;
287 break;
288
289 /* Single-char * or ? sets the bit and tries the next item */
290
291 case OP_STAR:
292 case OP_MINSTAR:
293 case OP_QUERY:
294 case OP_MINQUERY:
Guido van Rossum50700601997-12-08 17:15:20 +0000295 start_bits[tcode[1]/8] |= (1 << (tcode[1]&7));
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000296 tcode += 2;
297 try_next = TRUE;
298 break;
299
300 /* Single-char upto sets the bit and tries the next */
301
302 case OP_UPTO:
303 case OP_MINUPTO:
Guido van Rossum50700601997-12-08 17:15:20 +0000304 start_bits[tcode[3]/8] |= (1 << (tcode[3]&7));
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000305 tcode += 4;
306 try_next = TRUE;
307 break;
308
309 /* At least one single char sets the bit and stops */
310
311 case OP_EXACT: /* Fall through */
312 tcode++;
313
314 case OP_CHARS: /* Fall through */
315 tcode++;
316
317 case OP_PLUS:
318 case OP_MINPLUS:
Guido van Rossum50700601997-12-08 17:15:20 +0000319 start_bits[tcode[1]/8] |= (1 << (tcode[1]&7));
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000320 break;
321
322 /* Single character type sets the bits and stops */
323
324 case OP_NOT_DIGIT:
Guido van Rossum50700601997-12-08 17:15:20 +0000325 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_digit];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000326 break;
327
328 case OP_DIGIT:
Guido van Rossum50700601997-12-08 17:15:20 +0000329 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_digit];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000330 break;
331
332 case OP_NOT_WHITESPACE:
Guido van Rossum50700601997-12-08 17:15:20 +0000333 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_space];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000334 break;
335
336 case OP_WHITESPACE:
Guido van Rossum50700601997-12-08 17:15:20 +0000337 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_space];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000338 break;
339
340 case OP_NOT_WORDCHAR:
Guido van Rossum50700601997-12-08 17:15:20 +0000341 for (c = 0; c < 32; c++)
342 start_bits[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000343 break;
344
345 case OP_WORDCHAR:
Guido van Rossum50700601997-12-08 17:15:20 +0000346 for (c = 0; c < 32; c++)
347 start_bits[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000348 break;
349
350 /* One or more character type fudges the pointer and restarts, knowing
351 it will hit a single character type and stop there. */
352
353 case OP_TYPEPLUS:
354 case OP_TYPEMINPLUS:
355 tcode++;
356 try_next = TRUE;
357 break;
358
359 case OP_TYPEEXACT:
360 tcode += 3;
361 try_next = TRUE;
362 break;
363
364 /* Zero or more repeats of character types set the bits and then
365 try again. */
366
367 case OP_TYPEUPTO:
368 case OP_TYPEMINUPTO:
369 tcode += 2; /* Fall through */
370
371 case OP_TYPESTAR:
372 case OP_TYPEMINSTAR:
373 case OP_TYPEQUERY:
374 case OP_TYPEMINQUERY:
375 switch(tcode[1])
376 {
377 case OP_NOT_DIGIT:
Guido van Rossum50700601997-12-08 17:15:20 +0000378 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_digit];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000379 break;
380
381 case OP_DIGIT:
Guido van Rossum50700601997-12-08 17:15:20 +0000382 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_digit];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000383 break;
384
385 case OP_NOT_WHITESPACE:
Guido van Rossum50700601997-12-08 17:15:20 +0000386 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_space];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000387 break;
388
389 case OP_WHITESPACE:
Guido van Rossum50700601997-12-08 17:15:20 +0000390 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_space];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000391 break;
392
393 case OP_NOT_WORDCHAR:
Guido van Rossum50700601997-12-08 17:15:20 +0000394 for (c = 0; c < 32; c++)
395 start_bits[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000396 break;
397
398 case OP_WORDCHAR:
Guido van Rossum50700601997-12-08 17:15:20 +0000399 for (c = 0; c < 32; c++)
400 start_bits[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000401 break;
402 }
403
404 tcode += 2;
405 try_next = TRUE;
406 break;
407
408 /* Character class: set the bits and either carry on or not,
409 according to the repeat count. */
410
411 case OP_CLASS:
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000412 case OP_NEGCLASS:
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000413 {
Guido van Rossum50700601997-12-08 17:15:20 +0000414 tcode++;
415 for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
416 tcode += 32;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000417 switch (*tcode)
418 {
419 case OP_CRSTAR:
420 case OP_CRMINSTAR:
421 case OP_CRQUERY:
422 case OP_CRMINQUERY:
423 tcode++;
424 try_next = TRUE;
425 break;
426
427 case OP_CRRANGE:
428 case OP_CRMINRANGE:
429 if (((tcode[1] << 8) + tcode[2]) == 0)
430 {
431 tcode += 5;
432 try_next = TRUE;
433 }
434 break;
435 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000436 }
437 break; /* End of class handling */
438
439 } /* End of switch */
440 } /* End of try_next loop */
441
442 code += (code[1] << 8) + code[2]; /* Advance to next branch */
443 }
444while (*code == OP_ALT);
445return TRUE;
446}
447
448
449
450/*************************************************
451* Study a compiled expression *
452*************************************************/
453
454/* This function is handed a compiled expression that it must study to produce
455information that will speed up the matching. It returns a pcre_extra block
456which then gets handed back to pcre_exec().
457
458Arguments:
459 re points to the compiled expression
460 options contains option bits
461 errorptr points to where to place error messages;
462 set NULL unless error
463
464Returns: pointer to a pcre_extra block,
465 NULL on error or if no optimization possible
466*/
467
468pcre_extra *
Guido van Rossum58132c61997-12-17 00:24:13 +0000469pcre_study(const pcre *external_re, int options, const char **errorptr)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000470{
471BOOL caseless;
472uschar start_bits[32];
473real_pcre_extra *extra;
Guido van Rossum58132c61997-12-17 00:24:13 +0000474const real_pcre *re = (const real_pcre *)external_re;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000475
476*errorptr = NULL;
477
478if (re == NULL || re->magic_number != MAGIC_NUMBER)
479 {
480 *errorptr = "argument is not a compiled regular expression";
481 return NULL;
482 }
483
484if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
485 {
486 *errorptr = "unknown or incorrect option bit(s) set";
487 return NULL;
488 }
489
Guido van Rossum50700601997-12-08 17:15:20 +0000490/* Caseless can either be from the compiled regex or from options. */
491
492caseless = ((re->options | options) & PCRE_CASELESS) != 0;
493
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000494/* For an anchored pattern, or an unchored pattern that has a first char, or a
495multiline pattern that matches only at "line starts", no further processing at
496present. */
497
498if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
499 return NULL;
500
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000501/* See if we can find a fixed set of initial characters for the pattern. */
502
Guido van Rossum50700601997-12-08 17:15:20 +0000503memset(start_bits, 0, 32 * sizeof(uschar));
504if (!set_start_bits(re->code, start_bits)) return NULL;
505
506/* If this studying is caseless, scan the created bit map and duplicate the
507bits for any letters. */
508
509if (caseless)
510 {
511 register int c;
512 for (c = 0; c < 256; c++)
513 {
514 if ((start_bits[c/8] & (1 << (c&7))) != 0 &&
515 (pcre_ctypes[c] & ctype_letter) != 0)
516 {
517 int d = pcre_fcc[c];
518 start_bits[d/8] |= (1 << (d&7));
519 }
520 }
521 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000522
523/* Get an "extra" block and put the information therein. */
524
525extra = (real_pcre_extra *)(pcre_malloc)(sizeof(real_pcre_extra));
526
527if (extra == NULL)
528 {
529 *errorptr = "failed to get memory";
530 return NULL;
531 }
Guido van Rossum50700601997-12-08 17:15:20 +0000532
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000533extra->options = PCRE_STUDY_MAPPED | (caseless? PCRE_STUDY_CASELESS : 0);
Guido van Rossum50700601997-12-08 17:15:20 +0000534memcpy(extra->start_bits, start_bits, sizeof(start_bits));
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000535
536return (pcre_extra *)extra;
537}
538
Guido van Rossum50700601997-12-08 17:15:20 +0000539/* End of study.c */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000540/*************************************************
541* Perl-Compatible Regular Expressions *
542*************************************************/
543
544/*
545This is a library of functions to support regular expressions whose syntax
546and semantics are as close as possible to those of the Perl 5 language. See
547the file Tech.Notes for some information on the internals.
548
549Written by: Philip Hazel <ph10@cam.ac.uk>
550
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000551 Copyright (c) 1998 University of Cambridge
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000552
553-----------------------------------------------------------------------------
554Permission is granted to anyone to use this software for any purpose on any
555computer system, and to redistribute it freely, subject to the following
556restrictions:
557
5581. This software is distributed in the hope that it will be useful,
559 but WITHOUT ANY WARRANTY; without even the implied warranty of
560 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
561
5622. The origin of this software must not be misrepresented, either by
563 explicit claim or by omission.
564
5653. Altered versions must be plainly marked as such, and must not be
566 misrepresented as being the original software.
567-----------------------------------------------------------------------------
568*/
569
570
571/* Define DEBUG to get debugging output on stdout. */
572
573/* #define DEBUG */
574
Guido van Rossum557dea11997-12-22 22:46:52 +0000575/* Use a macro for debugging printing, 'cause that eliminates the the use
576of #ifdef inline, and there are *still* stupid compilers about that don't like
577indented pre-processor statements. I suppose it's only been 10 years... */
578
579#ifdef DEBUG
580#define DPRINTF(p) printf p
581#else
582#define DPRINTF(p) /*nothing*/
583#endif
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000584
585/* Include the internals header, which itself includes Standard C headers plus
586the external pcre header. */
587
588
Guido van Rossum50700601997-12-08 17:15:20 +0000589
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000590
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000591#ifndef Py_eval_input
592/* For Python 1.4, graminit.h has to be explicitly included */
593#define Py_eval_input eval_input
Guido van Rossum50700601997-12-08 17:15:20 +0000594
595#endif /* FOR_PYTHON */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000596
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000597/* Allow compilation as C++ source code, should anybody want to do that. */
598
599#ifdef __cplusplus
600#define class pcre_class
601#endif
602
603
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000604/* Min and max values for the common repeats; for the maxima, 0 => infinity */
605
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000606static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
607static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000608
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000609/* Text forms of OP_ values and things, for debugging (not all used) */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000610
611#ifdef DEBUG
Guido van Rossum58132c61997-12-17 00:24:13 +0000612static const char *OP_names[] = {
613 "End", "\\A", "\\B", "\\b", "\\D", "\\d",
Guido van Rossum50700601997-12-08 17:15:20 +0000614 "\\S", "\\s", "\\W", "\\w", "Cut", "\\Z",
615 "localized \\B", "localized \\b", "localized \\W", "localized \\w",
616 "^", "$", "Any", "chars",
617 "not",
618 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000619 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
620 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
621 "*", "*?", "+", "+?", "?", "??", "{", "{",
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000622 "class", "negclass", "classL", "Ref",
Guido van Rossum50700601997-12-08 17:15:20 +0000623 "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", "Once",
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000624 "Brazero", "Braminzero", "Bra"
625};
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000626#endif
627
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000628/* Table for handling escaped characters in the range '0'-'z'. Positive returns
629are simple data values; negative values are for special things like \d and so
630on. Zero means further processing is needed (for things like \x), or the escape
631is invalid. */
632
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000633static const short int escapes[] = {
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000634 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
635 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
636 '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
637 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
638 0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
639 0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
640 '`', 7, -ESC_b, 0, -ESC_d, 0, '\f', 0, /* ` - g */
641 0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
642 0, 0, '\r', -ESC_s, '\t', 0, '\v', -ESC_w, /* p - w */
643 0, 0, 0 /* x - z */
644};
645
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000646/* Definition to allow mutual recursion */
647
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000648static BOOL
649compile_regex(int, int *, uschar **, const uschar **, const char **,
650 PyObject *);
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000651
652/* Structure for passing "static" information around between the functions
653doing the matching, so that they are thread-safe. */
654
655typedef struct match_data {
656 int errorcode; /* As it says */
657 int *offset_vector; /* Offset vector */
658 int offset_end; /* One past the end */
659 BOOL offset_overflow; /* Set if too many extractions */
660 BOOL caseless; /* Case-independent flag */
Guido van Rossum50700601997-12-08 17:15:20 +0000661 BOOL runtime_caseless; /* Caseless forced at run time */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000662 BOOL multiline; /* Multiline flag */
Guido van Rossum50700601997-12-08 17:15:20 +0000663 BOOL notbol; /* NOTBOL flag */
664 BOOL noteol; /* NOTEOL flag */
665 BOOL dotall; /* Dot matches any char */
666 BOOL endonly; /* Dollar not before final \n */
Guido van Rossum58132c61997-12-17 00:24:13 +0000667 const uschar *start_subject; /* Start of the subject string */
668 const uschar *end_subject; /* End of the subject string */
Guido van Rossum50700601997-12-08 17:15:20 +0000669 jmp_buf fail_env; /* Environment for longjump() break out */
Guido van Rossum58132c61997-12-17 00:24:13 +0000670 const uschar *end_match_ptr; /* Subject position at end match */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000671 int end_offset_top; /* Highwater mark at end of match */
Guido van Rossum50700601997-12-08 17:15:20 +0000672 jmp_buf error_env; /* For longjmp() if an error occurs deep inside a
673 matching operation */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000674 int length; /* Length of the allocated stacks */
675 int point; /* Point to add next item pushed onto stacks */
676 /* Pointers to the 6 stacks */
677 int *off_num, *offset_top, *r1, *r2;
Guido van Rossum58132c61997-12-17 00:24:13 +0000678 const uschar **eptr, **ecode;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000679} match_data;
680
681
682
683/*************************************************
Guido van Rossum50700601997-12-08 17:15:20 +0000684* Global variables *
685*************************************************/
686
687/* PCRE is thread-clean and doesn't use any global variables in the normal
688sense. However, it calls memory allocation and free functions via the two
689indirections below, which are can be changed by the caller, but are shared
690between all threads. */
691
692void *(*pcre_malloc)(size_t) = malloc;
693void (*pcre_free)(void *) = free;
694
695
696
697
698/*************************************************
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000699* Return version string *
700*************************************************/
701
Guido van Rossum58132c61997-12-17 00:24:13 +0000702const char *
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000703pcre_version(void)
704{
705return PCRE_VERSION;
706}
707
708
709
710
711/*************************************************
712* Return info about a compiled pattern *
713*************************************************/
714
715/* This function picks potentially useful data out of the private
716structure.
717
718Arguments:
719 external_re points to compiled code
720 optptr where to pass back the options
721 first_char where to pass back the first character,
722 or -1 if multiline and all branches start ^,
723 or -2 otherwise
724
725Returns: number of identifying extraction brackets
726 or negative values on error
727*/
728
729int
Guido van Rossum50700601997-12-08 17:15:20 +0000730pcre_info(const pcre *external_re, int *optptr, int *first_char)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000731{
Guido van Rossum58132c61997-12-17 00:24:13 +0000732const real_pcre *re = (real_pcre *)external_re;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000733if (re == NULL) return PCRE_ERROR_NULL;
734if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
735if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);
736if (first_char != NULL)
737 *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
738 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
739return re->top_bracket;
740}
741
742
743
744
745#ifdef DEBUG
746/*************************************************
747* Debugging function to print chars *
748*************************************************/
749
750/* Print a sequence of chars in printable format, stopping at the end of the
751subject if the requested.
752
753Arguments:
754 p points to characters
755 length number to print
756 is_subject TRUE if printing from within md->start_subject
757 md pointer to matching data block, if is_subject is TRUE
758
759Returns: nothing
760*/
761
Guido van Rossum557dea11997-12-22 22:46:52 +0000762static void
763pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000764{
765int c;
766if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
767while (length-- > 0)
768 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
769}
770#endif
771
772
773
774
775/*************************************************
776* Check subpattern for empty operand *
777*************************************************/
778
779/* This function checks a bracketed subpattern to see if any of the paths
780through it could match an empty string. This is used to diagnose an error if
781such a subpattern is followed by a quantifier with an unlimited upper bound.
782
783Argument:
784 code points to the opening bracket
785
786Returns: TRUE or FALSE
787*/
788
789static BOOL
790could_be_empty(uschar *code)
791{
792do {
793 uschar *cc = code + 3;
794
795 /* Scan along the opcodes for this branch; as soon as we find something
796 that matches a non-empty string, break out and advance to test the next
797 branch. If we get to the end of the branch, return TRUE for the whole
798 sub-expression. */
799
800 for (;;)
801 {
802 /* Test an embedded subpattern; if it could not be empty, break the
803 loop. Otherwise carry on in the branch. */
804
Guido van Rossum50700601997-12-08 17:15:20 +0000805 if ((int)(*cc) >= OP_BRA || (int)(*cc) == OP_ONCE)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000806 {
807 if (!could_be_empty(cc)) break;
808 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
809 cc += 3;
810 }
811
812 else switch (*cc)
813 {
814 /* Reached end of a branch: the subpattern may match the empty string */
815
816 case OP_ALT:
817 case OP_KET:
818 case OP_KETRMAX:
819 case OP_KETRMIN:
820 return TRUE;
821
Guido van Rossumd0f432b1998-02-20 21:45:14 +0000822 /* Skip over entire bracket groups with zero lower bound */
823
824 case OP_BRAZERO:
825 case OP_BRAMINZERO:
826 cc++;
827 /* Fall through */
828
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000829 /* Skip over assertive subpatterns */
830
831 case OP_ASSERT:
832 case OP_ASSERT_NOT:
833 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
834 cc += 3;
835 break;
836
837 /* Skip over things that don't match chars */
838
839 case OP_SOD:
840 case OP_EOD:
841 case OP_CIRC:
842 case OP_DOLL:
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000843 case OP_NOT_WORD_BOUNDARY:
844 case OP_WORD_BOUNDARY:
Guido van Rossum50700601997-12-08 17:15:20 +0000845 case OP_NOT_WORD_BOUNDARY_L:
846 case OP_WORD_BOUNDARY_L:
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000847 cc++;
848 break;
849
850 /* Skip over simple repeats with zero lower bound */
851
852 case OP_STAR:
853 case OP_MINSTAR:
854 case OP_QUERY:
855 case OP_MINQUERY:
Guido van Rossum50700601997-12-08 17:15:20 +0000856 case OP_NOTSTAR:
857 case OP_NOTMINSTAR:
858 case OP_NOTQUERY:
859 case OP_NOTMINQUERY:
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000860 case OP_TYPESTAR:
861 case OP_TYPEMINSTAR:
862 case OP_TYPEQUERY:
863 case OP_TYPEMINQUERY:
864 cc += 2;
865 break;
866
867 /* Skip over UPTOs (lower bound is zero) */
868
869 case OP_UPTO:
870 case OP_MINUPTO:
871 case OP_TYPEUPTO:
872 case OP_TYPEMINUPTO:
873 cc += 4;
874 break;
875
876 /* Check a class or a back reference for a zero minimum */
877
878 case OP_CLASS:
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000879 case OP_NEGCLASS:
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000880 case OP_REF:
Guido van Rossum50700601997-12-08 17:15:20 +0000881 case OP_CLASS_L:
882 switch(*cc)
883 {
884 case (OP_REF): cc += 2; break;
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000885 case (OP_CLASS): case (OP_NEGCLASS): cc += 1+32; break;
Guido van Rossum50700601997-12-08 17:15:20 +0000886 case (OP_CLASS_L): cc += 1+1+32; break;
887 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000888
889 switch (*cc)
890 {
891 case OP_CRSTAR:
892 case OP_CRMINSTAR:
893 case OP_CRQUERY:
894 case OP_CRMINQUERY:
895 cc++;
896 break;
897
898 case OP_CRRANGE:
899 case OP_CRMINRANGE:
900 if ((cc[1] << 8) + cc[2] != 0) goto NEXT_BRANCH;
901 cc += 3;
902 break;
903
904 default:
905 goto NEXT_BRANCH;
906 }
907 break;
908
909 /* Anything else matches at least one character */
910
911 default:
912 goto NEXT_BRANCH;
913 }
914 }
915
916 NEXT_BRANCH:
917 code += (code[1] << 8) + code[2];
918 }
919while (*code == OP_ALT);
920
921/* No branches match the empty string */
922
923return FALSE;
924}
925
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000926/* Determine the length of a group ID in an expression like
927 (?P<foo_123>...)
928Arguments:
929 ptr pattern position pointer (say that 3 times fast)
930 finalchar the character that will mark the end of the ID
931 errorptr points to the pointer to the error message
932*/
933
934static int
Guido van Rossum58132c61997-12-17 00:24:13 +0000935get_group_id(const uschar *ptr, char finalchar, const char **errorptr)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000936{
Guido van Rossum58132c61997-12-17 00:24:13 +0000937 const uschar *start = ptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000938
939 /* If the first character is not in \w, or is in \w but is a digit,
940 report an error */
941 if (!(pcre_ctypes[*ptr] & ctype_word) ||
942 (pcre_ctypes[*ptr++] & ctype_digit))
943 {
944 *errorptr = "(?P identifier must start with a letter or underscore";
945 return 0;
946 }
947
948 /* Increment ptr until we either hit a null byte, the desired
949 final character, or a non-word character */
950 for(; (*ptr != 0) && (*ptr != finalchar) &&
951 (pcre_ctypes[*ptr] & ctype_word); ptr++)
952 {
Guido van Rossumc3861071997-10-08 02:07:40 +0000953 /* Empty loop body */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000954 }
955 if (*ptr==finalchar)
956 return ptr-start;
957 if (*ptr==0)
958 {
959 *errorptr = "unterminated (?P identifier";
960 return 0;
961 }
962 *errorptr = "illegal character in (?P identifier";
963 return 0;
964}
965
966/*************************************************
967* Handle escapes *
968*************************************************/
969
970/* This function is called when a \ has been encountered. It either returns a
971positive value for a simple escape such as \n, or a negative value which
972encodes one of the more complicated things such as \d. On entry, ptr is
973pointing at the \. On exit, it is on the final character of the escape
974sequence.
975
976Arguments:
977 ptrptr points to the pattern position pointer
978 errorptr points to the pointer to the error message
Guido van Rossum50700601997-12-08 17:15:20 +0000979 bracount number of previous extracting brackets
980 options the options bits
981 isclass TRUE if inside a character class
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000982
983Returns: zero or positive => a data character
984 negative => a special escape sequence
985 on error, errorptr is set
986*/
987
988static int
Guido van Rossum58132c61997-12-17 00:24:13 +0000989check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
990 int options, BOOL isclass)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000991{
Guido van Rossum58132c61997-12-17 00:24:13 +0000992const uschar *ptr = *ptrptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000993int c = *(++ptr) & 255; /* Ensure > 0 on signed-char systems */
994int i;
995
Guido van Rossum50700601997-12-08 17:15:20 +0000996if (c == 0) *errorptr = ERR1;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000997
998/* Digits or letters may have special meaning; all others are literals. */
999
1000else if (c < '0' || c > 'z') {}
1001
1002/* Do an initial lookup in a table. A non-zero result is something that can be
1003returned immediately. Otherwise further processing may be required. */
1004
1005else if ((i = escapes[c - '0']) != 0) c = i;
1006
1007/* Escapes that need further processing, or are illegal. */
1008
Guido van Rossum50700601997-12-08 17:15:20 +00001009else
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001010 {
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001011
Guido van Rossum50700601997-12-08 17:15:20 +00001012 switch (c)
1013 {
1014 /* The handling of escape sequences consisting of a string of digits
1015 starting with one that is not zero is not straightforward. By experiment,
1016 the way Perl works seems to be as follows:
1017
1018 Outside a character class, the digits are read as a decimal number. If the
1019 number is less than 10, or if there are that many previous extracting
1020 left brackets, then it is a back reference. Otherwise, up to three octal
1021 digits are read to form an escaped byte. Thus \123 is likely to be octal
1022 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
1023 value is greater than 377, the least significant 8 bits are taken. Inside a
1024 character class, \ followed by a digit is always an octal number. */
1025
1026 case '1': case '2': case '3': case '4': case '5':
1027 case '6': case '7': case '8': case '9':
1028
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001029 {
1030 /* PYTHON: Try to compute an octal value for a character */
Guido van Rossum042ff9e1998-04-03 21:13:31 +00001031 for(c=0, i=0; ptr[i]!=0 && i<3; i++)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001032 {
1033 if (( pcre_ctypes[ ptr[i] ] & ctype_odigit) != 0)
1034 c = c * 8 + ptr[i]-'0';
1035 else
Guido van Rossum042ff9e1998-04-03 21:13:31 +00001036 break; /* Non-octal character--break out of the loop */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001037 }
Guido van Rossum042ff9e1998-04-03 21:13:31 +00001038 /* It's a character if there were exactly 3 octal digits, or if
1039 we're inside a character class and there was at least one
1040 octal digit. */
1041 if ( (i == 3) || (isclass && i!=0) )
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001042 {
1043 ptr += i-1;
1044 break;
1045 }
1046 c = ptr[0]; /* Restore the first character after the \ */
1047 c -= '0'; i = 1;
1048 while (i<2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0)
1049 {
1050 c = c * 10 + ptr[1] - '0';
1051 ptr++; i++;
1052 }
1053 if (c > 255 - ESC_REF) *errorptr = "back reference too big";
1054 c = -(ESC_REF + c);
1055 }
1056 break;
1057
Guido van Rossum50700601997-12-08 17:15:20 +00001058 /* \0 always starts an octal number, but we may drop through to here with a
1059 larger first octal digit */
1060
1061 case '0':
1062 c -= '0';
1063 while(i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0 &&
1064 ptr[1] != '8' && ptr[1] != '9')
1065 c = c * 8 + *(++ptr) - '0';
1066 break;
1067
1068 /* Special escapes not starting with a digit are straightforward */
1069
1070 case 'x':
1071 c = 0;
1072 while ( (pcre_ctypes[ptr[1]] & ctype_xdigit) != 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001073 {
Guido van Rossum50700601997-12-08 17:15:20 +00001074 ptr++;
1075 c = c * 16 + pcre_lcc[*ptr] -
1076 (((pcre_ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
1077 c &= 255;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001078 }
1079 break;
1080
1081
Guido van Rossum50700601997-12-08 17:15:20 +00001082 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1083 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
1084 for Perl compatibility, it is a literal. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001085
Guido van Rossum50700601997-12-08 17:15:20 +00001086 default:
1087 if ((options & PCRE_EXTRA) != 0) switch(c)
1088 {
1089 case 'X':
1090 c = -ESC_X; /* This could be a lookup if it ever got into Perl */
1091 break;
1092
1093 default:
1094 *errorptr = ERR3;
1095 break;
1096 }
1097 break;
1098 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001099 }
1100
1101*ptrptr = ptr;
1102return c;
1103}
1104
1105
1106
1107/*************************************************
Guido van Rossum50700601997-12-08 17:15:20 +00001108* Check for counted repeat *
1109*************************************************/
1110
1111/* This function is called when a '{' is encountered in a place where it might
1112start a quantifier. It looks ahead to see if it really is a quantifier or not.
1113It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
1114where the ddds are digits.
1115
1116Arguments:
1117 p pointer to the first char after '{'
1118
1119Returns: TRUE or FALSE
1120*/
1121
1122static BOOL
Guido van Rossum58132c61997-12-17 00:24:13 +00001123is_counted_repeat(const uschar *p)
Guido van Rossum50700601997-12-08 17:15:20 +00001124{
1125if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
1126while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
1127if (*p == '}') return TRUE;
1128
1129if (*p++ != ',') return FALSE;
1130if (*p == '}') return TRUE;
1131
1132if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
1133while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
1134return (*p == '}');
1135}
1136
1137
1138
1139/*************************************************
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001140* Read repeat counts *
1141*************************************************/
1142
Guido van Rossum50700601997-12-08 17:15:20 +00001143/* Read an item of the form {n,m} and return the values. This is called only
1144after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1145so the syntax is guaranteed to be correct, but we need to check the values.
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001146
1147Arguments:
1148 p pointer to first char after '{'
1149 minp pointer to int for min
1150 maxp pointer to int for max
1151 returned as -1 if no max
1152 errorptr points to pointer to error message
1153
1154Returns: pointer to '}' on success;
1155 current ptr on error, with errorptr set
1156*/
1157
Guido van Rossum58132c61997-12-17 00:24:13 +00001158static const uschar *
1159read_repeat_counts(const uschar *p, int *minp, int *maxp, const char **errorptr)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001160{
1161int min = 0;
1162int max = -1;
1163
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001164while ((pcre_ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
1165
1166if (*p == '}') max = min; else
1167 {
Guido van Rossum50700601997-12-08 17:15:20 +00001168 if (*(++p) != '}')
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001169 {
1170 max = 0;
1171 while((pcre_ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001172 if (max < min)
1173 {
Guido van Rossum50700601997-12-08 17:15:20 +00001174 *errorptr = ERR4;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001175 return p;
1176 }
1177 }
1178 }
1179
1180/* Do paranoid checks, then fill in the required variables, and pass back the
1181pointer to the terminating '}'. */
1182
Guido van Rossum50700601997-12-08 17:15:20 +00001183if (min > 65535 || max > 65535)
1184 *errorptr = ERR5;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001185else
1186 {
1187 *minp = min;
1188 *maxp = max;
1189 }
1190return p;
1191}
1192
1193
1194
1195/*************************************************
1196* Compile one branch *
1197*************************************************/
1198
1199/* Scan the pattern, compiling it into the code vector.
1200
1201Arguments:
Guido van Rossum50700601997-12-08 17:15:20 +00001202 options the option bits
1203 bracket points to number of brackets used
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001204 code points to the pointer to the current code point
1205 ptrptr points to the current pattern pointer
1206 errorptr points to pointer to error message
1207
1208Returns: TRUE on success
1209 FALSE, with *errorptr set on error
1210*/
1211
1212static BOOL
Guido van Rossum50700601997-12-08 17:15:20 +00001213compile_branch(int options, int *brackets, uschar **codeptr,
Guido van Rossum58132c61997-12-17 00:24:13 +00001214 const uschar **ptrptr, const char **errorptr, PyObject *dictionary)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001215{
1216int repeat_type, op_type;
1217int repeat_min, repeat_max;
1218int bravalue, length;
1219register int c;
1220register uschar *code = *codeptr;
Guido van Rossum58132c61997-12-17 00:24:13 +00001221const uschar *ptr = *ptrptr;
1222const uschar *oldptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001223uschar *previous = NULL;
Guido van Rossum50700601997-12-08 17:15:20 +00001224uschar class[32];
1225uschar *class_flag; /* Pointer to the single-byte flag for OP_CLASS_L */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001226
1227/* Switch on next character until the end of the branch */
1228
1229for (;; ptr++)
1230 {
Guido van Rossum50700601997-12-08 17:15:20 +00001231 BOOL negate_class;
1232 int class_charcount;
1233 int class_lastchar;
1234
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001235 c = *ptr;
Guido van Rossum50700601997-12-08 17:15:20 +00001236 if ((options & PCRE_EXTENDED) != 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001237 {
1238 if ((pcre_ctypes[c] & ctype_space) != 0) continue;
1239 if (c == '#')
1240 {
1241 while ((c = *(++ptr)) != 0 && c != '\n');
1242 continue;
1243 }
1244 }
1245
1246 switch(c)
1247 {
1248 /* The branch terminates at end of string, |, or ). */
1249
1250 case 0:
1251 case '|':
1252 case ')':
1253 *codeptr = code;
1254 *ptrptr = ptr;
1255 return TRUE;
1256
1257 /* Handle single-character metacharacters */
1258
1259 case '^':
1260 previous = NULL;
1261 *code++ = OP_CIRC;
1262 break;
1263
1264 case '$':
1265 previous = NULL;
1266 *code++ = OP_DOLL;
1267 break;
1268
1269 case '.':
1270 previous = code;
1271 *code++ = OP_ANY;
1272 break;
1273
Guido van Rossum50700601997-12-08 17:15:20 +00001274 /* Character classes. These always build a 32-byte bitmap of the permitted
1275 characters, except in the special case where there is only one character.
1276 For negated classes, we build the map as usual, then invert it at the end.
1277 */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001278
1279 case '[':
Guido van Rossum50700601997-12-08 17:15:20 +00001280 previous = code;
1281 if (options & PCRE_LOCALE)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001282 {
Guido van Rossum50700601997-12-08 17:15:20 +00001283 *code++ = OP_CLASS_L;
1284 /* Set the flag for localized classes (like \w) to 0 */
1285 class_flag = code;
1286 *class_flag = 0;
1287 }
1288 else
1289 {
1290 *code++ = OP_CLASS;
1291 class_flag = NULL;
1292 }
1293
Guido van Rossum042ff9e1998-04-03 21:13:31 +00001294 /* If the first character is '^', set the negation flag, and use a
1295 different opcode. This only matters if caseless matching is specified at
1296 runtime. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001297
Guido van Rossum50700601997-12-08 17:15:20 +00001298 if ((c = *(++ptr)) == '^')
1299 {
1300 negate_class = TRUE;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00001301 if (*(code-1)==OP_CLASS) *(code-1) = OP_NEGCLASS;
Guido van Rossum50700601997-12-08 17:15:20 +00001302 c = *(++ptr);
1303 }
1304 else negate_class = FALSE;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001305
Guido van Rossum50700601997-12-08 17:15:20 +00001306 /* Keep a count of chars so that we can optimize the case of just a single
1307 character. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001308
Guido van Rossum50700601997-12-08 17:15:20 +00001309 class_charcount = 0;
1310 class_lastchar = -1;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001311
Guido van Rossum50700601997-12-08 17:15:20 +00001312 /* Initialize the 32-char bit map to all zeros. We have to build the
1313 map in a temporary bit of store, in case the class contains only 1
1314 character, because in that case the compiled code doesn't use the
1315 bit map. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001316
Guido van Rossum50700601997-12-08 17:15:20 +00001317 memset(class, 0, 32 * sizeof(uschar));
1318
1319 /* Process characters until ] is reached. By writing this as a "do" it
1320 means that an initial ] is taken as a data character. */
1321
1322 do
1323 {
1324 if (c == 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001325 {
Guido van Rossum50700601997-12-08 17:15:20 +00001326 *errorptr = ERR6;
1327 goto FAILED;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001328 }
1329
Guido van Rossum50700601997-12-08 17:15:20 +00001330 /* Backslash may introduce a single character, or it may introduce one
1331 of the specials, which just set a flag. Escaped items are checked for
1332 validity in the pre-compiling pass. The sequence \b is a special case.
Guido van Rossum58132c61997-12-17 00:24:13 +00001333 Inside a class (and only there) it is treated as backspace. Elsewhere
Guido van Rossum50700601997-12-08 17:15:20 +00001334 it marks a word boundary. Other escapes have preset maps ready to
1335 or into the one we are building. We assume they have more than one
1336 character in them, so set class_count bigger than one. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001337
Guido van Rossum50700601997-12-08 17:15:20 +00001338 if (c == '\\')
1339 {
1340 c = check_escape(&ptr, errorptr, *brackets, options, TRUE);
1341 if (-c == ESC_b) c = '\b';
1342 else if (c < 0)
1343 {
1344 class_charcount = 10;
1345 switch (-c)
1346 {
1347 case ESC_d:
Guido van Rossum50700601997-12-08 17:15:20 +00001348 {
1349 for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_digit];
1350 }
1351 continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001352
Guido van Rossum50700601997-12-08 17:15:20 +00001353 case ESC_D:
Guido van Rossum50700601997-12-08 17:15:20 +00001354 {
1355 for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_digit];
1356 }
1357 continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001358
Guido van Rossum50700601997-12-08 17:15:20 +00001359 case ESC_w:
1360 if (options & PCRE_LOCALE)
1361 {
1362 *class_flag |= 1;
1363 }
1364 else
1365 {
1366 for (c = 0; c < 32; c++)
1367 class[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
1368 }
1369 continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001370
Guido van Rossum50700601997-12-08 17:15:20 +00001371 case ESC_W:
1372 if (options & PCRE_LOCALE)
1373 {
1374 *class_flag |= 2;
1375 }
1376 else
1377 {
1378 for (c = 0; c < 32; c++)
1379 class[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
1380 }
1381 continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001382
Guido van Rossum50700601997-12-08 17:15:20 +00001383 case ESC_s:
Guido van Rossum50700601997-12-08 17:15:20 +00001384 {
1385 for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_space];
1386 }
1387 continue;
1388
1389 case ESC_S:
Guido van Rossum50700601997-12-08 17:15:20 +00001390 {
1391 for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_space];
1392 }
1393 continue;
1394
1395 default:
1396 *errorptr = ERR7;
1397 goto FAILED;
1398 }
1399 }
1400 /* Fall through if single character */
1401 }
1402
1403 /* A single character may be followed by '-' to form a range. However,
1404 Perl does not permit ']' to be the end of the range. A '-' character
1405 here is treated as a literal. */
1406
1407 if (ptr[1] == '-' && ptr[2] != ']')
1408 {
1409 int d;
1410 ptr += 2;
1411 d = *ptr;
1412
1413 if (d == 0)
1414 {
1415 *errorptr = ERR6;
1416 goto FAILED;
1417 }
1418
1419 /* The second part of a range can be a single-character escape, but
1420 not any of the other escapes. */
1421
1422 if (d == '\\')
1423 {
1424 d = check_escape(&ptr, errorptr, *brackets, options, TRUE);
1425 if (d < 0)
1426 {
1427 if (d == -ESC_b) d = '\b'; else
1428 {
1429 *errorptr = ERR7;
1430 goto FAILED;
1431 }
1432 }
1433 }
1434
1435 if (d < c)
1436 {
1437 *errorptr = ERR8;
1438 goto FAILED;
1439 }
1440
1441 for (; c <= d; c++)
1442 {
1443 class[c/8] |= (1 << (c&7));
1444 if ((options & PCRE_CASELESS) != 0)
1445 {
1446 int uc = pcre_fcc[c]; /* flip case */
1447 class[uc/8] |= (1 << (uc&7));
1448 }
1449 class_charcount++; /* in case a one-char range */
1450 class_lastchar = c;
1451 }
1452 continue; /* Go get the next char in the class */
1453 }
1454
1455 /* Handle a lone single character - we can get here for a normal
1456 non-escape char, or after \ that introduces a single character. */
1457
1458 class [c/8] |= (1 << (c&7));
1459 if ((options & PCRE_CASELESS) != 0)
1460 {
1461 c = pcre_fcc[c]; /* flip case */
1462 class[c/8] |= (1 << (c&7));
1463 }
1464 class_charcount++;
1465 class_lastchar = c;
1466 }
1467
1468 /* Loop until ']' reached; the check for end of string happens inside the
1469 loop. This "while" is the end of the "do" above. */
1470
1471 while ((c = *(++ptr)) != ']');
1472
1473 /* If class_charcount is 1 and class_lastchar is not negative, we saw
1474 precisely one character. This doesn't need the whole 32-byte bit map.
1475 We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
1476 it's negative. */
1477
1478 if (class_charcount == 1 && class_lastchar >= 0)
1479 {
1480 if (negate_class)
1481 {
1482 code[-1] = OP_NOT;
1483 }
1484 else
1485 {
1486 code[-1] = OP_CHARS;
1487 *code++ = 1;
1488 }
1489 *code++ = class_lastchar;
1490 }
1491
1492 /* Otherwise, negate the 32-byte map if necessary, and copy it into
1493 the code vector. */
1494
1495 else
1496 {
1497 /* If this is a localized opcode, bump the code pointer up */
1498 if (class_flag) code++;
1499 if (negate_class)
1500 {
1501 if (class_flag) *class_flag = (*class_flag) ^ 63;
1502 for (c = 0; c < 32; c++) code[c] = ~class[c];
1503 }
1504 else
1505 memcpy(code, class, 32);
1506 code += 32;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001507 }
1508 break;
1509
1510 /* Various kinds of repeat */
1511
1512 case '{':
Guido van Rossum50700601997-12-08 17:15:20 +00001513 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001514 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
1515 if (*errorptr != NULL) goto FAILED;
1516 goto REPEAT;
1517
1518 case '*':
1519 repeat_min = 0;
1520 repeat_max = -1;
1521 goto REPEAT;
1522
1523 case '+':
1524 repeat_min = 1;
1525 repeat_max = -1;
1526 goto REPEAT;
1527
1528 case '?':
1529 repeat_min = 0;
1530 repeat_max = 1;
1531
1532 REPEAT:
1533 if (previous == NULL)
1534 {
Guido van Rossum50700601997-12-08 17:15:20 +00001535 *errorptr = ERR9;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001536 goto FAILED;
1537 }
1538
1539 /* If the next character is '?' this is a minimizing repeat. Advance to the
1540 next character. */
1541
1542 if (ptr[1] == '?') { repeat_type = 1; ptr++; } else repeat_type = 0;
1543
Guido van Rossum50700601997-12-08 17:15:20 +00001544 /* If the maximum is zero then the minimum must also be zero; Perl allows
1545 this case, so we do too - by simply omitting the item altogether. */
1546
1547 if (repeat_max == 0) code = previous;
1548
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001549 /* If previous was a string of characters, chop off the last one and use it
1550 as the subject of the repeat. If there was only one character, we can
1551 abolish the previous item altogether. */
1552
Guido van Rossum50700601997-12-08 17:15:20 +00001553 else if (*previous == OP_CHARS)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001554 {
1555 int len = previous[1];
1556 if (len == 1)
1557 {
1558 c = previous[2];
1559 code = previous;
1560 }
1561 else
1562 {
1563 c = previous[len+1];
1564 previous[1]--;
1565 code--;
1566 }
1567 op_type = 0; /* Use single-char op codes */
1568 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
1569 }
1570
Guido van Rossum50700601997-12-08 17:15:20 +00001571 /* If previous was a single negated character ([^a] or similar), we use
1572 one of the special opcodes, replacing it. The code is shared with single-
1573 character repeats by adding a suitable offset into repeat_type. */
1574
1575 else if ((int)*previous == OP_NOT)
1576 {
1577 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
1578 c = previous[1];
1579 code = previous;
1580 goto OUTPUT_SINGLE_REPEAT;
1581 }
1582
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001583 /* If previous was a character type match (\d or similar), abolish it and
1584 create a suitable repeat item. The code is shared with single-character
1585 repeats by adding a suitable offset into repeat_type. */
1586
Guido van Rossum50700601997-12-08 17:15:20 +00001587 else if ((int)*previous < OP_CIRC || *previous == OP_ANY)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001588 {
1589 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
1590 c = *previous;
1591 code = previous;
1592
1593 OUTPUT_SINGLE_REPEAT:
1594 repeat_type += op_type; /* Combine both values for many cases */
1595
1596 /* A minimum of zero is handled either as the special case * or ?, or as
1597 an UPTO, with the maximum given. */
1598
1599 if (repeat_min == 0)
1600 {
1601 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
1602 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
1603 else
1604 {
1605 *code++ = OP_UPTO + repeat_type;
1606 *code++ = repeat_max >> 8;
1607 *code++ = (repeat_max & 255);
1608 }
1609 }
1610
1611 /* The case {1,} is handled as the special case + */
1612
1613 else if (repeat_min == 1 && repeat_max == -1)
1614 *code++ = OP_PLUS + repeat_type;
1615
1616 /* The case {n,n} is just an EXACT, while the general case {n,m} is
1617 handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
1618
1619 else
1620 {
1621 if (repeat_min != 1)
1622 {
1623 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
1624 *code++ = repeat_min >> 8;
1625 *code++ = (repeat_min & 255);
1626 }
1627
1628 /* If the mininum is 1 and the previous item was a character string,
1629 we either have to put back the item that got cancelled if the string
1630 length was 1, or add the character back onto the end of a longer
1631 string. For a character type nothing need be done; it will just get put
1632 back naturally. */
1633
1634 else if (*previous == OP_CHARS)
1635 {
1636 if (code == previous) code += 2; else previous[1]++;
1637 }
1638
Guido van Rossum557dea11997-12-22 22:46:52 +00001639 /* If the maximum is unlimited, insert an OP_STAR. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001640
Guido van Rossum557dea11997-12-22 22:46:52 +00001641 if (repeat_max < 0)
1642 {
1643 *code++ = c;
1644 *code++ = OP_STAR + repeat_type;
1645 }
1646
1647 /* Else insert an UPTO if the max is greater than the min. */
1648
1649 else if (repeat_max != repeat_min)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001650 {
1651 *code++ = c;
1652 repeat_max -= repeat_min;
1653 *code++ = OP_UPTO + repeat_type;
1654 *code++ = repeat_max >> 8;
1655 *code++ = (repeat_max & 255);
1656 }
1657 }
1658
1659 /* The character or character type itself comes last in all cases. */
1660
1661 *code++ = c;
1662 }
1663
1664 /* If previous was a character class or a back reference, we put the repeat
1665 stuff after it. */
1666
Guido van Rossum042ff9e1998-04-03 21:13:31 +00001667 else if (*previous == OP_CLASS || *previous == OP_NEGCLASS ||
1668 *previous==OP_CLASS_L || *previous == OP_REF)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001669 {
1670 if (repeat_min == 0 && repeat_max == -1)
1671 *code++ = OP_CRSTAR + repeat_type;
1672 else if (repeat_min == 1 && repeat_max == -1)
1673 *code++ = OP_CRPLUS + repeat_type;
1674 else if (repeat_min == 0 && repeat_max == 1)
1675 *code++ = OP_CRQUERY + repeat_type;
1676 else
1677 {
1678 *code++ = OP_CRRANGE + repeat_type;
1679 *code++ = repeat_min >> 8;
1680 *code++ = repeat_min & 255;
1681 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
1682 *code++ = repeat_max >> 8;
1683 *code++ = repeat_max & 255;
1684 }
1685 }
1686
1687 /* If previous was a bracket group, we may have to replicate it in certain
1688 cases. If the maximum repeat count is unlimited, check that the bracket
1689 group cannot match the empty string, and diagnose an error if it can. */
1690
1691 else if ((int)*previous >= OP_BRA)
1692 {
1693 int i;
Guido van Rossum557dea11997-12-22 22:46:52 +00001694 int len = code - previous;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001695
1696 if (repeat_max == -1 && could_be_empty(previous))
1697 {
Guido van Rossum50700601997-12-08 17:15:20 +00001698 *errorptr = ERR10;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001699 goto FAILED;
1700 }
1701
1702 /* If the minimum is greater than zero, and the maximum is unlimited or
1703 equal to the minimum, the first copy remains where it is, and is
1704 replicated up to the minimum number of times. This case includes the +
1705 repeat, but of course no replication is needed in that case. */
1706
1707 if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))
1708 {
1709 for (i = 1; i < repeat_min; i++)
1710 {
Guido van Rossum557dea11997-12-22 22:46:52 +00001711 memcpy(code, previous, len);
1712 code += len;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001713 }
1714 }
1715
1716 /* If the minimum is zero, stick BRAZERO in front of the first copy.
1717 Then, if there is a fixed upper limit, replicated up to that many times,
1718 sticking BRAZERO in front of all the optional ones. */
1719
1720 else
1721 {
1722 if (repeat_min == 0)
1723 {
Guido van Rossum557dea11997-12-22 22:46:52 +00001724 memmove(previous+1, previous, len);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001725 code++;
1726 *previous++ = OP_BRAZERO + repeat_type;
1727 }
1728
1729 for (i = 1; i < repeat_min; i++)
1730 {
Guido van Rossum557dea11997-12-22 22:46:52 +00001731 memcpy(code, previous, len);
1732 code += len;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001733 }
1734
1735 for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)
1736 {
1737 *code++ = OP_BRAZERO + repeat_type;
Guido van Rossum557dea11997-12-22 22:46:52 +00001738 memcpy(code, previous, len);
1739 code += len;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001740 }
1741 }
1742
1743 /* If the maximum is unlimited, set a repeater in the final copy. */
1744
1745 if (repeat_max == -1) code[-3] = OP_KETRMAX + repeat_type;
1746 }
1747
1748 /* Else there's some kind of shambles */
1749
1750 else
1751 {
Guido van Rossum50700601997-12-08 17:15:20 +00001752 *errorptr = ERR11;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001753 goto FAILED;
1754 }
1755
1756 /* In all case we no longer have a previous item. */
1757
1758 previous = NULL;
1759 break;
1760
1761
1762 /* Start of nested bracket sub-expression, or comment or lookahead.
1763 First deal with special things that can come after a bracket; all are
1764 introduced by ?, and the appearance of any of them means that this is not a
1765 referencing group. They were checked for validity in the first pass over
1766 the string, so we don't have to check for syntax errors here. */
1767
1768 case '(':
1769 previous = code; /* Only real brackets can be repeated */
1770 if (*(++ptr) == '?')
1771 {
1772 bravalue = OP_BRA;
1773
1774 switch (*(++ptr))
1775 {
1776 case '#':
1777 case 'i':
Guido van Rossumbd49ac41997-12-10 23:05:53 +00001778 case 'L':
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001779 case 'm':
1780 case 's':
1781 case 'x':
1782 ptr++;
1783 while (*ptr != ')') ptr++;
1784 previous = NULL;
1785 continue;
1786
1787 case ':': /* Non-extracting bracket */
1788 ptr++;
1789 break;
1790
1791 case '=': /* Assertions can't be repeated */
1792 bravalue = OP_ASSERT;
1793 ptr++;
1794 previous = NULL;
1795 break;
1796
1797 case '!':
1798 bravalue = OP_ASSERT_NOT;
1799 ptr++;
1800 previous = NULL;
1801 break;
1802
1803 case ('P'):
1804 ptr++;
1805 if (*ptr=='<')
1806 {
1807 /* (?P<groupname>...) */
1808 int idlen;
1809 PyObject *string, *intobj;
1810
1811 ptr++;
1812 idlen = get_group_id(ptr, '>', errorptr);
1813 if (*errorptr) {
1814 goto FAILED;
1815 }
Guido van Rossum57ba4f31997-12-02 20:40:28 +00001816 string = PyString_FromStringAndSize((char*)ptr, idlen);
Guido van Rossum50700601997-12-08 17:15:20 +00001817 intobj = PyInt_FromLong( brackets[0] + 1 );
Guido van Rossum58132c61997-12-17 00:24:13 +00001818 if (intobj == NULL || string == NULL)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001819 {
1820 Py_XDECREF(string);
1821 Py_XDECREF(intobj);
1822 *errorptr = "exception raised";
1823 goto FAILED;
1824 }
1825 PyDict_SetItem(dictionary, string, intobj);
Guido van Rossum58132c61997-12-17 00:24:13 +00001826 Py_DECREF(string); Py_DECREF(intobj); /* XXX DECREF commented out! */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001827 ptr += idlen+1; /* Point to rest of expression */
1828 goto do_grouping_bracket;
1829 }
1830 if (*ptr=='=')
1831 {
1832 /* (?P=groupname) */
1833 int idlen, refnum;
1834 PyObject *string, *intobj;
1835
1836 ptr++;
1837 idlen = get_group_id(ptr, ')', errorptr);
1838 if (*errorptr) {
1839 goto FAILED;
1840 }
Guido van Rossum50700601997-12-08 17:15:20 +00001841 string = PyString_FromStringAndSize((char *)ptr, idlen);
Guido van Rossumc3861071997-10-08 02:07:40 +00001842 if (string==NULL) {
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001843 *errorptr = "exception raised";
1844 goto FAILED;
1845 }
1846 intobj = PyDict_GetItem(dictionary, string);
1847 if (intobj==NULL) {
Guido van Rossumc3861071997-10-08 02:07:40 +00001848 Py_DECREF(string);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001849 *errorptr = "?P= group identifier isn't defined";
1850 goto FAILED;
1851 }
1852
1853 refnum = PyInt_AsLong(intobj);
Guido van Rossum1eadb411997-12-15 17:33:24 +00001854 Py_DECREF(string);
Guido van Rossum58132c61997-12-17 00:24:13 +00001855 /* The caller doesn't own the reference to the value
1856 returned from PyDict_GetItem, so intobj is not
1857 DECREF'ed. */
1858
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001859 *code++ = OP_REF;
1860 *code++ = refnum;
1861 /* The continue will cause the top-level for() loop to
1862 be resumed, so ptr will be immediately incremented.
1863 Therefore, the following line adds just idlen, not
1864 idlen+1 */
1865 ptr += idlen;
1866 continue;
1867 }
1868 /* The character after ?P is neither < nor =, so
1869 report an error. Add more Python-extensions here. */
1870 *errorptr="unknown after (?P";
1871 goto FAILED;
1872 break;
Guido van Rossum50700601997-12-08 17:15:20 +00001873
1874 case '>': /* "Match once" brackets */
1875 if ((options & PCRE_EXTRA) != 0) /* Not yet standard */
1876 {
1877 bravalue = OP_ONCE;
1878 ptr++;
1879 previous = NULL;
1880 break;
1881 }
1882 /* Else fall through */
1883
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001884 default:
Guido van Rossum50700601997-12-08 17:15:20 +00001885 *errorptr = ERR12;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001886 goto FAILED;
1887 }
1888 }
1889
1890 /* Else we have a referencing group */
1891
1892 else
1893 {
1894 do_grouping_bracket:
Guido van Rossum50700601997-12-08 17:15:20 +00001895 if (++(*brackets) > EXTRACT_MAX)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001896 {
Guido van Rossum50700601997-12-08 17:15:20 +00001897 *errorptr = ERR13;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001898 goto FAILED;
1899 }
Guido van Rossum50700601997-12-08 17:15:20 +00001900 bravalue = OP_BRA + *brackets;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001901 }
1902
1903 /* Process nested bracketed re; at end pointer is on the bracket. We copy
1904 code into a non-register variable in order to be able to pass its address
1905 because some compilers complain otherwise. */
1906
1907 *code = bravalue;
1908 {
1909 uschar *mcode = code;
Guido van Rossum50700601997-12-08 17:15:20 +00001910 if (!compile_regex(options, brackets, &mcode, &ptr, errorptr, dictionary))
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001911 goto FAILED;
1912 code = mcode;
1913 }
1914
1915 if (*ptr != ')')
1916 {
Guido van Rossum50700601997-12-08 17:15:20 +00001917 *errorptr = ERR14;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001918 goto FAILED;
1919 }
1920 break;
1921
1922 /* Check \ for being a real metacharacter; if not, fall through and handle
1923 it as a data character at the start of a string. Escape items are checked
1924 for validity in the pre-compiling pass. */
1925
1926 case '\\':
1927 oldptr = ptr;
Guido van Rossum50700601997-12-08 17:15:20 +00001928 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001929
1930 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1931 are arranged to be the negation of the corresponding OP_values. For the
1932 back references, the values are ESC_REF plus the reference number. Only
1933 back references and those types that consume a character may be repeated.
1934 We can test for values between ESC_b and ESC_Z for the latter; this may
1935 have to change if any new ones are ever created. */
1936
1937 if (c < 0)
1938 {
1939 if (-c >= ESC_REF)
1940 {
Guido van Rossum50700601997-12-08 17:15:20 +00001941 int refnum = -c - ESC_REF;
1942 if (*brackets < refnum)
1943 {
1944 *errorptr = ERR15;
1945 goto FAILED;
1946 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001947 previous = code;
1948 *code++ = OP_REF;
1949 *code++ = refnum;
1950 }
1951 else
1952 {
Guido van Rossum50700601997-12-08 17:15:20 +00001953 previous = (-c > ESC_b && -c < ESC_X)? code : NULL;
1954 if ( (options & PCRE_LOCALE) != 0)
1955 {
1956 switch (c)
1957 {
1958 case (-ESC_b): c = -OP_WORD_BOUNDARY_L; break;
1959 case (-ESC_B): c = -OP_NOT_WORD_BOUNDARY_L; break;
1960 case (-ESC_w): c = -OP_WORDCHAR_L; break;
1961 case (-ESC_W): c = -OP_NOT_WORDCHAR_L; break;
1962 }
1963 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001964 *code++ = -c;
1965 }
1966 continue;
1967 }
1968
Guido van Rossum58132c61997-12-17 00:24:13 +00001969 /* Data character: Reset and fall through */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001970
1971 ptr = oldptr;
1972 c = '\\';
1973
1974 /* Handle a run of data characters until a metacharacter is encountered.
1975 The first character is guaranteed not to be whitespace or # when the
1976 extended flag is set. */
1977
Guido van Rossum50700601997-12-08 17:15:20 +00001978 NORMAL_CHAR:
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001979 default:
1980 previous = code;
1981 *code = OP_CHARS;
1982 code += 2;
1983 length = 0;
1984
1985 do
1986 {
Guido van Rossum50700601997-12-08 17:15:20 +00001987 if ((options & PCRE_EXTENDED) != 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001988 {
1989 if ((pcre_ctypes[c] & ctype_space) != 0) continue;
1990 if (c == '#')
1991 {
1992 while ((c = *(++ptr)) != 0 && c != '\n');
1993 if (c == 0) break;
1994 continue;
1995 }
1996 }
1997
1998 /* Backslash may introduce a data char or a metacharacter. Escaped items
1999 are checked for validity in the pre-compiling pass. Stop the string
2000 before a metaitem. */
2001
2002 if (c == '\\')
2003 {
2004 oldptr = ptr;
Guido van Rossum50700601997-12-08 17:15:20 +00002005 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002006 if (c < 0) { ptr = oldptr; break; }
2007 }
2008
2009 /* Ordinary character or single-char escape */
2010
2011 *code++ = c;
2012 length++;
2013 }
2014
2015 /* This "while" is the end of the "do" above. */
2016
2017 while (length < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
2018
2019 /* Compute the length and set it in the data vector, and advance to
2020 the next state. */
2021
2022 previous[1] = length;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00002023 if (length < 255) ptr--;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002024 break;
2025 }
2026 } /* end of big loop */
2027
2028/* Control never reaches here by falling through, only by a goto for all the
2029error states. Pass back the position in the pattern so that it can be displayed
2030to the user for diagnosing the error. */
2031
2032FAILED:
2033*ptrptr = ptr;
2034return FALSE;
2035}
2036
2037
2038
2039
2040/*************************************************
2041* Compile sequence of alternatives *
2042*************************************************/
2043
2044/* On entry, ptr is pointing past the bracket character, but on return
2045it points to the closing bracket, or vertical bar, or end of string.
2046The code variable is pointing at the byte into which the BRA operator has been
2047stored.
2048
2049Argument:
Guido van Rossum50700601997-12-08 17:15:20 +00002050 options the option bits
2051 brackets -> int containing the number of extracting brackets used
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002052 codeptr -> the address of the current code pointer
2053 ptrptr -> the address of the current pattern pointer
2054 errorptr -> pointer to error message
2055
2056Returns: TRUE on success
2057*/
2058
2059static BOOL
Guido van Rossum50700601997-12-08 17:15:20 +00002060compile_regex(int options, int *brackets, uschar **codeptr,
Guido van Rossum58132c61997-12-17 00:24:13 +00002061 const uschar **ptrptr, const char **errorptr, PyObject *dictionary)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002062{
Guido van Rossum58132c61997-12-17 00:24:13 +00002063const uschar *ptr = *ptrptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002064uschar *code = *codeptr;
2065uschar *start_bracket = code;
2066
2067for (;;)
2068 {
2069 int length;
2070 uschar *last_branch = code;
2071
2072 code += 3;
Guido van Rossum50700601997-12-08 17:15:20 +00002073 if (!compile_branch(options, brackets, &code, &ptr, errorptr, dictionary))
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002074 {
2075 *ptrptr = ptr;
2076 return FALSE;
2077 }
2078
2079 /* Fill in the length of the last branch */
2080
2081 length = code - last_branch;
2082 last_branch[1] = length >> 8;
2083 last_branch[2] = length & 255;
2084
2085 /* Reached end of expression, either ')' or end of pattern. Insert a
2086 terminating ket and the length of the whole bracketed item, and return,
2087 leaving the pointer at the terminating char. */
2088
2089 if (*ptr != '|')
2090 {
2091 length = code - start_bracket;
2092 *code++ = OP_KET;
2093 *code++ = length >> 8;
2094 *code++ = length & 255;
2095 *codeptr = code;
2096 *ptrptr = ptr;
2097 return TRUE;
2098 }
2099
2100 /* Another branch follows; insert an "or" node and advance the pointer. */
2101
2102 *code = OP_ALT;
2103 ptr++;
2104 }
2105/* Control never reaches here */
2106}
2107
2108
2109
2110/*************************************************
2111* Check for anchored expression *
2112*************************************************/
2113
2114/* Try to find out if this is an anchored regular expression. Consider each
2115alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
2116all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
2117it's anchored. However, if this is a multiline pattern, then only OP_SOD
2118counts, since OP_CIRC can match in the middle.
2119
2120A branch is also implicitly anchored if it starts with .* because that will try
2121the rest of the pattern at all possible matching points, so there is no point
2122trying them again.
2123
2124Argument: points to start of expression (the bracket)
2125Returns: TRUE or FALSE
2126*/
2127
2128static BOOL
Guido van Rossum58132c61997-12-17 00:24:13 +00002129is_anchored(register const uschar *code, BOOL multiline)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002130{
2131do {
2132 int op = (int)code[3];
Guido van Rossum50700601997-12-08 17:15:20 +00002133 if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002134 { if (!is_anchored(code+3, multiline)) return FALSE; }
2135 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
2136 { if (code[4] != OP_ANY) return FALSE; }
2137 else if (op != OP_SOD && (multiline || op != OP_CIRC)) return FALSE;
2138 code += (code[1] << 8) + code[2];
2139 }
2140while (*code == OP_ALT);
2141return TRUE;
2142}
2143
2144
2145
2146/*************************************************
2147* Check for start with \n line expression *
2148*************************************************/
2149
2150/* This is called for multiline expressions to try to find out if every branch
2151starts with ^ so that "first char" processing can be done to speed things up.
2152
2153Argument: points to start of expression (the bracket)
2154Returns: TRUE or FALSE
2155*/
2156
2157static BOOL
Guido van Rossum58132c61997-12-17 00:24:13 +00002158is_startline(const uschar *code)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002159{
2160do {
2161 if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
2162 { if (!is_startline(code+3)) return FALSE; }
2163 else if (code[3] != OP_CIRC) return FALSE;
2164 code += (code[1] << 8) + code[2];
2165 }
2166while (*code == OP_ALT);
2167return TRUE;
2168}
2169
2170
2171
2172/*************************************************
2173* Check for fixed first char *
2174*************************************************/
2175
2176/* Try to find out if there is a fixed first character. This is called for
2177unanchored expressions, as it speeds up their processing quite considerably.
2178Consider each alternative branch. If they all start with the same char, or with
2179a bracket all of whose alternatives start with the same char (recurse ad lib),
2180then we return that char, otherwise -1.
2181
2182Argument: points to start of expression (the bracket)
2183Returns: -1 or the fixed first char
2184*/
2185
2186static int
2187find_firstchar(uschar *code)
2188{
2189register int c = -1;
2190do
2191 {
2192 register int charoffset = 4;
2193
2194 if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
2195 {
2196 register int d;
2197 if ((d = find_firstchar(code+3)) < 0) return -1;
2198 if (c < 0) c = d; else if (c != d) return -1;
2199 }
2200
2201 else switch(code[3])
2202 {
2203 default:
2204 return -1;
2205
2206 case OP_EXACT: /* Fall through */
2207 charoffset++;
2208
2209 case OP_CHARS: /* Fall through */
2210 charoffset++;
2211
2212 case OP_PLUS:
2213 case OP_MINPLUS:
2214 if (c < 0) c = code[charoffset]; else if (c != code[charoffset]) return -1;
2215 break;
2216 }
2217 code += (code[1] << 8) + code[2];
2218 }
2219while (*code == OP_ALT);
2220return c;
2221}
2222
2223
2224
2225/*************************************************
2226* Compile a Regular Expression *
2227*************************************************/
2228
2229/* This function takes a string and returns a pointer to a block of store
2230holding a compiled version of the expression.
2231
2232Arguments:
2233 pattern the regular expression
2234 options various option bits
2235 errorptr pointer to pointer to error text
2236 erroroffset ptr offset in pattern where error was detected
2237
2238Returns: pointer to compiled data block, or NULL on error,
2239 with errorptr and erroroffset set
2240*/
2241
2242pcre *
Guido van Rossum58132c61997-12-17 00:24:13 +00002243pcre_compile(const char *pattern, int options, const char **errorptr,
Guido van Rossum50700601997-12-08 17:15:20 +00002244 int *erroroffset, PyObject *dictionary)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002245{
2246real_pcre *re;
2247int spaces = 0;
2248int length = 3; /* For initial BRA plus length */
2249int runlength;
2250int c, size;
Guido van Rossum50700601997-12-08 17:15:20 +00002251int bracount = 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002252int brastack[200];
Guido van Rossum50700601997-12-08 17:15:20 +00002253int top_backref = 0;
Guido van Rossum58132c61997-12-17 00:24:13 +00002254unsigned int brastackptr = 0;
2255uschar *code;
2256const uschar *ptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002257
2258#ifdef DEBUG
2259uschar *code_base, *code_end;
2260#endif
2261
Guido van Rossum50700601997-12-08 17:15:20 +00002262/* We can't pass back an error message if errorptr is NULL; I guess the best we
2263can do is just return NULL. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002264
Guido van Rossum50700601997-12-08 17:15:20 +00002265if (errorptr == NULL) return NULL;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002266*errorptr = NULL;
Guido van Rossum50700601997-12-08 17:15:20 +00002267
2268/* However, we can give a message for this error */
2269
2270if (erroroffset == NULL)
2271 {
2272 *errorptr = ERR16;
2273 return NULL;
2274 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002275*erroroffset = 0;
2276
2277if ((options & ~PUBLIC_OPTIONS) != 0)
2278 {
Guido van Rossum50700601997-12-08 17:15:20 +00002279 *errorptr = ERR17;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002280 return NULL;
2281 }
2282
Guido van Rossum557dea11997-12-22 22:46:52 +00002283DPRINTF(("------------------------------------------------------------------\n"));
2284DPRINTF(("%s\n", pattern));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002285
2286/* The first thing to do is to make a pass over the pattern to compute the
2287amount of store required to hold the compiled code. This does not have to be
2288perfect as long as errors are overestimates. At the same time we can detect any
2289internal flag settings. Make an attempt to correct for any counted white space
2290if an "extended" flag setting appears late in the pattern. We can't be so
2291clever for #-comments. */
2292
Guido van Rossum58132c61997-12-17 00:24:13 +00002293ptr = (const uschar *)(pattern - 1);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002294while ((c = *(++ptr)) != 0)
2295 {
Guido van Rossum50700601997-12-08 17:15:20 +00002296 int min, max;
2297 int class_charcount;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002298
2299 if ((pcre_ctypes[c] & ctype_space) != 0)
2300 {
Guido van Rossum50700601997-12-08 17:15:20 +00002301 if ((options & PCRE_EXTENDED) != 0) continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002302 spaces++;
2303 }
2304
Guido van Rossum50700601997-12-08 17:15:20 +00002305 if (c == '#' && (options & PCRE_EXTENDED) != 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002306 {
2307 while ((c = *(++ptr)) != 0 && c != '\n');
2308 continue;
2309 }
2310
2311 switch(c)
2312 {
2313 /* A backslashed item may be an escaped "normal" character or a
2314 character type. For a "normal" character, put the pointers and
2315 character back so that tests for whitespace etc. in the input
2316 are done correctly. */
2317
2318 case '\\':
2319 {
Guido van Rossum58132c61997-12-17 00:24:13 +00002320 const uschar *save_ptr = ptr;
Guido van Rossum50700601997-12-08 17:15:20 +00002321 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002322 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2323 if (c >= 0)
2324 {
2325 ptr = save_ptr;
2326 c = '\\';
2327 goto NORMAL_CHAR;
2328 }
2329 }
2330 length++;
2331
2332 /* A back reference needs an additional char, plus either one or 5
Guido van Rossum50700601997-12-08 17:15:20 +00002333 bytes for a repeat. We also need to keep the value of the highest
2334 back reference. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002335
2336 if (c <= -ESC_REF)
2337 {
Guido van Rossum50700601997-12-08 17:15:20 +00002338 int refnum = -c - ESC_REF;
2339 if (refnum > top_backref) top_backref = refnum;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002340 length++; /* For single back reference */
Guido van Rossum50700601997-12-08 17:15:20 +00002341 if (ptr[1] == '{' && is_counted_repeat(ptr+2))
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002342 {
2343 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
2344 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2345 if ((min == 0 && (max == 1 || max == -1)) ||
2346 (min == 1 && max == -1))
2347 length++;
2348 else length += 5;
2349 if (ptr[1] == '?') ptr++;
2350 }
2351 }
2352 continue;
2353
2354 case '^':
2355 case '.':
2356 case '$':
2357 case '*': /* These repeats won't be after brackets; */
2358 case '+': /* those are handled separately */
2359 case '?':
2360 length++;
2361 continue;
2362
2363 /* This covers the cases of repeats after a single char, metachar, class,
2364 or back reference. */
2365
2366 case '{':
Guido van Rossum50700601997-12-08 17:15:20 +00002367 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002368 ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);
2369 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2370 if ((min == 0 && (max == 1 || max == -1)) ||
2371 (min == 1 && max == -1))
2372 length++;
2373 else
2374 {
2375 length--; /* Uncount the original char or metachar */
2376 if (min == 1) length++; else if (min > 0) length += 4;
2377 if (max > 0) length += 4; else length += 2;
2378 }
2379 if (ptr[1] == '?') ptr++;
2380 continue;
2381
2382 /* An alternation contains an offset to the next branch or ket. */
2383 case '|':
2384 length += 3;
2385 continue;
2386
Guido van Rossum50700601997-12-08 17:15:20 +00002387 /* A character class uses 33 characters. Don't worry about character types
2388 that aren't allowed in classes - they'll get picked up during the compile.
2389 A character class that contains only one character uses 2 or 3 bytes,
2390 depending on whether it is negated or not. Notice this where we can. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002391
2392 case '[':
Guido van Rossum50700601997-12-08 17:15:20 +00002393 class_charcount = 0;
2394 if (*(++ptr) == '^') ptr++;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002395 do
2396 {
Guido van Rossum50700601997-12-08 17:15:20 +00002397 if (*ptr == '\\')
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002398 {
Guido van Rossum557dea11997-12-22 22:46:52 +00002399 int ch = check_escape(&ptr, errorptr, bracount, options, TRUE);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002400 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
Guido van Rossum557dea11997-12-22 22:46:52 +00002401 if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002402 }
Guido van Rossum50700601997-12-08 17:15:20 +00002403 else class_charcount++;
2404 ptr++;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002405 }
2406 while (*ptr != 0 && *ptr != ']');
2407
Guido van Rossum50700601997-12-08 17:15:20 +00002408 /* Repeats for negated single chars are handled by the general code */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002409
Guido van Rossum50700601997-12-08 17:15:20 +00002410 if (class_charcount == 1) length += 3; else
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002411 {
Guido van Rossum50700601997-12-08 17:15:20 +00002412 length += 33;
2413 if (options & PCRE_LOCALE) length++; /* Add a byte for the localization flag */
2414
2415 /* A repeat needs either 1 or 5 bytes. */
2416
Guido van Rossum557dea11997-12-22 22:46:52 +00002417 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))
Guido van Rossum50700601997-12-08 17:15:20 +00002418 {
2419 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
2420 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2421 if ((min == 0 && (max == 1 || max == -1)) ||
2422 (min == 1 && max == -1))
2423 length++;
2424 else length += 5;
2425 if (ptr[1] == '?') ptr++;
2426 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002427 }
2428 continue;
2429
2430 /* Brackets may be genuine groups or special things */
2431
2432 case '(':
2433
2434 /* Handle special forms of bracket, which all start (? */
2435
2436 if (ptr[1] == '?') switch (c = ptr[2])
2437 {
2438 /* Skip over comments entirely */
2439 case '#':
2440 ptr += 3;
2441 while (*ptr != 0 && *ptr != ')') ptr++;
2442 if (*ptr == 0)
2443 {
Guido van Rossum50700601997-12-08 17:15:20 +00002444 *errorptr = ERR18;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002445 goto PCRE_ERROR_RETURN;
2446 }
2447 continue;
2448
2449 /* Non-referencing groups and lookaheads just move the pointer on, and
Guido van Rossum50700601997-12-08 17:15:20 +00002450 then behave like a non-special bracket, except that they don't increment
2451 the count of extracting brackets. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002452
2453 case ':':
2454 case '=':
2455 case '!':
2456 ptr += 2;
2457 break;
2458
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002459 case ('P'):
2460 {
2461 int idlen;
2462 switch (*ptr++) {
2463 case ('<'):
2464 idlen = get_group_id(ptr++, '>', errorptr);
2465 if (*errorptr) goto PCRE_ERROR_RETURN;
2466 ptr += idlen+1;
2467 break;
2468 case ('='):
2469 idlen = get_group_id(ptr++, ')', errorptr);
2470 if (*errorptr) goto PCRE_ERROR_RETURN;
2471 ptr += idlen+1;
2472 length++;
2473 break;
2474 }
2475 }
2476 break;
2477
Guido van Rossum50700601997-12-08 17:15:20 +00002478 /* Ditto for the "once only" bracket, allowed only if the extra bit
2479 is set. */
2480
2481 case '>':
2482 if ((options & PCRE_EXTRA) != 0)
2483 {
2484 ptr += 2;
2485 break;
2486 }
2487 /* Else fall thourh */
2488
2489 /* Else loop setting valid options until ) is met. Anything else is an
2490 error. */
2491
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002492 default:
2493 ptr += 2;
2494 for (;; ptr++)
2495 {
2496 if ((c = *ptr) == 'i')
2497 {
2498 options |= PCRE_CASELESS;
2499 continue;
2500 }
Guido van Rossumbd49ac41997-12-10 23:05:53 +00002501 else if ((c = *ptr) == 'L')
Guido van Rossum50700601997-12-08 17:15:20 +00002502 {
2503 options |= PCRE_LOCALE;
2504 continue;
2505 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002506 else if ((c = *ptr) == 'm')
2507 {
2508 options |= PCRE_MULTILINE;
2509 continue;
2510 }
Guido van Rossum50700601997-12-08 17:15:20 +00002511 else if (c == 's')
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002512 {
2513 options |= PCRE_DOTALL;
2514 continue;
2515 }
2516 else if (c == 'x')
2517 {
2518 options |= PCRE_EXTENDED;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002519 length -= spaces; /* Already counted spaces */
2520 continue;
2521 }
2522 else if (c == ')') break;
2523
Guido van Rossum50700601997-12-08 17:15:20 +00002524 *errorptr = ERR12;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002525 goto PCRE_ERROR_RETURN;
2526 }
2527 continue; /* End of this bracket handling */
2528 }
2529
Guido van Rossum50700601997-12-08 17:15:20 +00002530 /* Extracting brackets must be counted so we can process escapes in a
2531 Perlish way. */
2532
2533 else bracount++;
2534
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002535 /* Non-special forms of bracket. Save length for computing whole length
2536 at end if there's a repeat that requires duplication of the group. */
2537
2538 if (brastackptr >= sizeof(brastack)/sizeof(int))
2539 {
Guido van Rossum50700601997-12-08 17:15:20 +00002540 *errorptr = ERR19;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002541 goto PCRE_ERROR_RETURN;
2542 }
2543
2544 brastack[brastackptr++] = length;
2545 length += 3;
2546 continue;
2547
2548 /* Handle ket. Look for subsequent max/min; for certain sets of values we
Guido van Rossum557dea11997-12-22 22:46:52 +00002549 have to replicate this bracket up to that many times. If brastackptr is
2550 0 this is an unmatched bracket which will generate an error, but take care
2551 not to try to access brastack[-1]. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002552
2553 case ')':
2554 length += 3;
2555 {
Guido van Rossum557dea11997-12-22 22:46:52 +00002556 int minval = 1;
2557 int maxval = 1;
2558 int duplength = (brastackptr > 0)? length - brastack[--brastackptr] : 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002559
2560 /* Leave ptr at the final char; for read_repeat_counts this happens
2561 automatically; for the others we need an increment. */
2562
Guido van Rossum50700601997-12-08 17:15:20 +00002563 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002564 {
Guido van Rossum557dea11997-12-22 22:46:52 +00002565 ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002566 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2567 }
Guido van Rossum557dea11997-12-22 22:46:52 +00002568 else if (c == '*') { minval = 0; maxval = -1; ptr++; }
2569 else if (c == '+') { maxval = -1; ptr++; }
2570 else if (c == '?') { minval = 0; ptr++; }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002571
Guido van Rossum557dea11997-12-22 22:46:52 +00002572 /* If there is a minimum > 1 we have to replicate up to minval-1 times;
2573 if there is a limited maximum we have to replicate up to maxval-1 times
2574 and allow for a BRAZERO item before each optional copy, as we also have
2575 to do before the first copy if the minimum is zero. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002576
Guido van Rossum557dea11997-12-22 22:46:52 +00002577 if (minval == 0) length++;
2578 else if (minval > 1) length += (minval - 1) * duplength;
2579 if (maxval > minval) length += (maxval - minval) * (duplength + 1);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002580 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002581 continue;
2582
2583 /* Non-special character. For a run of such characters the length required
2584 is the number of characters + 2, except that the maximum run length is 255.
2585 We won't get a skipped space or a non-data escape or the start of a #
2586 comment as the first character, so the length can't be zero. */
2587
2588 NORMAL_CHAR:
2589 default:
2590 length += 2;
2591 runlength = 0;
2592 do
2593 {
2594 if ((pcre_ctypes[c] & ctype_space) != 0)
2595 {
Guido van Rossum50700601997-12-08 17:15:20 +00002596 if ((options & PCRE_EXTENDED) != 0) continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002597 spaces++;
2598 }
2599
Guido van Rossum50700601997-12-08 17:15:20 +00002600 if (c == '#' && (options & PCRE_EXTENDED) != 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002601 {
2602 while ((c = *(++ptr)) != 0 && c != '\n');
2603 continue;
2604 }
2605
2606 /* Backslash may introduce a data char or a metacharacter; stop the
2607 string before the latter. */
2608
2609 if (c == '\\')
2610 {
Guido van Rossum58132c61997-12-17 00:24:13 +00002611 const uschar *saveptr = ptr;
Guido van Rossum50700601997-12-08 17:15:20 +00002612 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002613 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2614 if (c < 0) { ptr = saveptr; break; }
2615 }
2616
2617 /* Ordinary character or single-char escape */
2618
2619 runlength++;
2620 }
2621
2622 /* This "while" is the end of the "do" above. */
2623
2624 while (runlength < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
2625
2626 ptr--;
2627 length += runlength;
2628 continue;
2629 }
2630 }
2631
2632length += 4; /* For final KET and END */
2633
2634if (length > 65539)
2635 {
Guido van Rossum50700601997-12-08 17:15:20 +00002636 *errorptr = ERR20;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002637 return NULL;
2638 }
2639
2640/* Compute the size of data block needed and get it, either from malloc or
Guido van Rossum557dea11997-12-22 22:46:52 +00002641externally provided function. We specify "code[0]" in the offsetof() expression
2642rather than just "code", because it has been reported that one broken compiler
2643fails on "code" because it is also an independent variable. It should make no
2644difference to the value of the offsetof(). */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002645
Guido van Rossum557dea11997-12-22 22:46:52 +00002646size = length + offsetof(real_pcre, code[0]);
Guido van Rossum50700601997-12-08 17:15:20 +00002647re = (real_pcre *)(pcre_malloc)(size+50);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002648
2649if (re == NULL)
2650 {
Guido van Rossum50700601997-12-08 17:15:20 +00002651 *errorptr = ERR21;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002652 return NULL;
2653 }
2654
Guido van Rossum557dea11997-12-22 22:46:52 +00002655/* Put in the magic number and the options. */
2656
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002657re->magic_number = MAGIC_NUMBER;
2658re->options = options;
2659
2660/* Set up a starting, non-extracting bracket, then compile the expression. On
2661error, *errorptr will be set non-NULL, so we don't need to look at the result
2662of the function here. */
2663
Guido van Rossum58132c61997-12-17 00:24:13 +00002664ptr = (const uschar *)pattern;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002665code = re->code;
2666*code = OP_BRA;
Guido van Rossum50700601997-12-08 17:15:20 +00002667bracount = 0;
2668(void)compile_regex(options, &bracount, &code, &ptr, errorptr, dictionary);
2669re->top_bracket = bracount;
2670re->top_backref = top_backref;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002671
2672/* If not reached end of pattern on success, there's an excess bracket. */
2673
Guido van Rossum50700601997-12-08 17:15:20 +00002674if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
2675
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002676/* Fill in the terminating state and check for disastrous overflow, but
2677if debugging, leave the test till after things are printed out. */
2678
2679*code++ = OP_END;
2680
Guido van Rossum50700601997-12-08 17:15:20 +00002681
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002682#ifndef DEBUG
Guido van Rossum50700601997-12-08 17:15:20 +00002683if (code - re->code > length) *errorptr = ERR23;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002684#endif
2685
2686/* Failed to compile */
2687
2688if (*errorptr != NULL)
2689 {
2690 (pcre_free)(re);
2691 PCRE_ERROR_RETURN:
Guido van Rossum58132c61997-12-17 00:24:13 +00002692 *erroroffset = ptr - (const uschar *)pattern;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002693 return NULL;
2694 }
2695
2696/* If the anchored option was not passed, set flag if we can determine that it
2697is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if
2698we can determine what the first character has to be, because that speeds up
2699unanchored matches no end. In the case of multiline matches, an alternative is
2700to set the PCRE_STARTLINE flag if all branches start with ^. */
2701
2702if ((options & PCRE_ANCHORED) == 0)
2703 {
2704 if (is_anchored(re->code, (options & PCRE_MULTILINE) != 0))
2705 re->options |= PCRE_ANCHORED;
2706 else
2707 {
Guido van Rossum557dea11997-12-22 22:46:52 +00002708 int ch = find_firstchar(re->code);
2709 if (ch >= 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002710 {
Guido van Rossum557dea11997-12-22 22:46:52 +00002711 re->first_char = ch;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002712 re->options |= PCRE_FIRSTSET;
2713 }
2714 else if (is_startline(re->code))
2715 re->options |= PCRE_STARTLINE;
2716 }
2717 }
2718
2719/* Print out the compiled data for debugging */
2720
2721#ifdef DEBUG
2722
Guido van Rossum50700601997-12-08 17:15:20 +00002723printf("Length = %d top_bracket = %d top_backref=%d\n",
2724 length, re->top_bracket, re->top_backref);
2725
2726if (re->options != 0)
2727 {
2728 printf("%s%s%s%s%s%s%s\n",
2729 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
2730 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
2731 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
2732 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
2733 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
2734 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
2735 ((re->options & PCRE_EXTRA) != 0)? "extra " : "");
2736 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002737
2738if ((re->options & PCRE_FIRSTSET) != 0)
2739 {
2740 if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
2741 else printf("First char = \\x%02x\n", re->first_char);
2742 }
2743
2744code_end = code;
2745code_base = code = re->code;
2746
2747while (code < code_end)
2748 {
2749 int charlength;
2750
2751 printf("%3d ", code - code_base);
2752
2753 if (*code >= OP_BRA)
2754 {
2755 printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
2756 code += 2;
2757 }
2758
2759 else switch(*code)
2760 {
2761 case OP_CHARS:
2762 charlength = *(++code);
2763 printf("%3d ", charlength);
2764 while (charlength-- > 0)
2765 if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
2766 break;
2767
2768 case OP_KETRMAX:
2769 case OP_KETRMIN:
2770 case OP_ALT:
2771 case OP_KET:
2772 case OP_ASSERT:
2773 case OP_ASSERT_NOT:
Guido van Rossum50700601997-12-08 17:15:20 +00002774 case OP_ONCE:
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002775 printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2776 code += 2;
2777 break;
2778
2779 case OP_STAR:
2780 case OP_MINSTAR:
2781 case OP_PLUS:
2782 case OP_MINPLUS:
2783 case OP_QUERY:
2784 case OP_MINQUERY:
2785 case OP_TYPESTAR:
2786 case OP_TYPEMINSTAR:
2787 case OP_TYPEPLUS:
2788 case OP_TYPEMINPLUS:
2789 case OP_TYPEQUERY:
2790 case OP_TYPEMINQUERY:
2791 if (*code >= OP_TYPESTAR)
2792 printf(" %s", OP_names[code[1]]);
2793 else if (isprint(c = code[1])) printf(" %c", c);
2794 else printf(" \\x%02x", c);
2795 printf("%s", OP_names[*code++]);
2796 break;
2797
2798 case OP_EXACT:
2799 case OP_UPTO:
2800 case OP_MINUPTO:
2801 if (isprint(c = code[3])) printf(" %c{", c);
2802 else printf(" \\x%02x{", c);
Guido van Rossum557dea11997-12-22 22:46:52 +00002803 if (*code != OP_EXACT) printf("0,");
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002804 printf("%d}", (code[1] << 8) + code[2]);
2805 if (*code == OP_MINUPTO) printf("?");
2806 code += 3;
2807 break;
2808
2809 case OP_TYPEEXACT:
2810 case OP_TYPEUPTO:
2811 case OP_TYPEMINUPTO:
2812 printf(" %s{", OP_names[code[3]]);
2813 if (*code != OP_TYPEEXACT) printf(",");
2814 printf("%d}", (code[1] << 8) + code[2]);
2815 if (*code == OP_TYPEMINUPTO) printf("?");
2816 code += 3;
2817 break;
2818
Guido van Rossum50700601997-12-08 17:15:20 +00002819 case OP_NOT:
2820 if (isprint(c = *(++code))) printf(" [^%c]", c);
2821 else printf(" [^\\x%02x]", c);
2822 break;
2823
2824 case OP_NOTSTAR:
2825 case OP_NOTMINSTAR:
2826 case OP_NOTPLUS:
2827 case OP_NOTMINPLUS:
2828 case OP_NOTQUERY:
2829 case OP_NOTMINQUERY:
2830 if (isprint(c = code[1])) printf(" [^%c]", c);
2831 else printf(" [^\\x%02x]", c);
2832 printf("%s", OP_names[*code++]);
2833 break;
2834
2835 case OP_NOTEXACT:
2836 case OP_NOTUPTO:
2837 case OP_NOTMINUPTO:
2838 if (isprint(c = code[3])) printf(" [^%c]{", c);
2839 else printf(" [^\\x%02x]{", c);
2840 if (*code != OP_NOTEXACT) printf(",");
2841 printf("%d}", (code[1] << 8) + code[2]);
2842 if (*code == OP_NOTMINUPTO) printf("?");
2843 code += 3;
2844 break;
2845
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002846 case OP_REF:
2847 printf(" \\%d", *(++code));
Guido van Rossum557dea11997-12-22 22:46:52 +00002848 code ++;
2849 goto CLASS_REF_REPEAT;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002850
2851 case OP_CLASS:
Guido van Rossum042ff9e1998-04-03 21:13:31 +00002852 case OP_NEGCLASS:
Guido van Rossum50700601997-12-08 17:15:20 +00002853 case OP_CLASS_L:
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002854 {
2855 int i, min, max;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002856
Guido van Rossum50700601997-12-08 17:15:20 +00002857 if (*code==OP_CLASS_L)
2858 {
2859 code++;
2860 printf("Locflag = %i ", *code++);
Guido van Rossum042ff9e1998-04-03 21:13:31 +00002861 printf(" [");
Guido van Rossum50700601997-12-08 17:15:20 +00002862 }
2863 else
Guido van Rossum042ff9e1998-04-03 21:13:31 +00002864 {
2865 if (*code++ == OP_CLASS) printf(" [");
2866 else printf(" ^[");
2867 }
2868
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002869
Guido van Rossum50700601997-12-08 17:15:20 +00002870 for (i = 0; i < 256; i++)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002871 {
Guido van Rossum50700601997-12-08 17:15:20 +00002872 if ((code[i/8] & (1 << (i&7))) != 0)
2873 {
2874 int j;
2875 for (j = i+1; j < 256; j++)
2876 if ((code[j/8] & (1 << (j&7))) == 0) break;
2877 if (i == '-' || i == ']') printf("\\");
2878 if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
2879 if (--j > i)
2880 {
2881 printf("-");
2882 if (j == '-' || j == ']') printf("\\");
2883 if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
2884 }
2885 i = j;
2886 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002887 }
2888 printf("]");
Guido van Rossum50700601997-12-08 17:15:20 +00002889 code += 32;
2890 /* code ++;*/
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002891
Guido van Rossum557dea11997-12-22 22:46:52 +00002892 CLASS_REF_REPEAT:
2893
Guido van Rossum50700601997-12-08 17:15:20 +00002894 switch(*code)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002895 {
2896 case OP_CRSTAR:
2897 case OP_CRMINSTAR:
2898 case OP_CRPLUS:
2899 case OP_CRMINPLUS:
2900 case OP_CRQUERY:
2901 case OP_CRMINQUERY:
2902 printf("%s", OP_names[*code]);
2903 break;
2904
2905 case OP_CRRANGE:
2906 case OP_CRMINRANGE:
2907 min = (code[1] << 8) + code[2];
2908 max = (code[3] << 8) + code[4];
2909 if (max == 0) printf("{%d,}", min);
2910 else printf("{%d,%d}", min, max);
2911 if (*code == OP_CRMINRANGE) printf("?");
2912 code += 4;
2913 break;
2914
2915 default:
2916 code--;
2917 }
2918 }
2919 break;
2920
2921 /* Anything else is just a one-node item */
2922
2923 default:
2924 printf(" %s", OP_names[*code]);
2925 break;
2926 }
2927
2928 code++;
2929 printf("\n");
2930 }
2931printf("------------------------------------------------------------------\n");
2932
2933/* This check is done here in the debugging case so that the code that
2934was compiled can be seen. */
2935
2936if (code - re->code > length)
2937 {
Guido van Rossum50700601997-12-08 17:15:20 +00002938 printf("length=%i, code length=%i\n", length, code-re->code);
2939 *errorptr = ERR23;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002940 (pcre_free)(re);
2941 *erroroffset = ptr - (uschar *)pattern;
2942 return NULL;
2943 }
2944#endif
2945
2946return (pcre *)re;
2947}
2948
2949
2950
2951/*************************************************
2952* Match a character type *
2953*************************************************/
2954
2955/* Not used in all the places it might be as it's sometimes faster
2956to put the code inline.
2957
2958Arguments:
2959 type the character type
2960 c the character
Guido van Rossum50700601997-12-08 17:15:20 +00002961 dotall the dotall flag
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002962
2963Returns: TRUE if character is of the type
2964*/
2965
2966static BOOL
2967match_type(int type, int c, BOOL dotall)
2968{
2969
2970#ifdef DEBUG
2971if (isprint(c)) printf("matching subject %c against ", c);
2972 else printf("matching subject \\x%02x against ", c);
2973printf("%s\n", OP_names[type]);
2974#endif
2975
2976switch(type)
2977 {
2978 case OP_ANY: return dotall || c != '\n';
2979 case OP_NOT_DIGIT: return (pcre_ctypes[c] & ctype_digit) == 0;
2980 case OP_DIGIT: return (pcre_ctypes[c] & ctype_digit) != 0;
2981 case OP_NOT_WHITESPACE: return (pcre_ctypes[c] & ctype_space) == 0;
2982 case OP_WHITESPACE: return (pcre_ctypes[c] & ctype_space) != 0;
2983 case OP_NOT_WORDCHAR: return (pcre_ctypes[c] & ctype_word) == 0;
2984 case OP_WORDCHAR: return (pcre_ctypes[c] & ctype_word) != 0;
Guido van Rossum58132c61997-12-17 00:24:13 +00002985 case OP_NOT_WORDCHAR_L: return (c!='_' && !isalnum(c));
2986 case OP_WORDCHAR_L: return (c=='_' || isalnum(c));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002987 }
2988return FALSE;
2989}
2990
2991
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002992
2993/*************************************************
2994* Match a back-reference *
2995*************************************************/
2996
2997/* If a back reference hasn't been set, the match fails.
2998
2999Arguments:
3000 number reference number
3001 eptr points into the subject
3002 length length to be matched
3003 md points to match data block
3004
3005Returns: TRUE if matched
3006*/
3007
3008static BOOL
Guido van Rossum58132c61997-12-17 00:24:13 +00003009match_ref(int number, register const uschar *eptr, int length, match_data *md)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003010{
Guido van Rossum58132c61997-12-17 00:24:13 +00003011const uschar *p = md->start_subject + md->offset_vector[number];
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003012
3013#ifdef DEBUG
3014if (eptr >= md->end_subject)
3015 printf("matching subject <null>");
3016else
3017 {
3018 printf("matching subject ");
3019 pchars(eptr, length, TRUE, md);
3020 }
3021printf(" against backref ");
3022pchars(p, length, FALSE, md);
3023printf("\n");
3024#endif
3025
3026/* Always fail if not enough characters left */
3027
3028if (length > md->end_subject - p) return FALSE;
3029
Guido van Rossum58132c61997-12-17 00:24:13 +00003030/* Separate the caseless case for speed */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003031
3032if (md->caseless)
3033 { while (length-- > 0) if (pcre_lcc[*p++] != pcre_lcc[*eptr++]) return FALSE; }
3034else
3035 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
3036
3037return TRUE;
3038}
3039
3040static int free_stack(match_data *md)
3041{
3042/* Free any stack space that was allocated by the call to match(). */
3043if (md->off_num) free(md->off_num);
3044if (md->offset_top) free(md->offset_top);
3045if (md->r1) free(md->r1);
3046if (md->r2) free(md->r2);
3047if (md->eptr) free(md->eptr);
Guido van Rossumc3861071997-10-08 02:07:40 +00003048if (md->ecode) free(md->ecode);
3049return 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003050}
3051
3052static int grow_stack(match_data *md)
3053{
Guido van Rossum50700601997-12-08 17:15:20 +00003054 if (md->length != 0)
3055 {
3056 md->length = md->length + md->length/2;
3057 }
3058 else
3059 {
3060 int string_len = md->end_subject - md->start_subject + 1;
3061 if (string_len < 80) {md->length = string_len; }
3062 else {md->length = 80;}
3063 }
3064 PyMem_RESIZE(md->offset_top, int, md->length);
Guido van Rossum58132c61997-12-17 00:24:13 +00003065 PyMem_RESIZE(md->eptr, const uschar *, md->length);
3066 PyMem_RESIZE(md->ecode, const uschar *, md->length);
Guido van Rossum50700601997-12-08 17:15:20 +00003067 PyMem_RESIZE(md->off_num, int, md->length);
3068 PyMem_RESIZE(md->r1, int, md->length);
3069 PyMem_RESIZE(md->r2, int, md->length);
3070 if (md->offset_top == NULL || md->eptr == NULL || md->ecode == NULL ||
3071 md->off_num == NULL || md->r1 == NULL || md->r2 == NULL)
3072 {
3073 PyErr_SetString(PyExc_MemoryError, "Can't increase failure stack for re operation");
3074 longjmp(md->error_env, 1);
3075 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003076 return 0;
3077}
3078
Guido van Rossum50700601997-12-08 17:15:20 +00003079
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003080/*************************************************
3081* Match from current position *
3082*************************************************/
3083
3084/* On entry ecode points to the first opcode, and eptr to the first character.
3085
3086Arguments:
3087 eptr pointer in subject
3088 ecode position in code
3089 offset_top current top pointer
3090 md pointer to "static" info for the match
3091
3092Returns: TRUE if matched
3093*/
3094
3095static BOOL
Guido van Rossum58132c61997-12-17 00:24:13 +00003096match(register const uschar *eptr, register const uschar *ecode, int offset_top,
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003097 match_data *md)
3098{
3099 int save_stack_position = md->point;
3100match_loop:
3101
3102#define SUCCEED goto succeed
3103#define FAIL goto fail
3104
3105for (;;)
3106 {
3107 int min, max, ctype;
3108 register int i;
3109 register int c;
Guido van Rossum58132c61997-12-17 00:24:13 +00003110 BOOL minimize = FALSE;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003111
3112 /* Opening bracket. Check the alternative branches in turn, failing if none
3113 match. We have to set the start offset if required and there is space
3114 in the offset vector so that it is available for subsequent back references
3115 if the bracket matches. However, if the bracket fails, we must put back the
3116 previous value of both offsets in case they were set by a previous copy of
3117 the same bracket. Don't worry about setting the flag for the error case here;
3118 that is handled in the code for KET. */
3119
3120 if ((int)*ecode >= OP_BRA)
3121 {
3122 int number = (*ecode - OP_BRA) << 1;
Guido van Rossum58132c61997-12-17 00:24:13 +00003123 int save_offset1 = 0, save_offset2 = 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003124
Guido van Rossum557dea11997-12-22 22:46:52 +00003125 DPRINTF(("start bracket %d\n", number/2));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003126
3127 if (number > 0 && number < md->offset_end)
3128 {
3129 save_offset1 = md->offset_vector[number];
3130 save_offset2 = md->offset_vector[number+1];
3131 md->offset_vector[number] = eptr - md->start_subject;
3132
Guido van Rossum557dea11997-12-22 22:46:52 +00003133 DPRINTF(("saving %d %d\n", save_offset1, save_offset2));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003134 }
3135
3136 /* Recurse for all the alternatives. */
3137
3138 do
3139 {
3140 if (match(eptr, ecode+3, offset_top, md)) SUCCEED;
3141 ecode += (ecode[1] << 8) + ecode[2];
3142 }
3143 while (*ecode == OP_ALT);
3144
Guido van Rossum557dea11997-12-22 22:46:52 +00003145 DPRINTF(("bracket %d failed\n", number/2));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003146
3147 if (number > 0 && number < md->offset_end)
3148 {
3149 md->offset_vector[number] = save_offset1;
3150 md->offset_vector[number+1] = save_offset2;
3151 }
3152
3153 FAIL;
3154 }
3155
3156 /* Other types of node can be handled by a switch */
3157
3158 switch(*ecode)
3159 {
3160 case OP_END:
3161 md->end_match_ptr = eptr; /* Record where we ended */
3162 md->end_offset_top = offset_top; /* and how many extracts were taken */
3163 SUCCEED;
3164
Guido van Rossum50700601997-12-08 17:15:20 +00003165 /* The equivalent of Prolog's "cut" - if the rest doesn't match, the
3166 whole thing doesn't match, so we have to get out via a longjmp(). */
3167
3168 case OP_CUT:
3169 if (match(eptr, ecode+1, offset_top, md)) SUCCEED;
3170 longjmp(md->fail_env, 1);
3171
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003172 /* Assertion brackets. Check the alternative branches in turn - the
3173 matching won't pass the KET for an assertion. If any one branch matches,
3174 the assertion is true. */
3175
3176 case OP_ASSERT:
3177 do
3178 {
3179 if (match(eptr, ecode+3, offset_top, md)) break;
3180 ecode += (ecode[1] << 8) + ecode[2];
3181 }
3182 while (*ecode == OP_ALT);
3183 if (*ecode == OP_KET) FAIL;
3184
3185 /* Continue from after the assertion, updating the offsets high water
3186 mark, since extracts may have been taken during the assertion. */
3187
3188 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3189 ecode += 3;
3190 offset_top = md->end_offset_top;
3191 continue;
3192
3193 /* Negative assertion: all branches must fail to match */
3194
3195 case OP_ASSERT_NOT:
3196 do
3197 {
3198 if (match(eptr, ecode+3, offset_top, md)) FAIL;
3199 ecode += (ecode[1] << 8) + ecode[2];
3200 }
3201 while (*ecode == OP_ALT);
3202 ecode += 3;
3203 continue;
3204
Guido van Rossum50700601997-12-08 17:15:20 +00003205 /* "Once" brackets are like assertion brackets except that after a match,
3206 the point in the subject string is not moved back. Thus there can never be
3207 a move back into the brackets. Check the alternative branches in turn - the
3208 matching won't pass the KET for this kind of subpattern. If any one branch
3209 matches, we carry on, leaving the subject pointer. */
3210
3211 case OP_ONCE:
3212 do
3213 {
3214 if (match(eptr, ecode+3, offset_top, md)) break;
3215 ecode += (ecode[1] << 8) + ecode[2];
3216 }
3217 while (*ecode == OP_ALT);
Guido van Rossum557dea11997-12-22 22:46:52 +00003218 if (*ecode == OP_KET) FAIL;
Guido van Rossum50700601997-12-08 17:15:20 +00003219
3220 /* Continue as from after the assertion, updating the offsets high water
3221 mark, since extracts may have been taken. */
3222
3223 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3224 ecode += 3;
3225 offset_top = md->end_offset_top;
3226 eptr = md->end_match_ptr;
3227 continue;
3228
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003229 /* An alternation is the end of a branch; scan along to find the end of the
3230 bracketed group and go to there. */
3231
3232 case OP_ALT:
3233 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3234 break;
3235
3236 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
3237 that it may occur zero times. It may repeat infinitely, or not at all -
3238 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
3239 repeat limits are compiled as a number of copies, with the optional ones
3240 preceded by BRAZERO or BRAMINZERO. */
3241
3242 case OP_BRAZERO:
3243 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003244 const uschar *next = ecode+1;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003245 if (match(eptr, next, offset_top, md)) SUCCEED;
3246 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3247 ecode = next + 3;
3248 }
3249 break;
3250
3251 case OP_BRAMINZERO:
3252 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003253 const uschar *next = ecode+1;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003254 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3255 if (match(eptr, next+3, offset_top, md)) SUCCEED;
3256 ecode++;
3257 }
3258 break;;
3259
3260 /* End of a group, repeated or non-repeating. If we are at the end of
3261 an assertion "group", stop matching and SUCCEED, but record the
3262 current high water mark for use by positive assertions. */
3263
3264 case OP_KET:
3265 case OP_KETRMIN:
3266 case OP_KETRMAX:
3267 {
Guido van Rossum50700601997-12-08 17:15:20 +00003268 int number;
Guido van Rossum58132c61997-12-17 00:24:13 +00003269 const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003270
Guido van Rossum50700601997-12-08 17:15:20 +00003271 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT || *prev == OP_ONCE)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003272 {
Guido van Rossum50700601997-12-08 17:15:20 +00003273 md->end_match_ptr = eptr; /* For ONCE */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003274 md->end_offset_top = offset_top;
3275 SUCCEED;
3276 }
3277
3278 /* In all other cases we have to check the group number back at the
3279 start and if necessary complete handling an extraction by setting the
3280 final offset and bumping the high water mark. */
3281
3282 number = (*prev - OP_BRA) << 1;
3283
Guido van Rossum557dea11997-12-22 22:46:52 +00003284 DPRINTF(("end bracket %d\n", number/2));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003285
3286 if (number > 0)
3287 {
3288 if (number >= md->offset_end) md->offset_overflow = TRUE; else
3289 {
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003290 md->offset_vector[number+1] = eptr - md->start_subject;
3291 if (offset_top <= number) offset_top = number + 2;
3292 }
3293 }
3294
3295 /* For a non-repeating ket, just advance to the next node and continue at
3296 this level. */
3297
3298 if (*ecode == OP_KET)
3299 {
3300 ecode += 3;
3301 break;
3302 }
3303
3304 /* The repeating kets try the rest of the pattern or restart from the
3305 preceding bracket, in the appropriate order. */
3306
3307 if (*ecode == OP_KETRMIN)
3308 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003309 const uschar *ptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003310 if (match(eptr, ecode+3, offset_top, md)) goto succeed;
3311 /* Handle alternation inside the BRA...KET; push the additional
Guido van Rossum58132c61997-12-17 00:24:13 +00003312 alternatives onto the stack */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003313 ptr=prev;
3314 do {
3315 ptr += (ptr[1]<<8)+ ptr[2];
3316 if (*ptr==OP_ALT)
3317 {
Guido van Rossum50700601997-12-08 17:15:20 +00003318 if (md->length == md->point)
3319 {
3320 grow_stack(md);
3321 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003322 md->offset_top[md->point] = offset_top;
3323 md->eptr[md->point] = eptr;
3324 md->ecode[md->point] = ptr+3;
3325 md->r1[md->point] = 0;
3326 md->r2[md->point] = 0;
3327 md->off_num[md->point] = 0;
3328 md->point++;
3329 }
3330 } while (*ptr==OP_ALT);
3331 ecode=prev+3; goto match_loop;
3332 }
3333 else /* OP_KETRMAX */
3334 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003335 const uschar *ptr;
3336 /*int points_pushed=0;*/
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003337
3338 /* Push one failure point, that will resume matching at the code after
3339 the KETRMAX opcode. */
Guido van Rossum50700601997-12-08 17:15:20 +00003340 if (md->length == md->point)
3341 {
3342 grow_stack(md);
3343 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003344 md->offset_top[md->point] = offset_top;
3345 md->eptr[md->point] = eptr;
3346 md->ecode[md->point] = ecode+3;
3347 md->r1[md->point] = md->offset_vector[number];
3348 md->r2[md->point] = md->offset_vector[number+1];
3349 md->off_num[md->point] = number;
3350 md->point++;
3351
3352 md->offset_vector[number] = eptr - md->start_subject;
3353 /* Handle alternation inside the BRA...KET; push each of the
Guido van Rossum58132c61997-12-17 00:24:13 +00003354 additional alternatives onto the stack */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003355 ptr=prev;
3356 do {
3357 ptr += (ptr[1]<<8)+ ptr[2];
3358 if (*ptr==OP_ALT)
3359 {
Guido van Rossum50700601997-12-08 17:15:20 +00003360 if (md->length == md->point)
3361 if (md->length == md->point)
3362 {
3363 grow_stack(md);
3364 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003365 md->offset_top[md->point] = offset_top;
3366 md->eptr[md->point] = eptr;
3367 md->ecode[md->point] = ptr+3;
3368 md->r1[md->point] = 0;
3369 md->r2[md->point] = 0;
3370 md->off_num[md->point] = 0;
3371 md->point++;
Guido van Rossum58132c61997-12-17 00:24:13 +00003372 /*points_pushed++;*/
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003373 }
3374 } while (*ptr==OP_ALT);
3375 /* Jump to the first (or only) alternative and resume trying to match */
3376 ecode=prev+3; goto match_loop;
3377 }
3378 }
Guido van Rossum58132c61997-12-17 00:24:13 +00003379 break;
3380
Guido van Rossum50700601997-12-08 17:15:20 +00003381 /* Start of subject unless notbol, or after internal newline if multiline */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003382
3383 case OP_CIRC:
Guido van Rossum50700601997-12-08 17:15:20 +00003384 if (md->notbol && eptr == md->start_subject) FAIL;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003385 if (md->multiline)
3386 {
3387 if (eptr != md->start_subject && eptr[-1] != '\n') FAIL;
3388 ecode++;
3389 break;
3390 }
3391 /* ... else fall through */
3392
3393 /* Start of subject assertion */
3394
3395 case OP_SOD:
3396 if (eptr != md->start_subject) FAIL;
3397 ecode++;
3398 break;
3399
Guido van Rossum50700601997-12-08 17:15:20 +00003400 /* Assert before internal newline if multiline, or before
3401 a terminating newline unless endonly is set, else end of subject unless
3402 noteol is set. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003403
3404 case OP_DOLL:
Guido van Rossum50700601997-12-08 17:15:20 +00003405 if (md->noteol && eptr >= md->end_subject) FAIL;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003406 if (md->multiline)
3407 {
3408 if (eptr < md->end_subject && *eptr != '\n') FAIL;
3409 ecode++;
3410 break;
3411 }
Guido van Rossum50700601997-12-08 17:15:20 +00003412 else if (!md->endonly)
3413 {
3414 if (eptr < md->end_subject - 1 ||
3415 (eptr == md->end_subject - 1 && *eptr != '\n')) FAIL;
3416 ecode++;
3417 break;
3418 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003419 /* ... else fall through */
3420
3421 /* End of subject assertion */
3422
3423 case OP_EOD:
3424 if (eptr < md->end_subject) FAIL;
3425 ecode++;
3426 break;
3427
3428 /* Word boundary assertions */
3429
3430 case OP_NOT_WORD_BOUNDARY:
3431 case OP_WORD_BOUNDARY:
3432 {
3433 BOOL prev_is_word = (eptr != md->start_subject) &&
3434 ((pcre_ctypes[eptr[-1]] & ctype_word) != 0);
3435 BOOL cur_is_word = (eptr < md->end_subject) &&
3436 ((pcre_ctypes[*eptr] & ctype_word) != 0);
3437 if ((*ecode++ == OP_WORD_BOUNDARY)?
3438 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3439 FAIL;
3440 }
3441 break;
3442
Guido van Rossum50700601997-12-08 17:15:20 +00003443 case OP_NOT_WORD_BOUNDARY_L:
3444 case OP_WORD_BOUNDARY_L:
3445 {
3446 BOOL prev_is_word = (eptr != md->start_subject) &&
Guido van Rossum58132c61997-12-17 00:24:13 +00003447 (isalnum(eptr[-1]) || eptr[-1]=='_');
Guido van Rossum50700601997-12-08 17:15:20 +00003448 BOOL cur_is_word = (eptr < md->end_subject) &&
Guido van Rossum58132c61997-12-17 00:24:13 +00003449 (isalnum(*eptr) || *eptr=='_');
Guido van Rossum50700601997-12-08 17:15:20 +00003450 if ((*ecode++ == OP_WORD_BOUNDARY_L)?
3451 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3452 FAIL;
3453 }
3454 break;
3455
3456
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003457 /* Match a single character type; inline for speed */
3458
3459 case OP_ANY:
3460 if (!md->dotall && eptr < md->end_subject && *eptr == '\n') FAIL;
3461 if (eptr++ >= md->end_subject) FAIL;
3462 ecode++;
3463 break;
3464
3465 case OP_NOT_DIGIT:
3466 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) != 0)
3467 FAIL;
3468 ecode++;
3469 break;
3470
3471 case OP_DIGIT:
3472 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) == 0)
3473 FAIL;
3474 ecode++;
3475 break;
3476
3477 case OP_NOT_WHITESPACE:
3478 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) != 0)
3479 FAIL;
3480 ecode++;
3481 break;
3482
3483 case OP_WHITESPACE:
3484 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) == 0)
3485 FAIL;
3486 ecode++;
3487 break;
3488
3489 case OP_NOT_WORDCHAR:
3490 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) != 0)
3491 FAIL;
3492 ecode++;
3493 break;
3494
3495 case OP_WORDCHAR:
3496 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) == 0)
3497 FAIL;
3498 ecode++;
3499 break;
3500
Guido van Rossum50700601997-12-08 17:15:20 +00003501 case OP_NOT_WORDCHAR_L:
Guido van Rossum58132c61997-12-17 00:24:13 +00003502 if (eptr >= md->end_subject || (*eptr=='_' || isalnum(*eptr) ))
Guido van Rossum557dea11997-12-22 22:46:52 +00003503 FAIL;
Guido van Rossum50700601997-12-08 17:15:20 +00003504 eptr++;
3505 ecode++;
3506 break;
3507
3508 case OP_WORDCHAR_L:
Guido van Rossum58132c61997-12-17 00:24:13 +00003509 if (eptr >= md->end_subject || (*eptr!='_' && !isalnum(*eptr) ))
Guido van Rossum557dea11997-12-22 22:46:52 +00003510 FAIL;
Guido van Rossum50700601997-12-08 17:15:20 +00003511 eptr++;
3512 ecode++;
3513 break;
3514
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003515 /* Match a back reference, possibly repeatedly. Look past the end of the
3516 item to see if there is repeat information following. The code is similar
3517 to that for character classes, but repeated for efficiency. Then obey
3518 similar code to character type repeats - written out again for speed.
3519 However, if the referenced string is the empty string, always treat
3520 it as matched, any number of times (otherwise there could be infinite
3521 loops). */
3522
3523 case OP_REF:
3524 {
3525 int length;
3526 int number = ecode[1] << 1; /* Doubled reference number */
3527 ecode += 2; /* Advance past the item */
3528
3529 if (number >= offset_top || md->offset_vector[number] < 0)
3530 {
3531 md->errorcode = PCRE_ERROR_BADREF;
3532 FAIL;
3533 }
3534
3535 length = md->offset_vector[number+1] - md->offset_vector[number];
3536
3537 switch (*ecode)
3538 {
3539 case OP_CRSTAR:
3540 case OP_CRMINSTAR:
3541 case OP_CRPLUS:
3542 case OP_CRMINPLUS:
3543 case OP_CRQUERY:
3544 case OP_CRMINQUERY:
3545 c = *ecode++ - OP_CRSTAR;
3546 minimize = (c & 1) != 0;
3547 min = rep_min[c]; /* Pick up values from tables; */
3548 max = rep_max[c]; /* zero for max => infinity */
3549 if (max == 0) max = INT_MAX;
3550 break;
3551
3552 case OP_CRRANGE:
3553 case OP_CRMINRANGE:
3554 minimize = (*ecode == OP_CRMINRANGE);
3555 min = (ecode[1] << 8) + ecode[2];
3556 max = (ecode[3] << 8) + ecode[4];
3557 if (max == 0) max = INT_MAX;
3558 ecode += 5;
3559 break;
3560
3561 default: /* No repeat follows */
3562 if (!match_ref(number, eptr, length, md)) FAIL;
3563 eptr += length;
3564 continue; /* With the main loop */
3565 }
3566
3567 /* If the length of the reference is zero, just continue with the
3568 main loop. */
3569
3570 if (length == 0) continue;
3571
3572 /* First, ensure the minimum number of matches are present. We get back
3573 the length of the reference string explicitly rather than passing the
3574 address of eptr, so that eptr can be a register variable. */
3575
3576 for (i = 1; i <= min; i++)
3577 {
3578 if (!match_ref(number, eptr, length, md)) FAIL;
3579 eptr += length;
3580 }
3581
3582 /* If min = max, continue at the same level without recursion.
3583 They are not both allowed to be zero. */
3584
3585 if (min == max) continue;
3586
3587 /* If minimizing, keep trying and advancing the pointer */
3588
3589 if (minimize)
3590 {
3591 for (i = min;; i++)
3592 {
3593 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3594 if (i >= max || !match_ref(number, eptr, length, md))
3595 FAIL;
3596 eptr += length;
3597 }
3598 /* Control never gets here */
3599 }
3600
3601 /* If maximizing, find the longest string and work backwards */
3602
3603 else
3604 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003605 const uschar *pp = eptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003606 for (i = min; i < max; i++)
3607 {
3608 if (!match_ref(number, eptr, length, md)) break;
3609 eptr += length;
3610 }
3611 while (eptr >= pp)
3612 {
3613 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3614 eptr -= length;
3615 }
3616 FAIL;
3617 }
3618 }
3619 /* Control never gets here */
3620
3621 /* Match a character class, possibly repeatedly. Look past the end of the
3622 item to see if there is repeat information following. Then obey similar
Guido van Rossum50700601997-12-08 17:15:20 +00003623 code to character type repeats - written out again for speed. If caseless
3624 matching was set at runtime but not at compile time, we have to check both
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003625 versions of a character, and we have to behave differently for positive and
3626 negative classes. This is the only time where OP_CLASS and OP_NEGCLASS are
3627 treated differently. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003628
3629 case OP_CLASS:
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003630 case OP_NEGCLASS:
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003631 {
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003632 BOOL nasty_case = *ecode == OP_NEGCLASS && md->runtime_caseless;
Guido van Rossum58132c61997-12-17 00:24:13 +00003633 const uschar *data = ecode + 1; /* Save for matching */
3634 ecode += 33; /* Advance past the item */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003635
3636 switch (*ecode)
3637 {
3638 case OP_CRSTAR:
3639 case OP_CRMINSTAR:
3640 case OP_CRPLUS:
3641 case OP_CRMINPLUS:
3642 case OP_CRQUERY:
3643 case OP_CRMINQUERY:
3644 c = *ecode++ - OP_CRSTAR;
3645 minimize = (c & 1) != 0;
3646 min = rep_min[c]; /* Pick up values from tables; */
3647 max = rep_max[c]; /* zero for max => infinity */
3648 if (max == 0) max = INT_MAX;
3649 break;
3650
3651 case OP_CRRANGE:
3652 case OP_CRMINRANGE:
3653 minimize = (*ecode == OP_CRMINRANGE);
3654 min = (ecode[1] << 8) + ecode[2];
3655 max = (ecode[3] << 8) + ecode[4];
3656 if (max == 0) max = INT_MAX;
3657 ecode += 5;
3658 break;
3659
3660 default: /* No repeat follows */
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003661 min = max = 1;
3662 break;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003663 }
3664
3665 /* First, ensure the minimum number of matches are present. */
3666
3667 for (i = 1; i <= min; i++)
Guido van Rossum50700601997-12-08 17:15:20 +00003668 {
3669 if (eptr >= md->end_subject) FAIL;
3670 c = *eptr++;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003671
3672 /* Either not runtime caseless, or it was a positive class. For
3673 runtime caseless, continue if either case is in the map. */
3674
3675 if (!nasty_case)
Guido van Rossum50700601997-12-08 17:15:20 +00003676 {
Guido van Rossum50700601997-12-08 17:15:20 +00003677 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003678 if (md->runtime_caseless)
3679 {
3680 c = pcre_fcc[c];
3681 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3682 }
Guido van Rossum50700601997-12-08 17:15:20 +00003683 }
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003684
3685 /* Runtime caseless and it was a negative class. Continue only if
3686 both cases are in the map. */
3687
3688 else
3689 {
3690 if ((data[c/8] & (1 << (c&7))) == 0) FAIL;
3691 c = pcre_fcc[c];
3692 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3693 }
3694
3695 FAIL;
Guido van Rossum50700601997-12-08 17:15:20 +00003696 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003697
3698 /* If max == min we can continue with the main loop without the
3699 need to recurse. */
3700
3701 if (min == max) continue;
3702
3703 /* If minimizing, keep testing the rest of the expression and advancing
3704 the pointer while it matches the class. */
3705
3706 if (minimize)
3707 {
3708 for (i = min;; i++)
3709 {
3710 if (match(eptr, ecode, offset_top, md)) SUCCEED;
Guido van Rossum50700601997-12-08 17:15:20 +00003711 if (i >= max || eptr >= md->end_subject) FAIL;
3712 c = *eptr++;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003713
3714 /* Either not runtime caseless, or it was a positive class. For
3715 runtime caseless, continue if either case is in the map. */
3716
3717 if (!nasty_case)
Guido van Rossum50700601997-12-08 17:15:20 +00003718 {
Guido van Rossum50700601997-12-08 17:15:20 +00003719 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003720 if (md->runtime_caseless)
3721 {
3722 c = pcre_fcc[c];
3723 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3724 }
Guido van Rossum50700601997-12-08 17:15:20 +00003725 }
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003726
3727 /* Runtime caseless and it was a negative class. Continue only if
3728 both cases are in the map. */
3729
3730 else
3731 {
3732 if ((data[c/8] & (1 << (c&7))) == 0) return FALSE;
3733 c = pcre_fcc[c];
3734 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3735 }
3736
Guido van Rossum50700601997-12-08 17:15:20 +00003737 FAIL;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003738 }
3739 /* Control never gets here */
3740 }
3741
3742 /* If maximizing, find the longest possible run, then work backwards. */
3743
3744 else
3745 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003746 const uschar *pp = eptr;
Guido van Rossum50700601997-12-08 17:15:20 +00003747 for (i = min; i < max; eptr++, i++)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003748 {
Guido van Rossum50700601997-12-08 17:15:20 +00003749 if (eptr >= md->end_subject) break;
3750 c = *eptr;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003751
3752 /* Either not runtime caseless, or it was a positive class. For
3753 runtime caseless, continue if either case is in the map. */
3754
3755 if (!nasty_case)
Guido van Rossum50700601997-12-08 17:15:20 +00003756 {
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003757 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3758 if (md->runtime_caseless)
3759 {
3760 c = pcre_fcc[c];
3761 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3762 }
3763 }
3764
3765 /* Runtime caseless and it was a negative class. Continue only if
3766 both cases are in the map. */
3767
3768 else
3769 {
3770 if ((data[c/8] & (1 << (c&7))) == 0) break;
Guido van Rossum50700601997-12-08 17:15:20 +00003771 c = pcre_fcc[c];
3772 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3773 }
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003774
Guido van Rossum50700601997-12-08 17:15:20 +00003775 break;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003776 }
Guido van Rossum50700601997-12-08 17:15:20 +00003777
3778 while (eptr >= pp)
3779 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
3780 FAIL;
3781 }
3782 }
3783 /* Control never gets here */
3784
3785 /* OP_CLASS_L opcode: handles localized character classes */
3786
3787 case OP_CLASS_L:
3788 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003789 const uschar *data = ecode + 1; /* Save for matching */
3790 const uschar locale_flag = *data;
Guido van Rossum50700601997-12-08 17:15:20 +00003791 ecode++; data++; /* The localization support adds an extra byte */
3792
3793 ecode += 33; /* Advance past the item */
3794
3795 switch (*ecode)
3796 {
3797 case OP_CRSTAR:
3798 case OP_CRMINSTAR:
3799 case OP_CRPLUS:
3800 case OP_CRMINPLUS:
3801 case OP_CRQUERY:
3802 case OP_CRMINQUERY:
3803 c = *ecode++ - OP_CRSTAR;
3804 minimize = (c & 1) != 0;
3805 min = rep_min[c]; /* Pick up values from tables; */
3806 max = rep_max[c]; /* zero for max => infinity */
3807 if (max == 0) max = INT_MAX;
3808 break;
3809
3810 case OP_CRRANGE:
3811 case OP_CRMINRANGE:
3812 minimize = (*ecode == OP_CRMINRANGE);
3813 min = (ecode[1] << 8) + ecode[2];
3814 max = (ecode[3] << 8) + ecode[4];
3815 if (max == 0) max = INT_MAX;
3816 ecode += 5;
3817 break;
3818
3819 default: /* No repeat follows */
3820 if (eptr >= md->end_subject) FAIL;
3821 c = *eptr++;
3822 if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */
Guido van Rossum58132c61997-12-17 00:24:13 +00003823 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3824 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003825#if 0
3826 if ( (locale_flag & 4) && isdigit(c) ) continue; /* Locale \d */
3827 if ( (locale_flag & 8) && !isdigit(c) ) continue; /* Locale \D */
3828 if ( (locale_flag & 16) && isspace(c) ) continue; /* Locale \s */
3829 if ( (locale_flag & 32) && !isspace(c) ) continue; /* Locale \S */
3830#endif
3831
3832 if (md->runtime_caseless)
3833 {
3834 c = pcre_fcc[c];
3835 if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */
3836
Guido van Rossum58132c61997-12-17 00:24:13 +00003837 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3838 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003839 }
3840 FAIL;
3841 }
3842
3843 /* First, ensure the minimum number of matches are present. */
3844
3845 for (i = 1; i <= min; i++)
3846 {
3847 if (eptr >= md->end_subject) FAIL;
3848 c = *eptr++;
3849 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum58132c61997-12-17 00:24:13 +00003850 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3851 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003852
3853 if (md->runtime_caseless)
3854 {
3855 c = pcre_fcc[c];
3856 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum58132c61997-12-17 00:24:13 +00003857 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3858 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003859 }
3860 FAIL;
3861 }
3862
3863 /* If max == min we can continue with the main loop without the
3864 need to recurse. */
3865
3866 if (min == max) continue;
3867
3868 /* If minimizing, keep testing the rest of the expression and advancing
3869 the pointer while it matches the class. */
3870
3871 if (minimize)
3872 {
3873 for (i = min;; i++)
3874 {
3875 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3876 if (i >= max || eptr >= md->end_subject) FAIL;
3877 c = *eptr++;
3878 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum58132c61997-12-17 00:24:13 +00003879 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3880 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003881
3882 if (md->runtime_caseless)
3883 {
3884 c = pcre_fcc[c];
3885 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum58132c61997-12-17 00:24:13 +00003886 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3887 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003888 }
3889 FAIL;
3890 }
3891 /* Control never gets here */
3892 }
3893
3894 /* If maximizing, find the longest possible run, then work backwards. */
3895
3896 else
3897 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003898 const uschar *pp = eptr;
Guido van Rossum50700601997-12-08 17:15:20 +00003899 for (i = min; i < max; eptr++, i++)
3900 {
3901 if (eptr >= md->end_subject) break;
3902 c = *eptr;
3903 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum58132c61997-12-17 00:24:13 +00003904 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3905 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003906 if (md->runtime_caseless)
3907 {
3908 c = pcre_fcc[c];
3909 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum58132c61997-12-17 00:24:13 +00003910 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3911 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003912 }
3913 break;
3914 }
3915
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003916 while (eptr >= pp)
3917 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
3918 FAIL;
3919 }
3920 }
3921 /* Control never gets here */
3922
3923 /* Match a run of characters */
3924
3925 case OP_CHARS:
3926 {
3927 register int length = ecode[1];
3928 ecode += 2;
3929
Guido van Rossum557dea11997-12-22 22:46:52 +00003930#ifdef DEBUG /* Sigh. Some compilers never learn. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003931 if (eptr >= md->end_subject)
3932 printf("matching subject <null> against pattern ");
3933 else
3934 {
3935 printf("matching subject ");
3936 pchars(eptr, length, TRUE, md);
3937 printf(" against pattern ");
3938 }
3939 pchars(ecode, length, FALSE, md);
3940 printf("\n");
Guido van Rossum57ba4f31997-12-02 20:40:28 +00003941#endif
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003942
3943 if (length > md->end_subject - eptr) FAIL;
3944 if (md->caseless)
3945 {
3946 while (length-- > 0) if (pcre_lcc[*ecode++] != pcre_lcc[*eptr++]) FAIL;
3947 }
3948 else
3949 {
3950 while (length-- > 0) if (*ecode++ != *eptr++) FAIL;
3951 }
3952 }
3953 break;
3954
3955 /* Match a single character repeatedly; different opcodes share code. */
3956
3957 case OP_EXACT:
3958 min = max = (ecode[1] << 8) + ecode[2];
3959 ecode += 3;
3960 goto REPEATCHAR;
3961
3962 case OP_UPTO:
3963 case OP_MINUPTO:
3964 min = 0;
3965 max = (ecode[1] << 8) + ecode[2];
3966 minimize = *ecode == OP_MINUPTO;
3967 ecode += 3;
3968 goto REPEATCHAR;
3969
3970 case OP_STAR:
3971 case OP_MINSTAR:
3972 case OP_PLUS:
3973 case OP_MINPLUS:
3974 case OP_QUERY:
3975 case OP_MINQUERY:
3976 c = *ecode++ - OP_STAR;
3977 minimize = (c & 1) != 0;
3978 min = rep_min[c]; /* Pick up values from tables; */
3979 max = rep_max[c]; /* zero for max => infinity */
3980 if (max == 0) max = INT_MAX;
3981
3982 /* Common code for all repeated single-character matches. We can give
3983 up quickly if there are fewer than the minimum number of characters left in
3984 the subject. */
3985
3986 REPEATCHAR:
3987 if (min > md->end_subject - eptr) FAIL;
3988 c = *ecode++;
3989
3990 /* The code is duplicated for the caseless and caseful cases, for speed,
3991 since matching characters is likely to be quite common. First, ensure the
3992 minimum number of matches are present. If min = max, continue at the same
3993 level without recursing. Otherwise, if minimizing, keep trying the rest of
3994 the expression and advancing one matching character if failing, up to the
3995 maximum. Alternatively, if maximizing, find the maximum number of
3996 characters and work backwards. */
3997
Guido van Rossum557dea11997-12-22 22:46:52 +00003998 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
3999 max, eptr));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004000
4001 if (md->caseless)
4002 {
4003 c = pcre_lcc[c];
4004 for (i = 1; i <= min; i++) if (c != pcre_lcc[*eptr++]) FAIL;
4005 if (min == max) continue;
4006 if (minimize)
4007 {
4008 for (i = min;; i++)
4009 {
4010 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4011 if (i >= max || eptr >= md->end_subject || c != pcre_lcc[*eptr++])
4012 FAIL;
4013 }
4014 /* Control never gets here */
4015 }
4016 else
4017 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004018 const uschar *pp = eptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004019 for (i = min; i < max; i++)
4020 {
4021 if (eptr >= md->end_subject || c != pcre_lcc[*eptr]) break;
4022 eptr++;
4023 }
4024 while (eptr >= pp)
4025 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4026 FAIL;
4027 }
Guido van Rossum50700601997-12-08 17:15:20 +00004028 /* Control never gets here */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004029 }
4030
4031 /* Caseful comparisons */
4032
4033 else
4034 {
4035 for (i = 1; i <= min; i++) if (c != *eptr++) FAIL;
4036 if (min == max) continue;
4037 if (minimize)
4038 {
4039 for (i = min;; i++)
4040 {
4041 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4042 if (i >= max || eptr >= md->end_subject || c != *eptr++) FAIL;
4043 }
4044 /* Control never gets here */
4045 }
4046 else
4047 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004048 const uschar *pp = eptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004049 for (i = min; i < max; i++)
4050 {
4051 if (eptr >= md->end_subject || c != *eptr) break;
4052 eptr++;
4053 }
4054 while (eptr >= pp)
4055 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4056 FAIL;
4057 }
4058 }
4059 /* Control never gets here */
4060
Guido van Rossum50700601997-12-08 17:15:20 +00004061 /* Match a negated single character */
4062
4063 case OP_NOT:
Guido van Rossum557dea11997-12-22 22:46:52 +00004064 if (eptr >= md->end_subject) FAIL;
Guido van Rossum50700601997-12-08 17:15:20 +00004065 ecode++;
4066 if (md->caseless)
4067 {
4068 if (pcre_lcc[*ecode++] == pcre_lcc[*eptr++]) FAIL;
4069 }
4070 else
4071 {
4072 if (*ecode++ == *eptr++) FAIL;
4073 }
4074 break;
4075
4076 /* Match a negated single character repeatedly. This is almost a repeat of
4077 the code for a repeated single character, but I haven't found a nice way of
4078 commoning these up that doesn't require a test of the positive/negative
4079 option for each character match. Maybe that wouldn't add very much to the
4080 time taken, but character matching *is* what this is all about... */
4081
4082 case OP_NOTEXACT:
4083 min = max = (ecode[1] << 8) + ecode[2];
4084 ecode += 3;
4085 goto REPEATNOTCHAR;
4086
4087 case OP_NOTUPTO:
4088 case OP_NOTMINUPTO:
4089 min = 0;
4090 max = (ecode[1] << 8) + ecode[2];
4091 minimize = *ecode == OP_NOTMINUPTO;
4092 ecode += 3;
4093 goto REPEATNOTCHAR;
4094
4095 case OP_NOTSTAR:
4096 case OP_NOTMINSTAR:
4097 case OP_NOTPLUS:
4098 case OP_NOTMINPLUS:
4099 case OP_NOTQUERY:
4100 case OP_NOTMINQUERY:
4101 c = *ecode++ - OP_NOTSTAR;
4102 minimize = (c & 1) != 0;
4103 min = rep_min[c]; /* Pick up values from tables; */
4104 max = rep_max[c]; /* zero for max => infinity */
4105 if (max == 0) max = INT_MAX;
4106
4107 /* Common code for all repeated single-character matches. We can give
4108 up quickly if there are fewer than the minimum number of characters left in
4109 the subject. */
4110
4111 REPEATNOTCHAR:
4112 if (min > md->end_subject - eptr) FAIL;
4113 c = *ecode++;
4114
4115 /* The code is duplicated for the caseless and caseful cases, for speed,
4116 since matching characters is likely to be quite common. First, ensure the
4117 minimum number of matches are present. If min = max, continue at the same
4118 level without recursing. Otherwise, if minimizing, keep trying the rest of
4119 the expression and advancing one matching character if failing, up to the
4120 maximum. Alternatively, if maximizing, find the maximum number of
4121 characters and work backwards. */
4122
Guido van Rossum557dea11997-12-22 22:46:52 +00004123 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
4124 max, eptr));
Guido van Rossum50700601997-12-08 17:15:20 +00004125
4126 if (md->caseless)
4127 {
4128 c = pcre_lcc[c];
4129 for (i = 1; i <= min; i++) if (c == pcre_lcc[*eptr++]) FAIL;
4130 if (min == max) continue;
4131 if (minimize)
4132 {
4133 for (i = min;; i++)
4134 {
4135 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4136 if (i >= max || eptr >= md->end_subject || c == pcre_lcc[*eptr++])
4137 FAIL;
4138 }
4139 /* Control never gets here */
4140 }
4141 else
4142 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004143 const uschar *pp = eptr;
Guido van Rossum50700601997-12-08 17:15:20 +00004144 for (i = min; i < max; i++)
4145 {
4146 if (eptr >= md->end_subject || c == pcre_lcc[*eptr]) break;
4147 eptr++;
4148 }
4149 while (eptr >= pp)
4150 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4151 FAIL;
4152 }
4153 /* Control never gets here */
4154 }
4155
4156 /* Caseful comparisons */
4157
4158 else
4159 {
4160 for (i = 1; i <= min; i++) if (c == *eptr++) FAIL;
4161 if (min == max) continue;
4162 if (minimize)
4163 {
4164 for (i = min;; i++)
4165 {
4166 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4167 if (i >= max || eptr >= md->end_subject || c == *eptr++) FAIL;
4168 }
4169 /* Control never gets here */
4170 }
4171 else
4172 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004173 const uschar *pp = eptr;
Guido van Rossum50700601997-12-08 17:15:20 +00004174 for (i = min; i < max; i++)
4175 {
4176 if (eptr >= md->end_subject || c == *eptr) break;
4177 eptr++;
4178 }
4179 while (eptr >= pp)
4180 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4181 FAIL;
4182 }
4183 }
4184 /* Control never gets here */
4185
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004186 /* Match a single character type repeatedly; several different opcodes
4187 share code. This is very similar to the code for single characters, but we
4188 repeat it in the interests of efficiency. */
4189
4190 case OP_TYPEEXACT:
4191 min = max = (ecode[1] << 8) + ecode[2];
4192 minimize = TRUE;
4193 ecode += 3;
4194 goto REPEATTYPE;
4195
4196 case OP_TYPEUPTO:
4197 case OP_TYPEMINUPTO:
4198 min = 0;
4199 max = (ecode[1] << 8) + ecode[2];
4200 minimize = *ecode == OP_TYPEMINUPTO;
4201 ecode += 3;
4202 goto REPEATTYPE;
4203
4204 case OP_TYPESTAR:
4205 case OP_TYPEMINSTAR:
4206 case OP_TYPEPLUS:
4207 case OP_TYPEMINPLUS:
4208 case OP_TYPEQUERY:
4209 case OP_TYPEMINQUERY:
4210 c = *ecode++ - OP_TYPESTAR;
4211 minimize = (c & 1) != 0;
4212 min = rep_min[c]; /* Pick up values from tables; */
4213 max = rep_max[c]; /* zero for max => infinity */
4214 if (max == 0) max = INT_MAX;
4215
4216 /* Common code for all repeated single character type matches */
4217
4218 REPEATTYPE:
4219 ctype = *ecode++; /* Code for the character type */
4220
4221 /* First, ensure the minimum number of matches are present. Use inline
4222 code for maximizing the speed, and do the type test once at the start
4223 (i.e. keep it out of the loop). Also test that there are at least the
4224 minimum number of characters before we start. */
4225
4226 if (min > md->end_subject - eptr) FAIL;
4227 if (min > 0) switch(ctype)
4228 {
4229 case OP_ANY:
4230 if (!md->dotall)
4231 { for (i = 1; i <= min; i++) if (*eptr++ == '\n') FAIL; }
4232 else eptr += min;
4233 break;
4234
4235 case OP_NOT_DIGIT:
4236 for (i = 1; i <= min; i++)
4237 if ((pcre_ctypes[*eptr++] & ctype_digit) != 0) FAIL;
4238 break;
4239
4240 case OP_DIGIT:
4241 for (i = 1; i <= min; i++)
4242 if ((pcre_ctypes[*eptr++] & ctype_digit) == 0) FAIL;
4243 break;
4244
4245 case OP_NOT_WHITESPACE:
4246 for (i = 1; i <= min; i++)
4247 if ((pcre_ctypes[*eptr++] & ctype_space) != 0) FAIL;
4248 break;
4249
4250 case OP_WHITESPACE:
4251 for (i = 1; i <= min; i++)
4252 if ((pcre_ctypes[*eptr++] & ctype_space) == 0) FAIL;
4253 break;
4254
4255 case OP_NOT_WORDCHAR:
4256 for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) != 0)
4257 FAIL;
4258 break;
4259
4260 case OP_WORDCHAR:
4261 for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) == 0)
4262 FAIL;
4263 break;
Guido van Rossum50700601997-12-08 17:15:20 +00004264
4265 case OP_NOT_WORDCHAR_L:
Guido van Rossum58132c61997-12-17 00:24:13 +00004266 for (i = 1; i <= min; i++, eptr++) if (*eptr=='_' || isalnum(*eptr))
Guido van Rossum557dea11997-12-22 22:46:52 +00004267 FAIL;
Guido van Rossum50700601997-12-08 17:15:20 +00004268 break;
4269
4270 case OP_WORDCHAR_L:
Guido van Rossum58132c61997-12-17 00:24:13 +00004271 for (i = 1; i <= min; i++, eptr++) if (*eptr!='_' && !isalnum(*eptr))
Guido van Rossum557dea11997-12-22 22:46:52 +00004272 FAIL;
Guido van Rossum50700601997-12-08 17:15:20 +00004273 break;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004274 }
4275
4276 /* If min = max, continue at the same level without recursing */
4277
4278 if (min == max) continue;
4279
4280 /* If minimizing, we have to test the rest of the pattern before each
4281 subsequent match, so inlining isn't much help; just use the function. */
4282
4283 if (minimize)
4284 {
4285 for (i = min;; i++)
4286 {
4287 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4288 if (i >= max || eptr >= md->end_subject ||
4289 !match_type(ctype, *eptr++, md->dotall))
4290 FAIL;
4291 }
4292 /* Control never gets here */
4293 }
4294
4295 /* If maximizing it is worth using inline code for speed, doing the type
4296 test once at the start (i.e. keep it out of the loop). */
4297
4298 else
4299 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004300 const uschar *pp = eptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004301 switch(ctype)
4302 {
4303 case OP_ANY:
4304 if (!md->dotall)
4305 {
4306 for (i = min; i < max; i++)
4307 {
4308 if (eptr >= md->end_subject || *eptr == '\n') break;
4309 eptr++;
4310 }
4311 }
4312 else
4313 {
4314 c = max - min;
4315 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
4316 eptr += c;
4317 }
4318 break;
4319
4320 case OP_NOT_DIGIT:
4321 for (i = min; i < max; i++)
4322 {
4323 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) != 0)
4324 break;
4325 eptr++;
4326 }
4327 break;
4328
4329 case OP_DIGIT:
4330 for (i = min; i < max; i++)
4331 {
4332 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) == 0)
4333 break;
4334 eptr++;
4335 }
4336 break;
4337
4338 case OP_NOT_WHITESPACE:
4339 for (i = min; i < max; i++)
4340 {
4341 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) != 0)
4342 break;
4343 eptr++;
4344 }
4345 break;
4346
4347 case OP_WHITESPACE:
4348 for (i = min; i < max; i++)
4349 {
4350 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) == 0)
4351 break;
4352 eptr++;
4353 }
4354 break;
4355
4356 case OP_NOT_WORDCHAR:
4357 for (i = min; i < max; i++)
4358 {
4359 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) != 0)
4360 break;
4361 eptr++;
4362 }
4363 break;
4364
4365 case OP_WORDCHAR:
4366 for (i = min; i < max; i++)
4367 {
Guido van Rossum50700601997-12-08 17:15:20 +00004368 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) == 0)
4369 break;
4370 eptr++;
4371 }
4372 break;
4373 case OP_NOT_WORDCHAR_L:
4374 for (i = min; i < max; i++)
4375 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004376 if (eptr >= md->end_subject || (*eptr=='_' || isalnum(*eptr) ) )
Guido van Rossum50700601997-12-08 17:15:20 +00004377 break;
4378 eptr++;
4379 }
4380 break;
4381
4382 case OP_WORDCHAR_L:
4383 for (i = min; i < max; i++)
4384 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004385 if (eptr >= md->end_subject || (*eptr!='_' && !isalnum(*eptr) ) )
Guido van Rossum50700601997-12-08 17:15:20 +00004386 break;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004387 eptr++;
4388 }
4389 break;
4390 }
4391
4392 while (eptr >= pp)
4393 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4394 FAIL;
4395 }
4396 /* Control never gets here */
4397
4398 /* There's been some horrible disaster. */
4399
4400 default:
Guido van Rossum557dea11997-12-22 22:46:52 +00004401 DPRINTF(("Unknown opcode %d\n", *ecode));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004402 md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
4403 FAIL;
4404 }
4405
4406 /* Do not stick any code in here without much thought; it is assumed
4407 that "continue" in the code above comes out to here to repeat the main
4408 loop. */
4409
4410 } /* End of main loop */
4411/* Control never reaches here */
4412
4413fail:
4414 if (md->point > save_stack_position)
4415 {
4416 /* If there are still points remaining on the stack, pop the next one off */
Guido van Rossumc3861071997-10-08 02:07:40 +00004417 int off_num;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004418
4419 md->point--;
4420 offset_top = md->offset_top[md->point];
4421 eptr = md->eptr[md->point];
4422 ecode = md->ecode[md->point];
4423 off_num = md->off_num[md->point];
4424 md->offset_vector[off_num] = md->r1[md->point];
4425 md->offset_vector[off_num+1] = md->r2[md->point];
4426 goto match_loop;
4427 }
4428 /* Failure, and nothing left on the stack, so end this function call */
4429
4430 /* Restore the top of the stack to where it was before this function
4431 call. This lets us use one stack for everything; recursive calls
4432 can push and pop information, and may increase the stack. When
4433 the call returns, the parent function can resume pushing and
4434 popping wherever it was. */
4435
4436 md->point = save_stack_position;
4437 return FALSE;
4438
4439succeed:
4440 return TRUE;
4441}
4442
4443
Guido van Rossum50700601997-12-08 17:15:20 +00004444
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004445/*************************************************
Guido van Rossum557dea11997-12-22 22:46:52 +00004446* Segregate setjmp() *
4447*************************************************/
4448
4449/* The -Wall option of gcc gives warnings for all local variables when setjmp()
4450is used, even if the coding conforms to the rules of ANSI C. To avoid this, we
4451hide it in a separate function. This is called only when PCRE_EXTRA is set,
4452since it's needed only for the extension \X option, and with any luck, a good
4453compiler will spot the tail recursion and compile it efficiently.
4454
4455Arguments:
4456 eptr pointer in subject
4457 ecode position in code
4458 offset_top current top pointer
4459 md pointer to "static" info for the match
4460
4461Returns: TRUE if matched
4462*/
4463
4464static BOOL
4465match_with_setjmp(const uschar *eptr, const uschar *ecode, int offset_top,
4466 match_data *match_block)
4467{
4468return setjmp(match_block->fail_env) == 0 &&
4469 match(eptr, ecode, offset_top, match_block);
4470}
4471
4472
4473
4474/*************************************************
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004475* Execute a Regular Expression *
4476*************************************************/
4477
4478/* This function applies a compiled re to a subject string and picks out
4479portions of the string if it matches. Two elements in the vector are set for
4480each substring: the offsets to the start and end of the substring.
4481
4482Arguments:
Guido van Rossum50700601997-12-08 17:15:20 +00004483 external_re points to the compiled expression
4484 external_extra points to "hints" from pcre_study() or is NULL
4485 subject points to the subject string
4486 length length of subject string (may contain binary zeros)
4487 options option bits
4488 offsets points to a vector of ints to be filled in with offsets
4489 offsetcount the number of elements in the vector
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004490
Guido van Rossum50700601997-12-08 17:15:20 +00004491Returns: > 0 => success; value is the number of elements filled in
4492 = 0 => success, but offsets is not big enough
4493 -1 => failed to match
4494 < -1 => some kind of unexpected problem
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004495*/
4496
4497int
Guido van Rossum50700601997-12-08 17:15:20 +00004498pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
Guido van Rossum816671c1998-03-10 04:55:29 +00004499 const char *subject, int length, int start_pos, int options,
4500 int *offsets, int offsetcount)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004501{
Guido van Rossum58132c61997-12-17 00:24:13 +00004502 /* The "volatile" directives are to make gcc -Wall stop complaining
4503 that these variables can be clobbered by the longjmp. Hopefully
4504 they won't cost too much performance. */
Guido van Rossum042ff9e1998-04-03 21:13:31 +00004505volatile int resetcount, ocount;
4506volatile int first_char = -1;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004507match_data match_block;
Guido van Rossum557dea11997-12-22 22:46:52 +00004508const uschar *start_bits = NULL;
Guido van Rossum816671c1998-03-10 04:55:29 +00004509const uschar *start_match = (const uschar *)subject + start_pos;
Guido van Rossum58132c61997-12-17 00:24:13 +00004510const uschar *end_subject;
4511const real_pcre *re = (const real_pcre *)external_re;
4512const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00004513volatile BOOL using_temporary_offsets = FALSE;
4514volatile BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4515volatile BOOL startline = (re->options & PCRE_STARTLINE) != 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004516
4517if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4518
4519if (re == NULL || subject == NULL ||
4520 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4521if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
4522
Guido van Rossum58132c61997-12-17 00:24:13 +00004523match_block.start_subject = (const uschar *)subject;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004524match_block.end_subject = match_block.start_subject + length;
4525end_subject = match_block.end_subject;
4526
Guido van Rossum50700601997-12-08 17:15:20 +00004527match_block.caseless = ((re->options | options) & PCRE_CASELESS) != 0;
4528match_block.runtime_caseless = match_block.caseless &&
4529 (re->options & PCRE_CASELESS) == 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004530
Guido van Rossum50700601997-12-08 17:15:20 +00004531match_block.multiline = ((re->options | options) & PCRE_MULTILINE) != 0;
4532match_block.dotall = ((re->options | options) & PCRE_DOTALL) != 0;
4533match_block.endonly = ((re->options | options) & PCRE_DOLLAR_ENDONLY) != 0;
4534
4535match_block.notbol = (options & PCRE_NOTBOL) != 0;
4536match_block.noteol = (options & PCRE_NOTEOL) != 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004537
4538match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
4539
4540/* Set the stack state to empty */
4541 match_block.off_num = match_block.offset_top = NULL;
4542 match_block.r1 = match_block.r2 = NULL;
4543 match_block.eptr = match_block.ecode = NULL;
4544 match_block.point = match_block.length = 0;
4545
Guido van Rossum50700601997-12-08 17:15:20 +00004546/* If the expression has got more back references than the offsets supplied can
4547hold, we get a temporary bit of working store to use during the matching.
Guido van Rossum557dea11997-12-22 22:46:52 +00004548Otherwise, we can use the vector supplied, rounding down its size to a multiple
4549of 2. */
Guido van Rossum50700601997-12-08 17:15:20 +00004550
Guido van Rossum557dea11997-12-22 22:46:52 +00004551ocount = offsetcount & (-2);
4552if (re->top_backref > 0 && re->top_backref >= ocount/2)
Guido van Rossum50700601997-12-08 17:15:20 +00004553 {
4554 ocount = re->top_backref * 2 + 2;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00004555 match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
Guido van Rossum50700601997-12-08 17:15:20 +00004556 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
Guido van Rossum557dea11997-12-22 22:46:52 +00004557 using_temporary_offsets = TRUE;
4558 DPRINTF(("Got memory to hold back references\n"));
Guido van Rossum50700601997-12-08 17:15:20 +00004559 }
4560else match_block.offset_vector = offsets;
4561
4562match_block.offset_end = ocount;
4563match_block.offset_overflow = FALSE;
4564
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004565/* Compute the minimum number of offsets that we need to reset each time. Doing
4566this makes a huge difference to execution time when there aren't many brackets
4567in the pattern. */
4568
4569resetcount = 2 + re->top_bracket * 2;
Guido van Rossum50700601997-12-08 17:15:20 +00004570if (resetcount > offsetcount) resetcount = ocount;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004571
4572/* If MULTILINE is set at exec time but was not set at compile time, and the
4573anchored flag is set, we must re-check because a setting provoked by ^ in the
4574pattern is not right in multi-line mode. Calling is_anchored() again here does
4575the right check, because multiline is now set. If it now yields FALSE, the
4576expression must have had ^ starting some of its branches. Check to see if
4577that is true for *all* branches, and if so, set the startline flag. */
4578
Guido van Rossum557dea11997-12-22 22:46:52 +00004579if (match_block.multiline && anchored && (re->options & PCRE_MULTILINE) == 0 &&
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004580 !is_anchored(re->code, match_block.multiline))
4581 {
4582 anchored = FALSE;
4583 if (is_startline(re->code)) startline = TRUE;
4584 }
4585
4586/* Set up the first character to match, if available. The first_char value is
4587never set for an anchored regular expression, but the anchoring may be forced
4588at run time, so we have to test for anchoring. The first char may be unset for
4589an unanchored pattern, of course. If there's no first char and the pattern was
4590studied, the may be a bitmap of possible first characters. However, we can
4591use this only if the caseless state of the studying was correct. */
4592
4593if (!anchored)
4594 {
4595 if ((re->options & PCRE_FIRSTSET) != 0)
4596 {
4597 first_char = re->first_char;
4598 if (match_block.caseless) first_char = pcre_lcc[first_char];
4599 }
4600 else
4601 if (!startline && extra != NULL &&
4602 (extra->options & PCRE_STUDY_MAPPED) != 0 &&
4603 ((extra->options & PCRE_STUDY_CASELESS) != 0) == match_block.caseless)
4604 start_bits = extra->start_bits;
4605 }
4606
4607/* Loop for unanchored matches; for anchored regexps the loop runs just once. */
4608
4609do
4610 {
Guido van Rossum557dea11997-12-22 22:46:52 +00004611 int rc;
Guido van Rossum50700601997-12-08 17:15:20 +00004612 register int *iptr = match_block.offset_vector;
4613 register int *iend = iptr + resetcount;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004614
4615 /* Reset the maximum number of extractions we might see. */
4616
4617 while (iptr < iend) *iptr++ = -1;
4618
4619 /* Advance to a unique first char if possible */
4620
4621 if (first_char >= 0)
4622 {
4623 if (match_block.caseless)
4624 while (start_match < end_subject && pcre_lcc[*start_match] != first_char)
4625 start_match++;
4626 else
4627 while (start_match < end_subject && *start_match != first_char)
4628 start_match++;
4629 }
4630
4631 /* Or to just after \n for a multiline match if possible */
4632
4633 else if (startline)
4634 {
4635 if (start_match > match_block.start_subject)
4636 {
4637 while (start_match < end_subject && start_match[-1] != '\n')
4638 start_match++;
4639 }
4640 }
4641
4642 /* Or to a non-unique first char */
4643
4644 else if (start_bits != NULL)
4645 {
4646 while (start_match < end_subject)
4647 {
4648 register int c = *start_match;
Guido van Rossum50700601997-12-08 17:15:20 +00004649 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004650 }
4651 }
4652
Guido van Rossum557dea11997-12-22 22:46:52 +00004653#ifdef DEBUG /* Sigh. Some compilers never learn. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004654 printf(">>>> Match against: ");
4655 pchars(start_match, end_subject - start_match, TRUE, &match_block);
4656 printf("\n");
Guido van Rossum57ba4f31997-12-02 20:40:28 +00004657#endif
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004658
4659 /* When a match occurs, substrings will be set for all internal extractions;
4660 we just need to set up the whole thing as substring 0 before returning. If
Guido van Rossum50700601997-12-08 17:15:20 +00004661 there were too many extractions, set the return code to zero. In the case
4662 where we had to get some local store to hold offsets for backreferences, copy
4663 those back references that we can. In this case there need not be overflow
4664 if certain parts of the pattern were not used.
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004665
Guido van Rossum50700601997-12-08 17:15:20 +00004666 Before starting the match, we have to set up a longjmp() target to enable
Guido van Rossum557dea11997-12-22 22:46:52 +00004667 the "cut" operation to fail a match completely without backtracking. This
4668 is done in a separate function to avoid compiler warnings. We need not do
4669 it unless PCRE_EXTRA is set, since only in that case is the "cut" operation
4670 enabled. */
Guido van Rossum50700601997-12-08 17:15:20 +00004671
4672 /* To handle errors such as running out of memory for the failure
4673 stack, we need to save this location via setjmp(), so
4674 error-handling code can call longjmp() to jump out of deeply-nested code. */
4675 if (setjmp(match_block.error_env)==0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004676 {
Guido van Rossum50700601997-12-08 17:15:20 +00004677
Guido van Rossum557dea11997-12-22 22:46:52 +00004678 if ((re->options & PCRE_EXTRA) != 0)
Guido van Rossum50700601997-12-08 17:15:20 +00004679 {
Guido van Rossum557dea11997-12-22 22:46:52 +00004680 if (!match_with_setjmp(start_match, re->code, 2, &match_block))
4681 continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004682 }
Guido van Rossum557dea11997-12-22 22:46:52 +00004683 else if (!match(start_match, re->code, 2, &match_block)) continue;
4684
4685 /* Copy the offset information from temporary store if necessary */
4686
4687 if (using_temporary_offsets)
4688 {
4689 if (offsetcount >= 4)
4690 {
4691 memcpy(offsets + 2, match_block.offset_vector + 2,
4692 (offsetcount - 2) * sizeof(int));
4693 DPRINTF(("Copied offsets from temporary memory\n"));
4694 }
4695 if (match_block.end_offset_top > offsetcount)
4696 match_block.offset_overflow = TRUE;
4697
4698 DPRINTF(("Freeing temporary memory\n"));
4699 (pcre_free)(match_block.offset_vector);
4700 }
4701
4702 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
4703
4704 if (match_block.offset_end < 2) rc = 0; else
4705 {
4706 offsets[0] = start_match - match_block.start_subject;
4707 offsets[1] = match_block.end_match_ptr - match_block.start_subject;
4708 }
4709
4710 DPRINTF((">>>> returning %d\n", rc));
4711 free_stack(&match_block);
4712 return rc;
Guido van Rossum50700601997-12-08 17:15:20 +00004713 } /* End of (if setjmp(match_block.error_env)...) */
Guido van Rossum042ff9e1998-04-03 21:13:31 +00004714 free_stack(&match_block);
4715
Guido van Rossum50700601997-12-08 17:15:20 +00004716 /* Return an error code; pcremodule.c will preserve the exception */
4717 if (PyErr_Occurred()) return PCRE_ERROR_NOMEMORY;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004718 }
4719while (!anchored &&
4720 match_block.errorcode == PCRE_ERROR_NOMATCH &&
4721 start_match++ < end_subject);
4722
Guido van Rossum557dea11997-12-22 22:46:52 +00004723if (using_temporary_offsets)
4724 {
4725 DPRINTF(("Freeing temporary memory\n"));
4726 (pcre_free)(match_block.offset_vector);
4727 }
4728
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004729#ifdef DEBUG
4730printf(">>>> returning %d\n", match_block.errorcode);
4731#endif
Guido van Rossum50700601997-12-08 17:15:20 +00004732
Guido van Rossum557dea11997-12-22 22:46:52 +00004733 return match_block.errorcode;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004734}
4735
4736/* End of pcre.c */