blob: 1d2a9cade7a5e4db305bd3fa2e9cdd1faeb7d9f6 [file] [log] [blame]
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001
2/*************************************************
3* Perl-Compatible Regular Expressions *
4*************************************************/
5
6/* DO NOT EDIT THIS FILE! */
7
8/* This file is automatically written by the merge-files.py script
9included with the PCRE distribution for Python; it's produced from
10several C files, and code is removed in the process. If you want to
11modify the code or track down bugs, it will be much easier to work
12with the code in its original, multiple-file form. Don't edit this
13file by hand, or submit patches to it.
14
15The Python-specific PCRE distribution can be retrieved from
16 http://starship.skyport.net/crew/amk/regex/
17
Guido van Rossum58132c61997-12-17 00:24:13 +000018The unmodified original PCRE distribution is available at
19ftp://ftp.cus.cam.ac.uk/pub/software/programs/pcre/, and is originally
20written by: Philip Hazel <ph10@cam.ac.uk>
Guido van Rossum51b3aa31997-10-06 14:43:11 +000021
22Extensively modified by the Python String-SIG: <string-sig@python.org>
23Send bug reports to: <string-sig@python.org>
24(They'll figure out if it's a bug in PCRE or in the Python-specific
25changes.)
26
27 Copyright (c) 1997 University of Cambridge
28
29-----------------------------------------------------------------------------
30Permission is granted to anyone to use this software for any purpose on any
31computer system, and to redistribute it freely, subject to the following
32restrictions:
33
341. This software is distributed in the hope that it will be useful,
35 but WITHOUT ANY WARRANTY; without even the implied warranty of
36 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
37
382. The origin of this software must not be misrepresented, either by
39 explicit claim or by omission.
40
413. Altered versions must be plainly marked as such, and must not be
42 misrepresented as being the original software.
43-----------------------------------------------------------------------------
44*/
45
46
47#define FOR_PYTHON
Guido van Rossum58132c61997-12-17 00:24:13 +000048#include "pcre-int.h"
Guido van Rossum51b3aa31997-10-06 14:43:11 +000049#include "Python.h"
Guido van Rossum50700601997-12-08 17:15:20 +000050#include "mymalloc.h"
51#include <ctype.h>
Guido van Rossum51b3aa31997-10-06 14:43:11 +000052#include "graminit.h"
53
54/*************************************************
55* Perl-Compatible Regular Expressions *
56*************************************************/
57
58/* This file is automatically written by the makechartables auxiliary
59program. If you edit it by hand, you might like to edit the Makefile to
60prevent its ever being regenerated. */
61
62/* This table is a lower casing table. */
63
64unsigned char pcre_lcc[] = {
65 0, 1, 2, 3, 4, 5, 6, 7,
66 8, 9, 10, 11, 12, 13, 14, 15,
67 16, 17, 18, 19, 20, 21, 22, 23,
68 24, 25, 26, 27, 28, 29, 30, 31,
69 32, 33, 34, 35, 36, 37, 38, 39,
70 40, 41, 42, 43, 44, 45, 46, 47,
71 48, 49, 50, 51, 52, 53, 54, 55,
72 56, 57, 58, 59, 60, 61, 62, 63,
73 64, 97, 98, 99,100,101,102,103,
74 104,105,106,107,108,109,110,111,
75 112,113,114,115,116,117,118,119,
76 120,121,122, 91, 92, 93, 94, 95,
77 96, 97, 98, 99,100,101,102,103,
78 104,105,106,107,108,109,110,111,
79 112,113,114,115,116,117,118,119,
80 120,121,122,123,124,125,126,127,
81 128,129,130,131,132,133,134,135,
82 136,137,138,139,140,141,142,143,
83 144,145,146,147,148,149,150,151,
84 152,153,154,155,156,157,158,159,
85 160,161,162,163,164,165,166,167,
86 168,169,170,171,172,173,174,175,
87 176,177,178,179,180,181,182,183,
88 184,185,186,187,188,189,190,191,
89 192,193,194,195,196,197,198,199,
90 200,201,202,203,204,205,206,207,
91 208,209,210,211,212,213,214,215,
92 216,217,218,219,220,221,222,223,
93 224,225,226,227,228,229,230,231,
94 232,233,234,235,236,237,238,239,
95 240,241,242,243,244,245,246,247,
96 248,249,250,251,252,253,254,255 };
97
Guido van Rossum50700601997-12-08 17:15:20 +000098/* This table is a case flipping table. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +000099
Guido van Rossum50700601997-12-08 17:15:20 +0000100unsigned char pcre_fcc[] = {
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000101 0, 1, 2, 3, 4, 5, 6, 7,
102 8, 9, 10, 11, 12, 13, 14, 15,
103 16, 17, 18, 19, 20, 21, 22, 23,
104 24, 25, 26, 27, 28, 29, 30, 31,
105 32, 33, 34, 35, 36, 37, 38, 39,
106 40, 41, 42, 43, 44, 45, 46, 47,
107 48, 49, 50, 51, 52, 53, 54, 55,
108 56, 57, 58, 59, 60, 61, 62, 63,
Guido van Rossum50700601997-12-08 17:15:20 +0000109 64, 97, 98, 99,100,101,102,103,
110 104,105,106,107,108,109,110,111,
111 112,113,114,115,116,117,118,119,
112 120,121,122, 91, 92, 93, 94, 95,
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000113 96, 65, 66, 67, 68, 69, 70, 71,
114 72, 73, 74, 75, 76, 77, 78, 79,
115 80, 81, 82, 83, 84, 85, 86, 87,
116 88, 89, 90,123,124,125,126,127,
117 128,129,130,131,132,133,134,135,
118 136,137,138,139,140,141,142,143,
119 144,145,146,147,148,149,150,151,
120 152,153,154,155,156,157,158,159,
121 160,161,162,163,164,165,166,167,
122 168,169,170,171,172,173,174,175,
123 176,177,178,179,180,181,182,183,
124 184,185,186,187,188,189,190,191,
125 192,193,194,195,196,197,198,199,
126 200,201,202,203,204,205,206,207,
127 208,209,210,211,212,213,214,215,
128 216,217,218,219,220,221,222,223,
129 224,225,226,227,228,229,230,231,
130 232,233,234,235,236,237,238,239,
131 240,241,242,243,244,245,246,247,
132 248,249,250,251,252,253,254,255 };
133
Guido van Rossum50700601997-12-08 17:15:20 +0000134/* This table contains bit maps for digits, letters, 'word' chars, and
135white space. Each map is 32 bytes long and the bits run from the least
136significant end of each byte. */
137
138unsigned char pcre_cbits[] = {
139 0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
140 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
141 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
142 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
143
144 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
145 0xfe,0xff,0xff,0x07,0xfe,0xff,0xff,0x07,
146 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
147 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
148
149 0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
150 0xfe,0xff,0xff,0x87,0xfe,0xff,0xff,0x07,
151 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
152 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
153
154 0x00,0x3e,0x00,0x00,0x01,0x00,0x00,0x00,
155 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
156 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
157 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 };
158
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000159/* This table identifies various classes of character by individual bits:
Guido van Rossum50700601997-12-08 17:15:20 +0000160 0x01 white space character
161 0x02 letter
162 0x04 decimal digit
163 0x08 hexadecimal digit
164 0x10 alphanumeric or '_'
165 0x80 regular expression metacharacter or binary zero
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000166*/
167
168unsigned char pcre_ctypes[] = {
169 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
170 0x00,0x01,0x01,0x01,0x01,0x01,0x00,0x00, /* 8- 15 */
171 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
172 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
173 0x01,0x00,0x00,0x00,0x80,0x00,0x00,0x00, /* - ' */
174 0x80,0x80,0x80,0x80,0x00,0x00,0x80,0x00, /* ( - / */
Guido van Rossum50700601997-12-08 17:15:20 +0000175 0x3c,0x3c,0x3c,0x3c,0x3c,0x3c,0x3c,0x3c, /* 0 - 7 */
176 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x80, /* 8 - ? */
177 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* @ - G */
178 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* H - O */
179 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* P - W */
180 0x12,0x12,0x12,0x80,0x00,0x00,0x80,0x10, /* X - _ */
181 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* ` - g */
182 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* h - o */
183 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* p - w */
184 0x12,0x12,0x12,0x80,0x80,0x00,0x00,0x00, /* x -127 */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000185 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
186 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
187 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
188 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
189 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
190 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
191 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
192 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
193 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
194 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
195 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
196 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
197 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
198 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
199 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
200 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
201
Guido van Rossum50700601997-12-08 17:15:20 +0000202/* End of chartables.c */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000203/*************************************************
204* Perl-Compatible Regular Expressions *
205*************************************************/
206
207/*
208This is a library of functions to support regular expressions whose syntax
209and semantics are as close as possible to those of the Perl 5 language. See
210the file Tech.Notes for some information on the internals.
211
212Written by: Philip Hazel <ph10@cam.ac.uk>
213
214 Copyright (c) 1997 University of Cambridge
215
216-----------------------------------------------------------------------------
217Permission is granted to anyone to use this software for any purpose on any
218computer system, and to redistribute it freely, subject to the following
219restrictions:
220
2211. This software is distributed in the hope that it will be useful,
222 but WITHOUT ANY WARRANTY; without even the implied warranty of
223 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
224
2252. The origin of this software must not be misrepresented, either by
226 explicit claim or by omission.
227
2283. Altered versions must be plainly marked as such, and must not be
229 misrepresented as being the original software.
230-----------------------------------------------------------------------------
231*/
232
233
234/* Include the internals header, which itself includes Standard C headers plus
235the external pcre header. */
236
237
238
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000239
240/*************************************************
241* Create bitmap of starting chars *
242*************************************************/
243
244/* This function scans a compiled unanchored expression and attempts to build a
245bitmap of the set of initial characters. If it can't, it returns FALSE. As time
246goes by, we may be able to get more clever at doing this.
247
248Arguments:
249 code points to an expression
250 start_bits points to a 32-byte table, initialized to 0
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000251
252Returns: TRUE if table built, FALSE otherwise
253*/
254
255static BOOL
Guido van Rossum58132c61997-12-17 00:24:13 +0000256set_start_bits(const uschar *code, uschar *start_bits)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000257{
Guido van Rossum50700601997-12-08 17:15:20 +0000258register int c;
259
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000260do
261 {
Guido van Rossum58132c61997-12-17 00:24:13 +0000262 const uschar *tcode = code + 3;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000263 BOOL try_next = TRUE;
264
265 while (try_next)
266 {
267 try_next = FALSE;
268
269 if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
270 {
Guido van Rossum50700601997-12-08 17:15:20 +0000271 if (!set_start_bits(tcode, start_bits)) return FALSE;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000272 }
273
274 else switch(*tcode)
275 {
276 default:
277 return FALSE;
278
279 /* BRAZERO does the bracket, but carries on. */
280
281 case OP_BRAZERO:
282 case OP_BRAMINZERO:
Guido van Rossum50700601997-12-08 17:15:20 +0000283 if (!set_start_bits(++tcode, start_bits)) return FALSE;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000284 do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
285 tcode += 3;
286 try_next = TRUE;
287 break;
288
289 /* Single-char * or ? sets the bit and tries the next item */
290
291 case OP_STAR:
292 case OP_MINSTAR:
293 case OP_QUERY:
294 case OP_MINQUERY:
Guido van Rossum50700601997-12-08 17:15:20 +0000295 start_bits[tcode[1]/8] |= (1 << (tcode[1]&7));
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000296 tcode += 2;
297 try_next = TRUE;
298 break;
299
300 /* Single-char upto sets the bit and tries the next */
301
302 case OP_UPTO:
303 case OP_MINUPTO:
Guido van Rossum50700601997-12-08 17:15:20 +0000304 start_bits[tcode[3]/8] |= (1 << (tcode[3]&7));
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000305 tcode += 4;
306 try_next = TRUE;
307 break;
308
309 /* At least one single char sets the bit and stops */
310
311 case OP_EXACT: /* Fall through */
312 tcode++;
313
314 case OP_CHARS: /* Fall through */
315 tcode++;
316
317 case OP_PLUS:
318 case OP_MINPLUS:
Guido van Rossum50700601997-12-08 17:15:20 +0000319 start_bits[tcode[1]/8] |= (1 << (tcode[1]&7));
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000320 break;
321
322 /* Single character type sets the bits and stops */
323
324 case OP_NOT_DIGIT:
Guido van Rossum50700601997-12-08 17:15:20 +0000325 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_digit];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000326 break;
327
328 case OP_DIGIT:
Guido van Rossum50700601997-12-08 17:15:20 +0000329 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_digit];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000330 break;
331
332 case OP_NOT_WHITESPACE:
Guido van Rossum50700601997-12-08 17:15:20 +0000333 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_space];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000334 break;
335
336 case OP_WHITESPACE:
Guido van Rossum50700601997-12-08 17:15:20 +0000337 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_space];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000338 break;
339
340 case OP_NOT_WORDCHAR:
Guido van Rossum50700601997-12-08 17:15:20 +0000341 for (c = 0; c < 32; c++)
342 start_bits[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000343 break;
344
345 case OP_WORDCHAR:
Guido van Rossum50700601997-12-08 17:15:20 +0000346 for (c = 0; c < 32; c++)
347 start_bits[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000348 break;
349
350 /* One or more character type fudges the pointer and restarts, knowing
351 it will hit a single character type and stop there. */
352
353 case OP_TYPEPLUS:
354 case OP_TYPEMINPLUS:
355 tcode++;
356 try_next = TRUE;
357 break;
358
359 case OP_TYPEEXACT:
360 tcode += 3;
361 try_next = TRUE;
362 break;
363
364 /* Zero or more repeats of character types set the bits and then
365 try again. */
366
367 case OP_TYPEUPTO:
368 case OP_TYPEMINUPTO:
369 tcode += 2; /* Fall through */
370
371 case OP_TYPESTAR:
372 case OP_TYPEMINSTAR:
373 case OP_TYPEQUERY:
374 case OP_TYPEMINQUERY:
375 switch(tcode[1])
376 {
377 case OP_NOT_DIGIT:
Guido van Rossum50700601997-12-08 17:15:20 +0000378 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_digit];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000379 break;
380
381 case OP_DIGIT:
Guido van Rossum50700601997-12-08 17:15:20 +0000382 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_digit];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000383 break;
384
385 case OP_NOT_WHITESPACE:
Guido van Rossum50700601997-12-08 17:15:20 +0000386 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_space];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000387 break;
388
389 case OP_WHITESPACE:
Guido van Rossum50700601997-12-08 17:15:20 +0000390 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_space];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000391 break;
392
393 case OP_NOT_WORDCHAR:
Guido van Rossum50700601997-12-08 17:15:20 +0000394 for (c = 0; c < 32; c++)
395 start_bits[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000396 break;
397
398 case OP_WORDCHAR:
Guido van Rossum50700601997-12-08 17:15:20 +0000399 for (c = 0; c < 32; c++)
400 start_bits[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000401 break;
402 }
403
404 tcode += 2;
405 try_next = TRUE;
406 break;
407
408 /* Character class: set the bits and either carry on or not,
409 according to the repeat count. */
410
411 case OP_CLASS:
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000412 {
Guido van Rossum50700601997-12-08 17:15:20 +0000413 tcode++;
414 for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
415 tcode += 32;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000416 switch (*tcode)
417 {
418 case OP_CRSTAR:
419 case OP_CRMINSTAR:
420 case OP_CRQUERY:
421 case OP_CRMINQUERY:
422 tcode++;
423 try_next = TRUE;
424 break;
425
426 case OP_CRRANGE:
427 case OP_CRMINRANGE:
428 if (((tcode[1] << 8) + tcode[2]) == 0)
429 {
430 tcode += 5;
431 try_next = TRUE;
432 }
433 break;
434 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000435 }
436 break; /* End of class handling */
437
438 } /* End of switch */
439 } /* End of try_next loop */
440
441 code += (code[1] << 8) + code[2]; /* Advance to next branch */
442 }
443while (*code == OP_ALT);
444return TRUE;
445}
446
447
448
449/*************************************************
450* Study a compiled expression *
451*************************************************/
452
453/* This function is handed a compiled expression that it must study to produce
454information that will speed up the matching. It returns a pcre_extra block
455which then gets handed back to pcre_exec().
456
457Arguments:
458 re points to the compiled expression
459 options contains option bits
460 errorptr points to where to place error messages;
461 set NULL unless error
462
463Returns: pointer to a pcre_extra block,
464 NULL on error or if no optimization possible
465*/
466
467pcre_extra *
Guido van Rossum58132c61997-12-17 00:24:13 +0000468pcre_study(const pcre *external_re, int options, const char **errorptr)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000469{
470BOOL caseless;
471uschar start_bits[32];
472real_pcre_extra *extra;
Guido van Rossum58132c61997-12-17 00:24:13 +0000473const real_pcre *re = (const real_pcre *)external_re;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000474
475*errorptr = NULL;
476
477if (re == NULL || re->magic_number != MAGIC_NUMBER)
478 {
479 *errorptr = "argument is not a compiled regular expression";
480 return NULL;
481 }
482
483if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
484 {
485 *errorptr = "unknown or incorrect option bit(s) set";
486 return NULL;
487 }
488
Guido van Rossum50700601997-12-08 17:15:20 +0000489/* Caseless can either be from the compiled regex or from options. */
490
491caseless = ((re->options | options) & PCRE_CASELESS) != 0;
492
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000493/* For an anchored pattern, or an unchored pattern that has a first char, or a
494multiline pattern that matches only at "line starts", no further processing at
495present. */
496
497if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
498 return NULL;
499
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000500/* See if we can find a fixed set of initial characters for the pattern. */
501
Guido van Rossum50700601997-12-08 17:15:20 +0000502memset(start_bits, 0, 32 * sizeof(uschar));
503if (!set_start_bits(re->code, start_bits)) return NULL;
504
505/* If this studying is caseless, scan the created bit map and duplicate the
506bits for any letters. */
507
508if (caseless)
509 {
510 register int c;
511 for (c = 0; c < 256; c++)
512 {
513 if ((start_bits[c/8] & (1 << (c&7))) != 0 &&
514 (pcre_ctypes[c] & ctype_letter) != 0)
515 {
516 int d = pcre_fcc[c];
517 start_bits[d/8] |= (1 << (d&7));
518 }
519 }
520 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000521
522/* Get an "extra" block and put the information therein. */
523
524extra = (real_pcre_extra *)(pcre_malloc)(sizeof(real_pcre_extra));
525
526if (extra == NULL)
527 {
528 *errorptr = "failed to get memory";
529 return NULL;
530 }
Guido van Rossum50700601997-12-08 17:15:20 +0000531
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000532extra->options = PCRE_STUDY_MAPPED | (caseless? PCRE_STUDY_CASELESS : 0);
Guido van Rossum50700601997-12-08 17:15:20 +0000533memcpy(extra->start_bits, start_bits, sizeof(start_bits));
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000534
535return (pcre_extra *)extra;
536}
537
Guido van Rossum50700601997-12-08 17:15:20 +0000538/* End of study.c */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000539/*************************************************
540* Perl-Compatible Regular Expressions *
541*************************************************/
542
543/*
544This is a library of functions to support regular expressions whose syntax
545and semantics are as close as possible to those of the Perl 5 language. See
546the file Tech.Notes for some information on the internals.
547
548Written by: Philip Hazel <ph10@cam.ac.uk>
549
550 Copyright (c) 1997 University of Cambridge
551
552-----------------------------------------------------------------------------
553Permission is granted to anyone to use this software for any purpose on any
554computer system, and to redistribute it freely, subject to the following
555restrictions:
556
5571. This software is distributed in the hope that it will be useful,
558 but WITHOUT ANY WARRANTY; without even the implied warranty of
559 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
560
5612. The origin of this software must not be misrepresented, either by
562 explicit claim or by omission.
563
5643. Altered versions must be plainly marked as such, and must not be
565 misrepresented as being the original software.
566-----------------------------------------------------------------------------
567*/
568
569
570/* Define DEBUG to get debugging output on stdout. */
571
572/* #define DEBUG */
573
Guido van Rossum557dea11997-12-22 22:46:52 +0000574/* Use a macro for debugging printing, 'cause that eliminates the the use
575of #ifdef inline, and there are *still* stupid compilers about that don't like
576indented pre-processor statements. I suppose it's only been 10 years... */
577
578#ifdef DEBUG
579#define DPRINTF(p) printf p
580#else
581#define DPRINTF(p) /*nothing*/
582#endif
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000583
584/* Include the internals header, which itself includes Standard C headers plus
585the external pcre header. */
586
587
Guido van Rossum50700601997-12-08 17:15:20 +0000588
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000589#ifndef Py_eval_input
590/* For Python 1.4, graminit.h has to be explicitly included */
591#define Py_eval_input eval_input
Guido van Rossum50700601997-12-08 17:15:20 +0000592
593#endif /* FOR_PYTHON */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000594
595/* Min and max values for the common repeats; for the maxima, 0 => infinity */
596
597static char rep_min[] = { 0, 0, 1, 1, 0, 0 };
598static char rep_max[] = { 0, 0, 0, 0, 1, 1 };
599
600/* Text forms of OP_ values and things, for debugging */
601
602#ifdef DEBUG
Guido van Rossum58132c61997-12-17 00:24:13 +0000603static const char *OP_names[] = {
604 "End", "\\A", "\\B", "\\b", "\\D", "\\d",
Guido van Rossum50700601997-12-08 17:15:20 +0000605 "\\S", "\\s", "\\W", "\\w", "Cut", "\\Z",
606 "localized \\B", "localized \\b", "localized \\W", "localized \\w",
607 "^", "$", "Any", "chars",
608 "not",
609 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000610 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
611 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
612 "*", "*?", "+", "+?", "?", "??", "{", "{",
Guido van Rossum50700601997-12-08 17:15:20 +0000613 "class", "classL", "Ref",
614 "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", "Once",
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000615 "Brazero", "Braminzero", "Bra"
616};
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000617#endif
618
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000619/* Table for handling escaped characters in the range '0'-'z'. Positive returns
620are simple data values; negative values are for special things like \d and so
621on. Zero means further processing is needed (for things like \x), or the escape
622is invalid. */
623
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000624static short int escapes[] = {
625 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
626 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
627 '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
628 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
629 0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
630 0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
631 '`', 7, -ESC_b, 0, -ESC_d, 0, '\f', 0, /* ` - g */
632 0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
633 0, 0, '\r', -ESC_s, '\t', 0, '\v', -ESC_w, /* p - w */
634 0, 0, 0 /* x - z */
635};
636
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000637/* Definition to allow mutual recursion */
638
Guido van Rossum58132c61997-12-17 00:24:13 +0000639static BOOL compile_regex(int, int *, uschar **, const uschar **,
640 const char **, PyObject *);
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000641
642/* Structure for passing "static" information around between the functions
643doing the matching, so that they are thread-safe. */
644
645typedef struct match_data {
646 int errorcode; /* As it says */
647 int *offset_vector; /* Offset vector */
648 int offset_end; /* One past the end */
649 BOOL offset_overflow; /* Set if too many extractions */
650 BOOL caseless; /* Case-independent flag */
Guido van Rossum50700601997-12-08 17:15:20 +0000651 BOOL runtime_caseless; /* Caseless forced at run time */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000652 BOOL multiline; /* Multiline flag */
Guido van Rossum50700601997-12-08 17:15:20 +0000653 BOOL notbol; /* NOTBOL flag */
654 BOOL noteol; /* NOTEOL flag */
655 BOOL dotall; /* Dot matches any char */
656 BOOL endonly; /* Dollar not before final \n */
Guido van Rossum58132c61997-12-17 00:24:13 +0000657 const uschar *start_subject; /* Start of the subject string */
658 const uschar *end_subject; /* End of the subject string */
Guido van Rossum50700601997-12-08 17:15:20 +0000659 jmp_buf fail_env; /* Environment for longjump() break out */
Guido van Rossum58132c61997-12-17 00:24:13 +0000660 const uschar *end_match_ptr; /* Subject position at end match */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000661 int end_offset_top; /* Highwater mark at end of match */
Guido van Rossum50700601997-12-08 17:15:20 +0000662 jmp_buf error_env; /* For longjmp() if an error occurs deep inside a
663 matching operation */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000664 int length; /* Length of the allocated stacks */
665 int point; /* Point to add next item pushed onto stacks */
666 /* Pointers to the 6 stacks */
667 int *off_num, *offset_top, *r1, *r2;
Guido van Rossum58132c61997-12-17 00:24:13 +0000668 const uschar **eptr, **ecode;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000669} match_data;
670
671
672
673/*************************************************
Guido van Rossum50700601997-12-08 17:15:20 +0000674* Global variables *
675*************************************************/
676
677/* PCRE is thread-clean and doesn't use any global variables in the normal
678sense. However, it calls memory allocation and free functions via the two
679indirections below, which are can be changed by the caller, but are shared
680between all threads. */
681
682void *(*pcre_malloc)(size_t) = malloc;
683void (*pcre_free)(void *) = free;
684
685
686
687
688/*************************************************
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000689* Return version string *
690*************************************************/
691
Guido van Rossum58132c61997-12-17 00:24:13 +0000692const char *
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000693pcre_version(void)
694{
695return PCRE_VERSION;
696}
697
698
699
700
701/*************************************************
702* Return info about a compiled pattern *
703*************************************************/
704
705/* This function picks potentially useful data out of the private
706structure.
707
708Arguments:
709 external_re points to compiled code
710 optptr where to pass back the options
711 first_char where to pass back the first character,
712 or -1 if multiline and all branches start ^,
713 or -2 otherwise
714
715Returns: number of identifying extraction brackets
716 or negative values on error
717*/
718
719int
Guido van Rossum50700601997-12-08 17:15:20 +0000720pcre_info(const pcre *external_re, int *optptr, int *first_char)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000721{
Guido van Rossum58132c61997-12-17 00:24:13 +0000722const real_pcre *re = (real_pcre *)external_re;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000723if (re == NULL) return PCRE_ERROR_NULL;
724if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
725if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);
726if (first_char != NULL)
727 *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
728 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
729return re->top_bracket;
730}
731
732
733
734
735#ifdef DEBUG
736/*************************************************
737* Debugging function to print chars *
738*************************************************/
739
740/* Print a sequence of chars in printable format, stopping at the end of the
741subject if the requested.
742
743Arguments:
744 p points to characters
745 length number to print
746 is_subject TRUE if printing from within md->start_subject
747 md pointer to matching data block, if is_subject is TRUE
748
749Returns: nothing
750*/
751
Guido van Rossum557dea11997-12-22 22:46:52 +0000752static void
753pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000754{
755int c;
756if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
757while (length-- > 0)
758 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
759}
760#endif
761
762
763
764
765/*************************************************
766* Check subpattern for empty operand *
767*************************************************/
768
769/* This function checks a bracketed subpattern to see if any of the paths
770through it could match an empty string. This is used to diagnose an error if
771such a subpattern is followed by a quantifier with an unlimited upper bound.
772
773Argument:
774 code points to the opening bracket
775
776Returns: TRUE or FALSE
777*/
778
779static BOOL
780could_be_empty(uschar *code)
781{
782do {
783 uschar *cc = code + 3;
784
785 /* Scan along the opcodes for this branch; as soon as we find something
786 that matches a non-empty string, break out and advance to test the next
787 branch. If we get to the end of the branch, return TRUE for the whole
788 sub-expression. */
789
790 for (;;)
791 {
792 /* Test an embedded subpattern; if it could not be empty, break the
793 loop. Otherwise carry on in the branch. */
794
Guido van Rossum50700601997-12-08 17:15:20 +0000795 if ((int)(*cc) >= OP_BRA || (int)(*cc) == OP_ONCE)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000796 {
797 if (!could_be_empty(cc)) break;
798 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
799 cc += 3;
800 }
801
802 else switch (*cc)
803 {
804 /* Reached end of a branch: the subpattern may match the empty string */
805
806 case OP_ALT:
807 case OP_KET:
808 case OP_KETRMAX:
809 case OP_KETRMIN:
810 return TRUE;
811
812 /* Skip over assertive subpatterns */
813
814 case OP_ASSERT:
815 case OP_ASSERT_NOT:
816 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
817 cc += 3;
818 break;
819
820 /* Skip over things that don't match chars */
821
822 case OP_SOD:
823 case OP_EOD:
824 case OP_CIRC:
825 case OP_DOLL:
826 case OP_BRAZERO:
827 case OP_BRAMINZERO:
828 case OP_NOT_WORD_BOUNDARY:
829 case OP_WORD_BOUNDARY:
Guido van Rossum50700601997-12-08 17:15:20 +0000830 case OP_NOT_WORD_BOUNDARY_L:
831 case OP_WORD_BOUNDARY_L:
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000832 cc++;
833 break;
834
835 /* Skip over simple repeats with zero lower bound */
836
837 case OP_STAR:
838 case OP_MINSTAR:
839 case OP_QUERY:
840 case OP_MINQUERY:
Guido van Rossum50700601997-12-08 17:15:20 +0000841 case OP_NOTSTAR:
842 case OP_NOTMINSTAR:
843 case OP_NOTQUERY:
844 case OP_NOTMINQUERY:
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000845 case OP_TYPESTAR:
846 case OP_TYPEMINSTAR:
847 case OP_TYPEQUERY:
848 case OP_TYPEMINQUERY:
849 cc += 2;
850 break;
851
852 /* Skip over UPTOs (lower bound is zero) */
853
854 case OP_UPTO:
855 case OP_MINUPTO:
856 case OP_TYPEUPTO:
857 case OP_TYPEMINUPTO:
858 cc += 4;
859 break;
860
861 /* Check a class or a back reference for a zero minimum */
862
863 case OP_CLASS:
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000864 case OP_REF:
Guido van Rossum50700601997-12-08 17:15:20 +0000865 case OP_CLASS_L:
866 switch(*cc)
867 {
868 case (OP_REF): cc += 2; break;
869 case (OP_CLASS): cc += 1+32; break;
870 case (OP_CLASS_L): cc += 1+1+32; break;
871 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000872
873 switch (*cc)
874 {
875 case OP_CRSTAR:
876 case OP_CRMINSTAR:
877 case OP_CRQUERY:
878 case OP_CRMINQUERY:
879 cc++;
880 break;
881
882 case OP_CRRANGE:
883 case OP_CRMINRANGE:
884 if ((cc[1] << 8) + cc[2] != 0) goto NEXT_BRANCH;
885 cc += 3;
886 break;
887
888 default:
889 goto NEXT_BRANCH;
890 }
891 break;
892
893 /* Anything else matches at least one character */
894
895 default:
896 goto NEXT_BRANCH;
897 }
898 }
899
900 NEXT_BRANCH:
901 code += (code[1] << 8) + code[2];
902 }
903while (*code == OP_ALT);
904
905/* No branches match the empty string */
906
907return FALSE;
908}
909
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000910/* Determine the length of a group ID in an expression like
911 (?P<foo_123>...)
912Arguments:
913 ptr pattern position pointer (say that 3 times fast)
914 finalchar the character that will mark the end of the ID
915 errorptr points to the pointer to the error message
916*/
917
918static int
Guido van Rossum58132c61997-12-17 00:24:13 +0000919get_group_id(const uschar *ptr, char finalchar, const char **errorptr)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000920{
Guido van Rossum58132c61997-12-17 00:24:13 +0000921 const uschar *start = ptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000922
923 /* If the first character is not in \w, or is in \w but is a digit,
924 report an error */
925 if (!(pcre_ctypes[*ptr] & ctype_word) ||
926 (pcre_ctypes[*ptr++] & ctype_digit))
927 {
928 *errorptr = "(?P identifier must start with a letter or underscore";
929 return 0;
930 }
931
932 /* Increment ptr until we either hit a null byte, the desired
933 final character, or a non-word character */
934 for(; (*ptr != 0) && (*ptr != finalchar) &&
935 (pcre_ctypes[*ptr] & ctype_word); ptr++)
936 {
Guido van Rossumc3861071997-10-08 02:07:40 +0000937 /* Empty loop body */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000938 }
939 if (*ptr==finalchar)
940 return ptr-start;
941 if (*ptr==0)
942 {
943 *errorptr = "unterminated (?P identifier";
944 return 0;
945 }
946 *errorptr = "illegal character in (?P identifier";
947 return 0;
948}
949
950/*************************************************
951* Handle escapes *
952*************************************************/
953
954/* This function is called when a \ has been encountered. It either returns a
955positive value for a simple escape such as \n, or a negative value which
956encodes one of the more complicated things such as \d. On entry, ptr is
957pointing at the \. On exit, it is on the final character of the escape
958sequence.
959
960Arguments:
961 ptrptr points to the pattern position pointer
962 errorptr points to the pointer to the error message
Guido van Rossum50700601997-12-08 17:15:20 +0000963 bracount number of previous extracting brackets
964 options the options bits
965 isclass TRUE if inside a character class
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000966
967Returns: zero or positive => a data character
968 negative => a special escape sequence
969 on error, errorptr is set
970*/
971
972static int
Guido van Rossum58132c61997-12-17 00:24:13 +0000973check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
974 int options, BOOL isclass)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000975{
Guido van Rossum58132c61997-12-17 00:24:13 +0000976const uschar *ptr = *ptrptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000977int c = *(++ptr) & 255; /* Ensure > 0 on signed-char systems */
978int i;
979
Guido van Rossum50700601997-12-08 17:15:20 +0000980if (c == 0) *errorptr = ERR1;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000981
982/* Digits or letters may have special meaning; all others are literals. */
983
984else if (c < '0' || c > 'z') {}
985
986/* Do an initial lookup in a table. A non-zero result is something that can be
987returned immediately. Otherwise further processing may be required. */
988
989else if ((i = escapes[c - '0']) != 0) c = i;
990
991/* Escapes that need further processing, or are illegal. */
992
Guido van Rossum50700601997-12-08 17:15:20 +0000993else
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000994 {
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000995
Guido van Rossum50700601997-12-08 17:15:20 +0000996 switch (c)
997 {
998 /* The handling of escape sequences consisting of a string of digits
999 starting with one that is not zero is not straightforward. By experiment,
1000 the way Perl works seems to be as follows:
1001
1002 Outside a character class, the digits are read as a decimal number. If the
1003 number is less than 10, or if there are that many previous extracting
1004 left brackets, then it is a back reference. Otherwise, up to three octal
1005 digits are read to form an escaped byte. Thus \123 is likely to be octal
1006 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
1007 value is greater than 377, the least significant 8 bits are taken. Inside a
1008 character class, \ followed by a digit is always an octal number. */
1009
1010 case '1': case '2': case '3': case '4': case '5':
1011 case '6': case '7': case '8': case '9':
1012
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001013 {
1014 /* PYTHON: Try to compute an octal value for a character */
1015 for(c=0, i=0; c!=-1 && ptr[i]!=0 && i<3; i++)
1016 {
1017 if (( pcre_ctypes[ ptr[i] ] & ctype_odigit) != 0)
1018 c = c * 8 + ptr[i]-'0';
1019 else
1020 c = -1; /* Non-octal character */
1021 }
1022 /* Aha! There were 3 octal digits, so it must be a character */
1023 if (c != -1 && i == 3)
1024 {
1025 ptr += i-1;
1026 break;
1027 }
1028 c = ptr[0]; /* Restore the first character after the \ */
1029 c -= '0'; i = 1;
1030 while (i<2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0)
1031 {
1032 c = c * 10 + ptr[1] - '0';
1033 ptr++; i++;
1034 }
1035 if (c > 255 - ESC_REF) *errorptr = "back reference too big";
1036 c = -(ESC_REF + c);
1037 }
1038 break;
1039
Guido van Rossum50700601997-12-08 17:15:20 +00001040 /* \0 always starts an octal number, but we may drop through to here with a
1041 larger first octal digit */
1042
1043 case '0':
1044 c -= '0';
1045 while(i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0 &&
1046 ptr[1] != '8' && ptr[1] != '9')
1047 c = c * 8 + *(++ptr) - '0';
1048 break;
1049
1050 /* Special escapes not starting with a digit are straightforward */
1051
1052 case 'x':
1053 c = 0;
1054 while ( (pcre_ctypes[ptr[1]] & ctype_xdigit) != 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001055 {
Guido van Rossum50700601997-12-08 17:15:20 +00001056 ptr++;
1057 c = c * 16 + pcre_lcc[*ptr] -
1058 (((pcre_ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
1059 c &= 255;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001060 }
1061 break;
1062
1063
Guido van Rossum50700601997-12-08 17:15:20 +00001064 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1065 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
1066 for Perl compatibility, it is a literal. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001067
Guido van Rossum50700601997-12-08 17:15:20 +00001068 default:
1069 if ((options & PCRE_EXTRA) != 0) switch(c)
1070 {
1071 case 'X':
1072 c = -ESC_X; /* This could be a lookup if it ever got into Perl */
1073 break;
1074
1075 default:
1076 *errorptr = ERR3;
1077 break;
1078 }
1079 break;
1080 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001081 }
1082
1083*ptrptr = ptr;
1084return c;
1085}
1086
1087
1088
1089/*************************************************
Guido van Rossum50700601997-12-08 17:15:20 +00001090* Check for counted repeat *
1091*************************************************/
1092
1093/* This function is called when a '{' is encountered in a place where it might
1094start a quantifier. It looks ahead to see if it really is a quantifier or not.
1095It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
1096where the ddds are digits.
1097
1098Arguments:
1099 p pointer to the first char after '{'
1100
1101Returns: TRUE or FALSE
1102*/
1103
1104static BOOL
Guido van Rossum58132c61997-12-17 00:24:13 +00001105is_counted_repeat(const uschar *p)
Guido van Rossum50700601997-12-08 17:15:20 +00001106{
1107if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
1108while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
1109if (*p == '}') return TRUE;
1110
1111if (*p++ != ',') return FALSE;
1112if (*p == '}') return TRUE;
1113
1114if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
1115while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
1116return (*p == '}');
1117}
1118
1119
1120
1121/*************************************************
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001122* Read repeat counts *
1123*************************************************/
1124
Guido van Rossum50700601997-12-08 17:15:20 +00001125/* Read an item of the form {n,m} and return the values. This is called only
1126after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1127so the syntax is guaranteed to be correct, but we need to check the values.
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001128
1129Arguments:
1130 p pointer to first char after '{'
1131 minp pointer to int for min
1132 maxp pointer to int for max
1133 returned as -1 if no max
1134 errorptr points to pointer to error message
1135
1136Returns: pointer to '}' on success;
1137 current ptr on error, with errorptr set
1138*/
1139
Guido van Rossum58132c61997-12-17 00:24:13 +00001140static const uschar *
1141read_repeat_counts(const uschar *p, int *minp, int *maxp, const char **errorptr)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001142{
1143int min = 0;
1144int max = -1;
1145
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001146while ((pcre_ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
1147
1148if (*p == '}') max = min; else
1149 {
Guido van Rossum50700601997-12-08 17:15:20 +00001150 if (*(++p) != '}')
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001151 {
1152 max = 0;
1153 while((pcre_ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001154 if (max < min)
1155 {
Guido van Rossum50700601997-12-08 17:15:20 +00001156 *errorptr = ERR4;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001157 return p;
1158 }
1159 }
1160 }
1161
1162/* Do paranoid checks, then fill in the required variables, and pass back the
1163pointer to the terminating '}'. */
1164
Guido van Rossum50700601997-12-08 17:15:20 +00001165if (min > 65535 || max > 65535)
1166 *errorptr = ERR5;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001167else
1168 {
1169 *minp = min;
1170 *maxp = max;
1171 }
1172return p;
1173}
1174
1175
1176
1177/*************************************************
1178* Compile one branch *
1179*************************************************/
1180
1181/* Scan the pattern, compiling it into the code vector.
1182
1183Arguments:
Guido van Rossum50700601997-12-08 17:15:20 +00001184 options the option bits
1185 bracket points to number of brackets used
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001186 code points to the pointer to the current code point
1187 ptrptr points to the current pattern pointer
1188 errorptr points to pointer to error message
1189
1190Returns: TRUE on success
1191 FALSE, with *errorptr set on error
1192*/
1193
1194static BOOL
Guido van Rossum50700601997-12-08 17:15:20 +00001195compile_branch(int options, int *brackets, uschar **codeptr,
Guido van Rossum58132c61997-12-17 00:24:13 +00001196 const uschar **ptrptr, const char **errorptr, PyObject *dictionary)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001197{
1198int repeat_type, op_type;
1199int repeat_min, repeat_max;
1200int bravalue, length;
1201register int c;
1202register uschar *code = *codeptr;
Guido van Rossum58132c61997-12-17 00:24:13 +00001203const uschar *ptr = *ptrptr;
1204const uschar *oldptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001205uschar *previous = NULL;
Guido van Rossum50700601997-12-08 17:15:20 +00001206uschar class[32];
1207uschar *class_flag; /* Pointer to the single-byte flag for OP_CLASS_L */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001208
1209/* Switch on next character until the end of the branch */
1210
1211for (;; ptr++)
1212 {
Guido van Rossum50700601997-12-08 17:15:20 +00001213 BOOL negate_class;
1214 int class_charcount;
1215 int class_lastchar;
1216
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001217 c = *ptr;
Guido van Rossum50700601997-12-08 17:15:20 +00001218 if ((options & PCRE_EXTENDED) != 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001219 {
1220 if ((pcre_ctypes[c] & ctype_space) != 0) continue;
1221 if (c == '#')
1222 {
1223 while ((c = *(++ptr)) != 0 && c != '\n');
1224 continue;
1225 }
1226 }
1227
1228 switch(c)
1229 {
1230 /* The branch terminates at end of string, |, or ). */
1231
1232 case 0:
1233 case '|':
1234 case ')':
1235 *codeptr = code;
1236 *ptrptr = ptr;
1237 return TRUE;
1238
1239 /* Handle single-character metacharacters */
1240
1241 case '^':
1242 previous = NULL;
1243 *code++ = OP_CIRC;
1244 break;
1245
1246 case '$':
1247 previous = NULL;
1248 *code++ = OP_DOLL;
1249 break;
1250
1251 case '.':
1252 previous = code;
1253 *code++ = OP_ANY;
1254 break;
1255
Guido van Rossum50700601997-12-08 17:15:20 +00001256 /* Character classes. These always build a 32-byte bitmap of the permitted
1257 characters, except in the special case where there is only one character.
1258 For negated classes, we build the map as usual, then invert it at the end.
1259 */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001260
1261 case '[':
Guido van Rossum50700601997-12-08 17:15:20 +00001262 previous = code;
1263 if (options & PCRE_LOCALE)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001264 {
Guido van Rossum50700601997-12-08 17:15:20 +00001265 *code++ = OP_CLASS_L;
1266 /* Set the flag for localized classes (like \w) to 0 */
1267 class_flag = code;
1268 *class_flag = 0;
1269 }
1270 else
1271 {
1272 *code++ = OP_CLASS;
1273 class_flag = NULL;
1274 }
1275
1276 /* If the first character is '^', set the negation flag */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001277
Guido van Rossum50700601997-12-08 17:15:20 +00001278 if ((c = *(++ptr)) == '^')
1279 {
1280 negate_class = TRUE;
1281 c = *(++ptr);
1282 }
1283 else negate_class = FALSE;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001284
Guido van Rossum50700601997-12-08 17:15:20 +00001285 /* Keep a count of chars so that we can optimize the case of just a single
1286 character. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001287
Guido van Rossum50700601997-12-08 17:15:20 +00001288 class_charcount = 0;
1289 class_lastchar = -1;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001290
Guido van Rossum50700601997-12-08 17:15:20 +00001291 /* Initialize the 32-char bit map to all zeros. We have to build the
1292 map in a temporary bit of store, in case the class contains only 1
1293 character, because in that case the compiled code doesn't use the
1294 bit map. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001295
Guido van Rossum50700601997-12-08 17:15:20 +00001296 memset(class, 0, 32 * sizeof(uschar));
1297
1298 /* Process characters until ] is reached. By writing this as a "do" it
1299 means that an initial ] is taken as a data character. */
1300
1301 do
1302 {
1303 if (c == 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001304 {
Guido van Rossum50700601997-12-08 17:15:20 +00001305 *errorptr = ERR6;
1306 goto FAILED;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001307 }
1308
Guido van Rossum50700601997-12-08 17:15:20 +00001309 /* Backslash may introduce a single character, or it may introduce one
1310 of the specials, which just set a flag. Escaped items are checked for
1311 validity in the pre-compiling pass. The sequence \b is a special case.
Guido van Rossum58132c61997-12-17 00:24:13 +00001312 Inside a class (and only there) it is treated as backspace. Elsewhere
Guido van Rossum50700601997-12-08 17:15:20 +00001313 it marks a word boundary. Other escapes have preset maps ready to
1314 or into the one we are building. We assume they have more than one
1315 character in them, so set class_count bigger than one. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001316
Guido van Rossum50700601997-12-08 17:15:20 +00001317 if (c == '\\')
1318 {
1319 c = check_escape(&ptr, errorptr, *brackets, options, TRUE);
1320 if (-c == ESC_b) c = '\b';
1321 else if (c < 0)
1322 {
1323 class_charcount = 10;
1324 switch (-c)
1325 {
1326 case ESC_d:
Guido van Rossum50700601997-12-08 17:15:20 +00001327 {
1328 for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_digit];
1329 }
1330 continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001331
Guido van Rossum50700601997-12-08 17:15:20 +00001332 case ESC_D:
Guido van Rossum50700601997-12-08 17:15:20 +00001333 {
1334 for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_digit];
1335 }
1336 continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001337
Guido van Rossum50700601997-12-08 17:15:20 +00001338 case ESC_w:
1339 if (options & PCRE_LOCALE)
1340 {
1341 *class_flag |= 1;
1342 }
1343 else
1344 {
1345 for (c = 0; c < 32; c++)
1346 class[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
1347 }
1348 continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001349
Guido van Rossum50700601997-12-08 17:15:20 +00001350 case ESC_W:
1351 if (options & PCRE_LOCALE)
1352 {
1353 *class_flag |= 2;
1354 }
1355 else
1356 {
1357 for (c = 0; c < 32; c++)
1358 class[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
1359 }
1360 continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001361
Guido van Rossum50700601997-12-08 17:15:20 +00001362 case ESC_s:
Guido van Rossum50700601997-12-08 17:15:20 +00001363 {
1364 for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_space];
1365 }
1366 continue;
1367
1368 case ESC_S:
Guido van Rossum50700601997-12-08 17:15:20 +00001369 {
1370 for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_space];
1371 }
1372 continue;
1373
1374 default:
1375 *errorptr = ERR7;
1376 goto FAILED;
1377 }
1378 }
1379 /* Fall through if single character */
1380 }
1381
1382 /* A single character may be followed by '-' to form a range. However,
1383 Perl does not permit ']' to be the end of the range. A '-' character
1384 here is treated as a literal. */
1385
1386 if (ptr[1] == '-' && ptr[2] != ']')
1387 {
1388 int d;
1389 ptr += 2;
1390 d = *ptr;
1391
1392 if (d == 0)
1393 {
1394 *errorptr = ERR6;
1395 goto FAILED;
1396 }
1397
1398 /* The second part of a range can be a single-character escape, but
1399 not any of the other escapes. */
1400
1401 if (d == '\\')
1402 {
1403 d = check_escape(&ptr, errorptr, *brackets, options, TRUE);
1404 if (d < 0)
1405 {
1406 if (d == -ESC_b) d = '\b'; else
1407 {
1408 *errorptr = ERR7;
1409 goto FAILED;
1410 }
1411 }
1412 }
1413
1414 if (d < c)
1415 {
1416 *errorptr = ERR8;
1417 goto FAILED;
1418 }
1419
1420 for (; c <= d; c++)
1421 {
1422 class[c/8] |= (1 << (c&7));
1423 if ((options & PCRE_CASELESS) != 0)
1424 {
1425 int uc = pcre_fcc[c]; /* flip case */
1426 class[uc/8] |= (1 << (uc&7));
1427 }
1428 class_charcount++; /* in case a one-char range */
1429 class_lastchar = c;
1430 }
1431 continue; /* Go get the next char in the class */
1432 }
1433
1434 /* Handle a lone single character - we can get here for a normal
1435 non-escape char, or after \ that introduces a single character. */
1436
1437 class [c/8] |= (1 << (c&7));
1438 if ((options & PCRE_CASELESS) != 0)
1439 {
1440 c = pcre_fcc[c]; /* flip case */
1441 class[c/8] |= (1 << (c&7));
1442 }
1443 class_charcount++;
1444 class_lastchar = c;
1445 }
1446
1447 /* Loop until ']' reached; the check for end of string happens inside the
1448 loop. This "while" is the end of the "do" above. */
1449
1450 while ((c = *(++ptr)) != ']');
1451
1452 /* If class_charcount is 1 and class_lastchar is not negative, we saw
1453 precisely one character. This doesn't need the whole 32-byte bit map.
1454 We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
1455 it's negative. */
1456
1457 if (class_charcount == 1 && class_lastchar >= 0)
1458 {
1459 if (negate_class)
1460 {
1461 code[-1] = OP_NOT;
1462 }
1463 else
1464 {
1465 code[-1] = OP_CHARS;
1466 *code++ = 1;
1467 }
1468 *code++ = class_lastchar;
1469 }
1470
1471 /* Otherwise, negate the 32-byte map if necessary, and copy it into
1472 the code vector. */
1473
1474 else
1475 {
1476 /* If this is a localized opcode, bump the code pointer up */
1477 if (class_flag) code++;
1478 if (negate_class)
1479 {
1480 if (class_flag) *class_flag = (*class_flag) ^ 63;
1481 for (c = 0; c < 32; c++) code[c] = ~class[c];
1482 }
1483 else
1484 memcpy(code, class, 32);
1485 code += 32;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001486 }
1487 break;
1488
1489 /* Various kinds of repeat */
1490
1491 case '{':
Guido van Rossum50700601997-12-08 17:15:20 +00001492 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001493 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
1494 if (*errorptr != NULL) goto FAILED;
1495 goto REPEAT;
1496
1497 case '*':
1498 repeat_min = 0;
1499 repeat_max = -1;
1500 goto REPEAT;
1501
1502 case '+':
1503 repeat_min = 1;
1504 repeat_max = -1;
1505 goto REPEAT;
1506
1507 case '?':
1508 repeat_min = 0;
1509 repeat_max = 1;
1510
1511 REPEAT:
1512 if (previous == NULL)
1513 {
Guido van Rossum50700601997-12-08 17:15:20 +00001514 *errorptr = ERR9;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001515 goto FAILED;
1516 }
1517
1518 /* If the next character is '?' this is a minimizing repeat. Advance to the
1519 next character. */
1520
1521 if (ptr[1] == '?') { repeat_type = 1; ptr++; } else repeat_type = 0;
1522
Guido van Rossum50700601997-12-08 17:15:20 +00001523 /* If the maximum is zero then the minimum must also be zero; Perl allows
1524 this case, so we do too - by simply omitting the item altogether. */
1525
1526 if (repeat_max == 0) code = previous;
1527
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001528 /* If previous was a string of characters, chop off the last one and use it
1529 as the subject of the repeat. If there was only one character, we can
1530 abolish the previous item altogether. */
1531
Guido van Rossum50700601997-12-08 17:15:20 +00001532 else if (*previous == OP_CHARS)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001533 {
1534 int len = previous[1];
1535 if (len == 1)
1536 {
1537 c = previous[2];
1538 code = previous;
1539 }
1540 else
1541 {
1542 c = previous[len+1];
1543 previous[1]--;
1544 code--;
1545 }
1546 op_type = 0; /* Use single-char op codes */
1547 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
1548 }
1549
Guido van Rossum50700601997-12-08 17:15:20 +00001550 /* If previous was a single negated character ([^a] or similar), we use
1551 one of the special opcodes, replacing it. The code is shared with single-
1552 character repeats by adding a suitable offset into repeat_type. */
1553
1554 else if ((int)*previous == OP_NOT)
1555 {
1556 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
1557 c = previous[1];
1558 code = previous;
1559 goto OUTPUT_SINGLE_REPEAT;
1560 }
1561
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001562 /* If previous was a character type match (\d or similar), abolish it and
1563 create a suitable repeat item. The code is shared with single-character
1564 repeats by adding a suitable offset into repeat_type. */
1565
Guido van Rossum50700601997-12-08 17:15:20 +00001566 else if ((int)*previous < OP_CIRC || *previous == OP_ANY)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001567 {
1568 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
1569 c = *previous;
1570 code = previous;
1571
1572 OUTPUT_SINGLE_REPEAT:
1573 repeat_type += op_type; /* Combine both values for many cases */
1574
1575 /* A minimum of zero is handled either as the special case * or ?, or as
1576 an UPTO, with the maximum given. */
1577
1578 if (repeat_min == 0)
1579 {
1580 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
1581 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
1582 else
1583 {
1584 *code++ = OP_UPTO + repeat_type;
1585 *code++ = repeat_max >> 8;
1586 *code++ = (repeat_max & 255);
1587 }
1588 }
1589
1590 /* The case {1,} is handled as the special case + */
1591
1592 else if (repeat_min == 1 && repeat_max == -1)
1593 *code++ = OP_PLUS + repeat_type;
1594
1595 /* The case {n,n} is just an EXACT, while the general case {n,m} is
1596 handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
1597
1598 else
1599 {
1600 if (repeat_min != 1)
1601 {
1602 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
1603 *code++ = repeat_min >> 8;
1604 *code++ = (repeat_min & 255);
1605 }
1606
1607 /* If the mininum is 1 and the previous item was a character string,
1608 we either have to put back the item that got cancelled if the string
1609 length was 1, or add the character back onto the end of a longer
1610 string. For a character type nothing need be done; it will just get put
1611 back naturally. */
1612
1613 else if (*previous == OP_CHARS)
1614 {
1615 if (code == previous) code += 2; else previous[1]++;
1616 }
1617
Guido van Rossum557dea11997-12-22 22:46:52 +00001618 /* If the maximum is unlimited, insert an OP_STAR. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001619
Guido van Rossum557dea11997-12-22 22:46:52 +00001620 if (repeat_max < 0)
1621 {
1622 *code++ = c;
1623 *code++ = OP_STAR + repeat_type;
1624 }
1625
1626 /* Else insert an UPTO if the max is greater than the min. */
1627
1628 else if (repeat_max != repeat_min)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001629 {
1630 *code++ = c;
1631 repeat_max -= repeat_min;
1632 *code++ = OP_UPTO + repeat_type;
1633 *code++ = repeat_max >> 8;
1634 *code++ = (repeat_max & 255);
1635 }
1636 }
1637
1638 /* The character or character type itself comes last in all cases. */
1639
1640 *code++ = c;
1641 }
1642
1643 /* If previous was a character class or a back reference, we put the repeat
1644 stuff after it. */
1645
Guido van Rossum50700601997-12-08 17:15:20 +00001646 else if (*previous == OP_CLASS || *previous==OP_CLASS_L || *previous == OP_REF)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001647 {
1648 if (repeat_min == 0 && repeat_max == -1)
1649 *code++ = OP_CRSTAR + repeat_type;
1650 else if (repeat_min == 1 && repeat_max == -1)
1651 *code++ = OP_CRPLUS + repeat_type;
1652 else if (repeat_min == 0 && repeat_max == 1)
1653 *code++ = OP_CRQUERY + repeat_type;
1654 else
1655 {
1656 *code++ = OP_CRRANGE + repeat_type;
1657 *code++ = repeat_min >> 8;
1658 *code++ = repeat_min & 255;
1659 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
1660 *code++ = repeat_max >> 8;
1661 *code++ = repeat_max & 255;
1662 }
1663 }
1664
1665 /* If previous was a bracket group, we may have to replicate it in certain
1666 cases. If the maximum repeat count is unlimited, check that the bracket
1667 group cannot match the empty string, and diagnose an error if it can. */
1668
1669 else if ((int)*previous >= OP_BRA)
1670 {
1671 int i;
Guido van Rossum557dea11997-12-22 22:46:52 +00001672 int len = code - previous;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001673
1674 if (repeat_max == -1 && could_be_empty(previous))
1675 {
Guido van Rossum50700601997-12-08 17:15:20 +00001676 *errorptr = ERR10;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001677 goto FAILED;
1678 }
1679
1680 /* If the minimum is greater than zero, and the maximum is unlimited or
1681 equal to the minimum, the first copy remains where it is, and is
1682 replicated up to the minimum number of times. This case includes the +
1683 repeat, but of course no replication is needed in that case. */
1684
1685 if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))
1686 {
1687 for (i = 1; i < repeat_min; i++)
1688 {
Guido van Rossum557dea11997-12-22 22:46:52 +00001689 memcpy(code, previous, len);
1690 code += len;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001691 }
1692 }
1693
1694 /* If the minimum is zero, stick BRAZERO in front of the first copy.
1695 Then, if there is a fixed upper limit, replicated up to that many times,
1696 sticking BRAZERO in front of all the optional ones. */
1697
1698 else
1699 {
1700 if (repeat_min == 0)
1701 {
Guido van Rossum557dea11997-12-22 22:46:52 +00001702 memmove(previous+1, previous, len);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001703 code++;
1704 *previous++ = OP_BRAZERO + repeat_type;
1705 }
1706
1707 for (i = 1; i < repeat_min; i++)
1708 {
Guido van Rossum557dea11997-12-22 22:46:52 +00001709 memcpy(code, previous, len);
1710 code += len;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001711 }
1712
1713 for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)
1714 {
1715 *code++ = OP_BRAZERO + repeat_type;
Guido van Rossum557dea11997-12-22 22:46:52 +00001716 memcpy(code, previous, len);
1717 code += len;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001718 }
1719 }
1720
1721 /* If the maximum is unlimited, set a repeater in the final copy. */
1722
1723 if (repeat_max == -1) code[-3] = OP_KETRMAX + repeat_type;
1724 }
1725
1726 /* Else there's some kind of shambles */
1727
1728 else
1729 {
Guido van Rossum50700601997-12-08 17:15:20 +00001730 *errorptr = ERR11;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001731 goto FAILED;
1732 }
1733
1734 /* In all case we no longer have a previous item. */
1735
1736 previous = NULL;
1737 break;
1738
1739
1740 /* Start of nested bracket sub-expression, or comment or lookahead.
1741 First deal with special things that can come after a bracket; all are
1742 introduced by ?, and the appearance of any of them means that this is not a
1743 referencing group. They were checked for validity in the first pass over
1744 the string, so we don't have to check for syntax errors here. */
1745
1746 case '(':
1747 previous = code; /* Only real brackets can be repeated */
1748 if (*(++ptr) == '?')
1749 {
1750 bravalue = OP_BRA;
1751
1752 switch (*(++ptr))
1753 {
1754 case '#':
1755 case 'i':
Guido van Rossumbd49ac41997-12-10 23:05:53 +00001756 case 'L':
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001757 case 'm':
1758 case 's':
1759 case 'x':
1760 ptr++;
1761 while (*ptr != ')') ptr++;
1762 previous = NULL;
1763 continue;
1764
1765 case ':': /* Non-extracting bracket */
1766 ptr++;
1767 break;
1768
1769 case '=': /* Assertions can't be repeated */
1770 bravalue = OP_ASSERT;
1771 ptr++;
1772 previous = NULL;
1773 break;
1774
1775 case '!':
1776 bravalue = OP_ASSERT_NOT;
1777 ptr++;
1778 previous = NULL;
1779 break;
1780
1781 case ('P'):
1782 ptr++;
1783 if (*ptr=='<')
1784 {
1785 /* (?P<groupname>...) */
1786 int idlen;
1787 PyObject *string, *intobj;
1788
1789 ptr++;
1790 idlen = get_group_id(ptr, '>', errorptr);
1791 if (*errorptr) {
1792 goto FAILED;
1793 }
Guido van Rossum57ba4f31997-12-02 20:40:28 +00001794 string = PyString_FromStringAndSize((char*)ptr, idlen);
Guido van Rossum50700601997-12-08 17:15:20 +00001795 intobj = PyInt_FromLong( brackets[0] + 1 );
Guido van Rossum58132c61997-12-17 00:24:13 +00001796 if (intobj == NULL || string == NULL)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001797 {
1798 Py_XDECREF(string);
1799 Py_XDECREF(intobj);
1800 *errorptr = "exception raised";
1801 goto FAILED;
1802 }
1803 PyDict_SetItem(dictionary, string, intobj);
Guido van Rossum58132c61997-12-17 00:24:13 +00001804 Py_DECREF(string); Py_DECREF(intobj); /* XXX DECREF commented out! */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001805 ptr += idlen+1; /* Point to rest of expression */
1806 goto do_grouping_bracket;
1807 }
1808 if (*ptr=='=')
1809 {
1810 /* (?P=groupname) */
1811 int idlen, refnum;
1812 PyObject *string, *intobj;
1813
1814 ptr++;
1815 idlen = get_group_id(ptr, ')', errorptr);
1816 if (*errorptr) {
1817 goto FAILED;
1818 }
Guido van Rossum50700601997-12-08 17:15:20 +00001819 string = PyString_FromStringAndSize((char *)ptr, idlen);
Guido van Rossumc3861071997-10-08 02:07:40 +00001820 if (string==NULL) {
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001821 *errorptr = "exception raised";
1822 goto FAILED;
1823 }
1824 intobj = PyDict_GetItem(dictionary, string);
1825 if (intobj==NULL) {
Guido van Rossumc3861071997-10-08 02:07:40 +00001826 Py_DECREF(string);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001827 *errorptr = "?P= group identifier isn't defined";
1828 goto FAILED;
1829 }
1830
1831 refnum = PyInt_AsLong(intobj);
Guido van Rossum1eadb411997-12-15 17:33:24 +00001832 Py_DECREF(string);
Guido van Rossum58132c61997-12-17 00:24:13 +00001833 /* The caller doesn't own the reference to the value
1834 returned from PyDict_GetItem, so intobj is not
1835 DECREF'ed. */
1836
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001837 *code++ = OP_REF;
1838 *code++ = refnum;
1839 /* The continue will cause the top-level for() loop to
1840 be resumed, so ptr will be immediately incremented.
1841 Therefore, the following line adds just idlen, not
1842 idlen+1 */
1843 ptr += idlen;
1844 continue;
1845 }
1846 /* The character after ?P is neither < nor =, so
1847 report an error. Add more Python-extensions here. */
1848 *errorptr="unknown after (?P";
1849 goto FAILED;
1850 break;
Guido van Rossum50700601997-12-08 17:15:20 +00001851
1852 case '>': /* "Match once" brackets */
1853 if ((options & PCRE_EXTRA) != 0) /* Not yet standard */
1854 {
1855 bravalue = OP_ONCE;
1856 ptr++;
1857 previous = NULL;
1858 break;
1859 }
1860 /* Else fall through */
1861
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001862 default:
Guido van Rossum50700601997-12-08 17:15:20 +00001863 *errorptr = ERR12;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001864 goto FAILED;
1865 }
1866 }
1867
1868 /* Else we have a referencing group */
1869
1870 else
1871 {
1872 do_grouping_bracket:
Guido van Rossum50700601997-12-08 17:15:20 +00001873 if (++(*brackets) > EXTRACT_MAX)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001874 {
Guido van Rossum50700601997-12-08 17:15:20 +00001875 *errorptr = ERR13;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001876 goto FAILED;
1877 }
Guido van Rossum50700601997-12-08 17:15:20 +00001878 bravalue = OP_BRA + *brackets;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001879 }
1880
1881 /* Process nested bracketed re; at end pointer is on the bracket. We copy
1882 code into a non-register variable in order to be able to pass its address
1883 because some compilers complain otherwise. */
1884
1885 *code = bravalue;
1886 {
1887 uschar *mcode = code;
Guido van Rossum50700601997-12-08 17:15:20 +00001888 if (!compile_regex(options, brackets, &mcode, &ptr, errorptr, dictionary))
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001889 goto FAILED;
1890 code = mcode;
1891 }
1892
1893 if (*ptr != ')')
1894 {
Guido van Rossum50700601997-12-08 17:15:20 +00001895 *errorptr = ERR14;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001896 goto FAILED;
1897 }
1898 break;
1899
1900 /* Check \ for being a real metacharacter; if not, fall through and handle
1901 it as a data character at the start of a string. Escape items are checked
1902 for validity in the pre-compiling pass. */
1903
1904 case '\\':
1905 oldptr = ptr;
Guido van Rossum50700601997-12-08 17:15:20 +00001906 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001907
1908 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1909 are arranged to be the negation of the corresponding OP_values. For the
1910 back references, the values are ESC_REF plus the reference number. Only
1911 back references and those types that consume a character may be repeated.
1912 We can test for values between ESC_b and ESC_Z for the latter; this may
1913 have to change if any new ones are ever created. */
1914
1915 if (c < 0)
1916 {
1917 if (-c >= ESC_REF)
1918 {
Guido van Rossum50700601997-12-08 17:15:20 +00001919 int refnum = -c - ESC_REF;
1920 if (*brackets < refnum)
1921 {
1922 *errorptr = ERR15;
1923 goto FAILED;
1924 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001925 previous = code;
1926 *code++ = OP_REF;
1927 *code++ = refnum;
1928 }
1929 else
1930 {
Guido van Rossum50700601997-12-08 17:15:20 +00001931 previous = (-c > ESC_b && -c < ESC_X)? code : NULL;
1932 if ( (options & PCRE_LOCALE) != 0)
1933 {
1934 switch (c)
1935 {
1936 case (-ESC_b): c = -OP_WORD_BOUNDARY_L; break;
1937 case (-ESC_B): c = -OP_NOT_WORD_BOUNDARY_L; break;
1938 case (-ESC_w): c = -OP_WORDCHAR_L; break;
1939 case (-ESC_W): c = -OP_NOT_WORDCHAR_L; break;
1940 }
1941 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001942 *code++ = -c;
1943 }
1944 continue;
1945 }
1946
Guido van Rossum58132c61997-12-17 00:24:13 +00001947 /* Data character: Reset and fall through */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001948
1949 ptr = oldptr;
1950 c = '\\';
1951
1952 /* Handle a run of data characters until a metacharacter is encountered.
1953 The first character is guaranteed not to be whitespace or # when the
1954 extended flag is set. */
1955
Guido van Rossum50700601997-12-08 17:15:20 +00001956 NORMAL_CHAR:
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001957 default:
1958 previous = code;
1959 *code = OP_CHARS;
1960 code += 2;
1961 length = 0;
1962
1963 do
1964 {
Guido van Rossum50700601997-12-08 17:15:20 +00001965 if ((options & PCRE_EXTENDED) != 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001966 {
1967 if ((pcre_ctypes[c] & ctype_space) != 0) continue;
1968 if (c == '#')
1969 {
1970 while ((c = *(++ptr)) != 0 && c != '\n');
1971 if (c == 0) break;
1972 continue;
1973 }
1974 }
1975
1976 /* Backslash may introduce a data char or a metacharacter. Escaped items
1977 are checked for validity in the pre-compiling pass. Stop the string
1978 before a metaitem. */
1979
1980 if (c == '\\')
1981 {
1982 oldptr = ptr;
Guido van Rossum50700601997-12-08 17:15:20 +00001983 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001984 if (c < 0) { ptr = oldptr; break; }
1985 }
1986
1987 /* Ordinary character or single-char escape */
1988
1989 *code++ = c;
1990 length++;
1991 }
1992
1993 /* This "while" is the end of the "do" above. */
1994
1995 while (length < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
1996
1997 /* Compute the length and set it in the data vector, and advance to
1998 the next state. */
1999
2000 previous[1] = length;
2001 ptr--;
2002 break;
2003 }
2004 } /* end of big loop */
2005
2006/* Control never reaches here by falling through, only by a goto for all the
2007error states. Pass back the position in the pattern so that it can be displayed
2008to the user for diagnosing the error. */
2009
2010FAILED:
2011*ptrptr = ptr;
2012return FALSE;
2013}
2014
2015
2016
2017
2018/*************************************************
2019* Compile sequence of alternatives *
2020*************************************************/
2021
2022/* On entry, ptr is pointing past the bracket character, but on return
2023it points to the closing bracket, or vertical bar, or end of string.
2024The code variable is pointing at the byte into which the BRA operator has been
2025stored.
2026
2027Argument:
Guido van Rossum50700601997-12-08 17:15:20 +00002028 options the option bits
2029 brackets -> int containing the number of extracting brackets used
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002030 codeptr -> the address of the current code pointer
2031 ptrptr -> the address of the current pattern pointer
2032 errorptr -> pointer to error message
2033
2034Returns: TRUE on success
2035*/
2036
2037static BOOL
Guido van Rossum50700601997-12-08 17:15:20 +00002038compile_regex(int options, int *brackets, uschar **codeptr,
Guido van Rossum58132c61997-12-17 00:24:13 +00002039 const uschar **ptrptr, const char **errorptr, PyObject *dictionary)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002040{
Guido van Rossum58132c61997-12-17 00:24:13 +00002041const uschar *ptr = *ptrptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002042uschar *code = *codeptr;
2043uschar *start_bracket = code;
2044
2045for (;;)
2046 {
2047 int length;
2048 uschar *last_branch = code;
2049
2050 code += 3;
Guido van Rossum50700601997-12-08 17:15:20 +00002051 if (!compile_branch(options, brackets, &code, &ptr, errorptr, dictionary))
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002052 {
2053 *ptrptr = ptr;
2054 return FALSE;
2055 }
2056
2057 /* Fill in the length of the last branch */
2058
2059 length = code - last_branch;
2060 last_branch[1] = length >> 8;
2061 last_branch[2] = length & 255;
2062
2063 /* Reached end of expression, either ')' or end of pattern. Insert a
2064 terminating ket and the length of the whole bracketed item, and return,
2065 leaving the pointer at the terminating char. */
2066
2067 if (*ptr != '|')
2068 {
2069 length = code - start_bracket;
2070 *code++ = OP_KET;
2071 *code++ = length >> 8;
2072 *code++ = length & 255;
2073 *codeptr = code;
2074 *ptrptr = ptr;
2075 return TRUE;
2076 }
2077
2078 /* Another branch follows; insert an "or" node and advance the pointer. */
2079
2080 *code = OP_ALT;
2081 ptr++;
2082 }
2083/* Control never reaches here */
2084}
2085
2086
2087
2088/*************************************************
2089* Check for anchored expression *
2090*************************************************/
2091
2092/* Try to find out if this is an anchored regular expression. Consider each
2093alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
2094all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
2095it's anchored. However, if this is a multiline pattern, then only OP_SOD
2096counts, since OP_CIRC can match in the middle.
2097
2098A branch is also implicitly anchored if it starts with .* because that will try
2099the rest of the pattern at all possible matching points, so there is no point
2100trying them again.
2101
2102Argument: points to start of expression (the bracket)
2103Returns: TRUE or FALSE
2104*/
2105
2106static BOOL
Guido van Rossum58132c61997-12-17 00:24:13 +00002107is_anchored(register const uschar *code, BOOL multiline)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002108{
2109do {
2110 int op = (int)code[3];
Guido van Rossum50700601997-12-08 17:15:20 +00002111 if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002112 { if (!is_anchored(code+3, multiline)) return FALSE; }
2113 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
2114 { if (code[4] != OP_ANY) return FALSE; }
2115 else if (op != OP_SOD && (multiline || op != OP_CIRC)) return FALSE;
2116 code += (code[1] << 8) + code[2];
2117 }
2118while (*code == OP_ALT);
2119return TRUE;
2120}
2121
2122
2123
2124/*************************************************
2125* Check for start with \n line expression *
2126*************************************************/
2127
2128/* This is called for multiline expressions to try to find out if every branch
2129starts with ^ so that "first char" processing can be done to speed things up.
2130
2131Argument: points to start of expression (the bracket)
2132Returns: TRUE or FALSE
2133*/
2134
2135static BOOL
Guido van Rossum58132c61997-12-17 00:24:13 +00002136is_startline(const uschar *code)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002137{
2138do {
2139 if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
2140 { if (!is_startline(code+3)) return FALSE; }
2141 else if (code[3] != OP_CIRC) return FALSE;
2142 code += (code[1] << 8) + code[2];
2143 }
2144while (*code == OP_ALT);
2145return TRUE;
2146}
2147
2148
2149
2150/*************************************************
2151* Check for fixed first char *
2152*************************************************/
2153
2154/* Try to find out if there is a fixed first character. This is called for
2155unanchored expressions, as it speeds up their processing quite considerably.
2156Consider each alternative branch. If they all start with the same char, or with
2157a bracket all of whose alternatives start with the same char (recurse ad lib),
2158then we return that char, otherwise -1.
2159
2160Argument: points to start of expression (the bracket)
2161Returns: -1 or the fixed first char
2162*/
2163
2164static int
2165find_firstchar(uschar *code)
2166{
2167register int c = -1;
2168do
2169 {
2170 register int charoffset = 4;
2171
2172 if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
2173 {
2174 register int d;
2175 if ((d = find_firstchar(code+3)) < 0) return -1;
2176 if (c < 0) c = d; else if (c != d) return -1;
2177 }
2178
2179 else switch(code[3])
2180 {
2181 default:
2182 return -1;
2183
2184 case OP_EXACT: /* Fall through */
2185 charoffset++;
2186
2187 case OP_CHARS: /* Fall through */
2188 charoffset++;
2189
2190 case OP_PLUS:
2191 case OP_MINPLUS:
2192 if (c < 0) c = code[charoffset]; else if (c != code[charoffset]) return -1;
2193 break;
2194 }
2195 code += (code[1] << 8) + code[2];
2196 }
2197while (*code == OP_ALT);
2198return c;
2199}
2200
2201
2202
2203/*************************************************
2204* Compile a Regular Expression *
2205*************************************************/
2206
2207/* This function takes a string and returns a pointer to a block of store
2208holding a compiled version of the expression.
2209
2210Arguments:
2211 pattern the regular expression
2212 options various option bits
2213 errorptr pointer to pointer to error text
2214 erroroffset ptr offset in pattern where error was detected
2215
2216Returns: pointer to compiled data block, or NULL on error,
2217 with errorptr and erroroffset set
2218*/
2219
2220pcre *
Guido van Rossum58132c61997-12-17 00:24:13 +00002221pcre_compile(const char *pattern, int options, const char **errorptr,
Guido van Rossum50700601997-12-08 17:15:20 +00002222 int *erroroffset, PyObject *dictionary)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002223{
2224real_pcre *re;
2225int spaces = 0;
2226int length = 3; /* For initial BRA plus length */
2227int runlength;
2228int c, size;
Guido van Rossum50700601997-12-08 17:15:20 +00002229int bracount = 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002230int brastack[200];
Guido van Rossum50700601997-12-08 17:15:20 +00002231int top_backref = 0;
Guido van Rossum58132c61997-12-17 00:24:13 +00002232unsigned int brastackptr = 0;
2233uschar *code;
2234const uschar *ptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002235
2236#ifdef DEBUG
2237uschar *code_base, *code_end;
2238#endif
2239
Guido van Rossum50700601997-12-08 17:15:20 +00002240/* We can't pass back an error message if errorptr is NULL; I guess the best we
2241can do is just return NULL. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002242
Guido van Rossum50700601997-12-08 17:15:20 +00002243if (errorptr == NULL) return NULL;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002244*errorptr = NULL;
Guido van Rossum50700601997-12-08 17:15:20 +00002245
2246/* However, we can give a message for this error */
2247
2248if (erroroffset == NULL)
2249 {
2250 *errorptr = ERR16;
2251 return NULL;
2252 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002253*erroroffset = 0;
2254
2255if ((options & ~PUBLIC_OPTIONS) != 0)
2256 {
Guido van Rossum50700601997-12-08 17:15:20 +00002257 *errorptr = ERR17;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002258 return NULL;
2259 }
2260
Guido van Rossum557dea11997-12-22 22:46:52 +00002261DPRINTF(("------------------------------------------------------------------\n"));
2262DPRINTF(("%s\n", pattern));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002263
2264/* The first thing to do is to make a pass over the pattern to compute the
2265amount of store required to hold the compiled code. This does not have to be
2266perfect as long as errors are overestimates. At the same time we can detect any
2267internal flag settings. Make an attempt to correct for any counted white space
2268if an "extended" flag setting appears late in the pattern. We can't be so
2269clever for #-comments. */
2270
Guido van Rossum58132c61997-12-17 00:24:13 +00002271ptr = (const uschar *)(pattern - 1);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002272while ((c = *(++ptr)) != 0)
2273 {
Guido van Rossum50700601997-12-08 17:15:20 +00002274 int min, max;
2275 int class_charcount;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002276
2277 if ((pcre_ctypes[c] & ctype_space) != 0)
2278 {
Guido van Rossum50700601997-12-08 17:15:20 +00002279 if ((options & PCRE_EXTENDED) != 0) continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002280 spaces++;
2281 }
2282
Guido van Rossum50700601997-12-08 17:15:20 +00002283 if (c == '#' && (options & PCRE_EXTENDED) != 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002284 {
2285 while ((c = *(++ptr)) != 0 && c != '\n');
2286 continue;
2287 }
2288
2289 switch(c)
2290 {
2291 /* A backslashed item may be an escaped "normal" character or a
2292 character type. For a "normal" character, put the pointers and
2293 character back so that tests for whitespace etc. in the input
2294 are done correctly. */
2295
2296 case '\\':
2297 {
Guido van Rossum58132c61997-12-17 00:24:13 +00002298 const uschar *save_ptr = ptr;
Guido van Rossum50700601997-12-08 17:15:20 +00002299 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002300 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2301 if (c >= 0)
2302 {
2303 ptr = save_ptr;
2304 c = '\\';
2305 goto NORMAL_CHAR;
2306 }
2307 }
2308 length++;
2309
2310 /* A back reference needs an additional char, plus either one or 5
Guido van Rossum50700601997-12-08 17:15:20 +00002311 bytes for a repeat. We also need to keep the value of the highest
2312 back reference. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002313
2314 if (c <= -ESC_REF)
2315 {
Guido van Rossum50700601997-12-08 17:15:20 +00002316 int refnum = -c - ESC_REF;
2317 if (refnum > top_backref) top_backref = refnum;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002318 length++; /* For single back reference */
Guido van Rossum50700601997-12-08 17:15:20 +00002319 if (ptr[1] == '{' && is_counted_repeat(ptr+2))
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002320 {
2321 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
2322 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2323 if ((min == 0 && (max == 1 || max == -1)) ||
2324 (min == 1 && max == -1))
2325 length++;
2326 else length += 5;
2327 if (ptr[1] == '?') ptr++;
2328 }
2329 }
2330 continue;
2331
2332 case '^':
2333 case '.':
2334 case '$':
2335 case '*': /* These repeats won't be after brackets; */
2336 case '+': /* those are handled separately */
2337 case '?':
2338 length++;
2339 continue;
2340
2341 /* This covers the cases of repeats after a single char, metachar, class,
2342 or back reference. */
2343
2344 case '{':
Guido van Rossum50700601997-12-08 17:15:20 +00002345 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002346 ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);
2347 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2348 if ((min == 0 && (max == 1 || max == -1)) ||
2349 (min == 1 && max == -1))
2350 length++;
2351 else
2352 {
2353 length--; /* Uncount the original char or metachar */
2354 if (min == 1) length++; else if (min > 0) length += 4;
2355 if (max > 0) length += 4; else length += 2;
2356 }
2357 if (ptr[1] == '?') ptr++;
2358 continue;
2359
2360 /* An alternation contains an offset to the next branch or ket. */
2361 case '|':
2362 length += 3;
2363 continue;
2364
Guido van Rossum50700601997-12-08 17:15:20 +00002365 /* A character class uses 33 characters. Don't worry about character types
2366 that aren't allowed in classes - they'll get picked up during the compile.
2367 A character class that contains only one character uses 2 or 3 bytes,
2368 depending on whether it is negated or not. Notice this where we can. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002369
2370 case '[':
Guido van Rossum50700601997-12-08 17:15:20 +00002371 class_charcount = 0;
2372 if (*(++ptr) == '^') ptr++;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002373 do
2374 {
Guido van Rossum50700601997-12-08 17:15:20 +00002375 if (*ptr == '\\')
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002376 {
Guido van Rossum557dea11997-12-22 22:46:52 +00002377 int ch = check_escape(&ptr, errorptr, bracount, options, TRUE);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002378 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
Guido van Rossum557dea11997-12-22 22:46:52 +00002379 if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002380 }
Guido van Rossum50700601997-12-08 17:15:20 +00002381 else class_charcount++;
2382 ptr++;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002383 }
2384 while (*ptr != 0 && *ptr != ']');
2385
Guido van Rossum50700601997-12-08 17:15:20 +00002386 /* Repeats for negated single chars are handled by the general code */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002387
Guido van Rossum50700601997-12-08 17:15:20 +00002388 if (class_charcount == 1) length += 3; else
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002389 {
Guido van Rossum50700601997-12-08 17:15:20 +00002390 length += 33;
2391 if (options & PCRE_LOCALE) length++; /* Add a byte for the localization flag */
2392
2393 /* A repeat needs either 1 or 5 bytes. */
2394
Guido van Rossum557dea11997-12-22 22:46:52 +00002395 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))
Guido van Rossum50700601997-12-08 17:15:20 +00002396 {
2397 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
2398 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2399 if ((min == 0 && (max == 1 || max == -1)) ||
2400 (min == 1 && max == -1))
2401 length++;
2402 else length += 5;
2403 if (ptr[1] == '?') ptr++;
2404 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002405 }
2406 continue;
2407
2408 /* Brackets may be genuine groups or special things */
2409
2410 case '(':
2411
2412 /* Handle special forms of bracket, which all start (? */
2413
2414 if (ptr[1] == '?') switch (c = ptr[2])
2415 {
2416 /* Skip over comments entirely */
2417 case '#':
2418 ptr += 3;
2419 while (*ptr != 0 && *ptr != ')') ptr++;
2420 if (*ptr == 0)
2421 {
Guido van Rossum50700601997-12-08 17:15:20 +00002422 *errorptr = ERR18;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002423 goto PCRE_ERROR_RETURN;
2424 }
2425 continue;
2426
2427 /* Non-referencing groups and lookaheads just move the pointer on, and
Guido van Rossum50700601997-12-08 17:15:20 +00002428 then behave like a non-special bracket, except that they don't increment
2429 the count of extracting brackets. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002430
2431 case ':':
2432 case '=':
2433 case '!':
2434 ptr += 2;
2435 break;
2436
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002437 case ('P'):
2438 {
2439 int idlen;
2440 switch (*ptr++) {
2441 case ('<'):
2442 idlen = get_group_id(ptr++, '>', errorptr);
2443 if (*errorptr) goto PCRE_ERROR_RETURN;
2444 ptr += idlen+1;
2445 break;
2446 case ('='):
2447 idlen = get_group_id(ptr++, ')', errorptr);
2448 if (*errorptr) goto PCRE_ERROR_RETURN;
2449 ptr += idlen+1;
2450 length++;
2451 break;
2452 }
2453 }
2454 break;
2455
Guido van Rossum50700601997-12-08 17:15:20 +00002456 /* Ditto for the "once only" bracket, allowed only if the extra bit
2457 is set. */
2458
2459 case '>':
2460 if ((options & PCRE_EXTRA) != 0)
2461 {
2462 ptr += 2;
2463 break;
2464 }
2465 /* Else fall thourh */
2466
2467 /* Else loop setting valid options until ) is met. Anything else is an
2468 error. */
2469
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002470 default:
2471 ptr += 2;
2472 for (;; ptr++)
2473 {
2474 if ((c = *ptr) == 'i')
2475 {
2476 options |= PCRE_CASELESS;
2477 continue;
2478 }
Guido van Rossumbd49ac41997-12-10 23:05:53 +00002479 else if ((c = *ptr) == 'L')
Guido van Rossum50700601997-12-08 17:15:20 +00002480 {
2481 options |= PCRE_LOCALE;
2482 continue;
2483 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002484 else if ((c = *ptr) == 'm')
2485 {
2486 options |= PCRE_MULTILINE;
2487 continue;
2488 }
Guido van Rossum50700601997-12-08 17:15:20 +00002489 else if (c == 's')
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002490 {
2491 options |= PCRE_DOTALL;
2492 continue;
2493 }
2494 else if (c == 'x')
2495 {
2496 options |= PCRE_EXTENDED;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002497 length -= spaces; /* Already counted spaces */
2498 continue;
2499 }
2500 else if (c == ')') break;
2501
Guido van Rossum50700601997-12-08 17:15:20 +00002502 *errorptr = ERR12;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002503 goto PCRE_ERROR_RETURN;
2504 }
2505 continue; /* End of this bracket handling */
2506 }
2507
Guido van Rossum50700601997-12-08 17:15:20 +00002508 /* Extracting brackets must be counted so we can process escapes in a
2509 Perlish way. */
2510
2511 else bracount++;
2512
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002513 /* Non-special forms of bracket. Save length for computing whole length
2514 at end if there's a repeat that requires duplication of the group. */
2515
2516 if (brastackptr >= sizeof(brastack)/sizeof(int))
2517 {
Guido van Rossum50700601997-12-08 17:15:20 +00002518 *errorptr = ERR19;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002519 goto PCRE_ERROR_RETURN;
2520 }
2521
2522 brastack[brastackptr++] = length;
2523 length += 3;
2524 continue;
2525
2526 /* Handle ket. Look for subsequent max/min; for certain sets of values we
Guido van Rossum557dea11997-12-22 22:46:52 +00002527 have to replicate this bracket up to that many times. If brastackptr is
2528 0 this is an unmatched bracket which will generate an error, but take care
2529 not to try to access brastack[-1]. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002530
2531 case ')':
2532 length += 3;
2533 {
Guido van Rossum557dea11997-12-22 22:46:52 +00002534 int minval = 1;
2535 int maxval = 1;
2536 int duplength = (brastackptr > 0)? length - brastack[--brastackptr] : 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002537
2538 /* Leave ptr at the final char; for read_repeat_counts this happens
2539 automatically; for the others we need an increment. */
2540
Guido van Rossum50700601997-12-08 17:15:20 +00002541 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002542 {
Guido van Rossum557dea11997-12-22 22:46:52 +00002543 ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002544 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2545 }
Guido van Rossum557dea11997-12-22 22:46:52 +00002546 else if (c == '*') { minval = 0; maxval = -1; ptr++; }
2547 else if (c == '+') { maxval = -1; ptr++; }
2548 else if (c == '?') { minval = 0; ptr++; }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002549
Guido van Rossum557dea11997-12-22 22:46:52 +00002550 /* If there is a minimum > 1 we have to replicate up to minval-1 times;
2551 if there is a limited maximum we have to replicate up to maxval-1 times
2552 and allow for a BRAZERO item before each optional copy, as we also have
2553 to do before the first copy if the minimum is zero. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002554
Guido van Rossum557dea11997-12-22 22:46:52 +00002555 if (minval == 0) length++;
2556 else if (minval > 1) length += (minval - 1) * duplength;
2557 if (maxval > minval) length += (maxval - minval) * (duplength + 1);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002558 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002559 continue;
2560
2561 /* Non-special character. For a run of such characters the length required
2562 is the number of characters + 2, except that the maximum run length is 255.
2563 We won't get a skipped space or a non-data escape or the start of a #
2564 comment as the first character, so the length can't be zero. */
2565
2566 NORMAL_CHAR:
2567 default:
2568 length += 2;
2569 runlength = 0;
2570 do
2571 {
2572 if ((pcre_ctypes[c] & ctype_space) != 0)
2573 {
Guido van Rossum50700601997-12-08 17:15:20 +00002574 if ((options & PCRE_EXTENDED) != 0) continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002575 spaces++;
2576 }
2577
Guido van Rossum50700601997-12-08 17:15:20 +00002578 if (c == '#' && (options & PCRE_EXTENDED) != 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002579 {
2580 while ((c = *(++ptr)) != 0 && c != '\n');
2581 continue;
2582 }
2583
2584 /* Backslash may introduce a data char or a metacharacter; stop the
2585 string before the latter. */
2586
2587 if (c == '\\')
2588 {
Guido van Rossum58132c61997-12-17 00:24:13 +00002589 const uschar *saveptr = ptr;
Guido van Rossum50700601997-12-08 17:15:20 +00002590 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002591 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2592 if (c < 0) { ptr = saveptr; break; }
2593 }
2594
2595 /* Ordinary character or single-char escape */
2596
2597 runlength++;
2598 }
2599
2600 /* This "while" is the end of the "do" above. */
2601
2602 while (runlength < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
2603
2604 ptr--;
2605 length += runlength;
2606 continue;
2607 }
2608 }
2609
2610length += 4; /* For final KET and END */
2611
2612if (length > 65539)
2613 {
Guido van Rossum50700601997-12-08 17:15:20 +00002614 *errorptr = ERR20;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002615 return NULL;
2616 }
2617
2618/* Compute the size of data block needed and get it, either from malloc or
Guido van Rossum557dea11997-12-22 22:46:52 +00002619externally provided function. We specify "code[0]" in the offsetof() expression
2620rather than just "code", because it has been reported that one broken compiler
2621fails on "code" because it is also an independent variable. It should make no
2622difference to the value of the offsetof(). */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002623
Guido van Rossum557dea11997-12-22 22:46:52 +00002624size = length + offsetof(real_pcre, code[0]);
Guido van Rossum50700601997-12-08 17:15:20 +00002625re = (real_pcre *)(pcre_malloc)(size+50);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002626
2627if (re == NULL)
2628 {
Guido van Rossum50700601997-12-08 17:15:20 +00002629 *errorptr = ERR21;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002630 return NULL;
2631 }
2632
Guido van Rossum557dea11997-12-22 22:46:52 +00002633/* Put in the magic number and the options. */
2634
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002635re->magic_number = MAGIC_NUMBER;
2636re->options = options;
2637
2638/* Set up a starting, non-extracting bracket, then compile the expression. On
2639error, *errorptr will be set non-NULL, so we don't need to look at the result
2640of the function here. */
2641
Guido van Rossum58132c61997-12-17 00:24:13 +00002642ptr = (const uschar *)pattern;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002643code = re->code;
2644*code = OP_BRA;
Guido van Rossum50700601997-12-08 17:15:20 +00002645bracount = 0;
2646(void)compile_regex(options, &bracount, &code, &ptr, errorptr, dictionary);
2647re->top_bracket = bracount;
2648re->top_backref = top_backref;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002649
2650/* If not reached end of pattern on success, there's an excess bracket. */
2651
Guido van Rossum50700601997-12-08 17:15:20 +00002652if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
2653
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002654/* Fill in the terminating state and check for disastrous overflow, but
2655if debugging, leave the test till after things are printed out. */
2656
2657*code++ = OP_END;
2658
Guido van Rossum50700601997-12-08 17:15:20 +00002659
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002660#ifndef DEBUG
Guido van Rossum50700601997-12-08 17:15:20 +00002661if (code - re->code > length) *errorptr = ERR23;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002662#endif
2663
2664/* Failed to compile */
2665
2666if (*errorptr != NULL)
2667 {
2668 (pcre_free)(re);
2669 PCRE_ERROR_RETURN:
Guido van Rossum58132c61997-12-17 00:24:13 +00002670 *erroroffset = ptr - (const uschar *)pattern;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002671 return NULL;
2672 }
2673
2674/* If the anchored option was not passed, set flag if we can determine that it
2675is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if
2676we can determine what the first character has to be, because that speeds up
2677unanchored matches no end. In the case of multiline matches, an alternative is
2678to set the PCRE_STARTLINE flag if all branches start with ^. */
2679
2680if ((options & PCRE_ANCHORED) == 0)
2681 {
2682 if (is_anchored(re->code, (options & PCRE_MULTILINE) != 0))
2683 re->options |= PCRE_ANCHORED;
2684 else
2685 {
Guido van Rossum557dea11997-12-22 22:46:52 +00002686 int ch = find_firstchar(re->code);
2687 if (ch >= 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002688 {
Guido van Rossum557dea11997-12-22 22:46:52 +00002689 re->first_char = ch;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002690 re->options |= PCRE_FIRSTSET;
2691 }
2692 else if (is_startline(re->code))
2693 re->options |= PCRE_STARTLINE;
2694 }
2695 }
2696
2697/* Print out the compiled data for debugging */
2698
2699#ifdef DEBUG
2700
Guido van Rossum50700601997-12-08 17:15:20 +00002701printf("Length = %d top_bracket = %d top_backref=%d\n",
2702 length, re->top_bracket, re->top_backref);
2703
2704if (re->options != 0)
2705 {
2706 printf("%s%s%s%s%s%s%s\n",
2707 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
2708 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
2709 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
2710 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
2711 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
2712 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
2713 ((re->options & PCRE_EXTRA) != 0)? "extra " : "");
2714 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002715
2716if ((re->options & PCRE_FIRSTSET) != 0)
2717 {
2718 if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
2719 else printf("First char = \\x%02x\n", re->first_char);
2720 }
2721
2722code_end = code;
2723code_base = code = re->code;
2724
2725while (code < code_end)
2726 {
2727 int charlength;
2728
2729 printf("%3d ", code - code_base);
2730
2731 if (*code >= OP_BRA)
2732 {
2733 printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
2734 code += 2;
2735 }
2736
2737 else switch(*code)
2738 {
2739 case OP_CHARS:
2740 charlength = *(++code);
2741 printf("%3d ", charlength);
2742 while (charlength-- > 0)
2743 if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
2744 break;
2745
2746 case OP_KETRMAX:
2747 case OP_KETRMIN:
2748 case OP_ALT:
2749 case OP_KET:
2750 case OP_ASSERT:
2751 case OP_ASSERT_NOT:
Guido van Rossum50700601997-12-08 17:15:20 +00002752 case OP_ONCE:
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002753 printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2754 code += 2;
2755 break;
2756
2757 case OP_STAR:
2758 case OP_MINSTAR:
2759 case OP_PLUS:
2760 case OP_MINPLUS:
2761 case OP_QUERY:
2762 case OP_MINQUERY:
2763 case OP_TYPESTAR:
2764 case OP_TYPEMINSTAR:
2765 case OP_TYPEPLUS:
2766 case OP_TYPEMINPLUS:
2767 case OP_TYPEQUERY:
2768 case OP_TYPEMINQUERY:
2769 if (*code >= OP_TYPESTAR)
2770 printf(" %s", OP_names[code[1]]);
2771 else if (isprint(c = code[1])) printf(" %c", c);
2772 else printf(" \\x%02x", c);
2773 printf("%s", OP_names[*code++]);
2774 break;
2775
2776 case OP_EXACT:
2777 case OP_UPTO:
2778 case OP_MINUPTO:
2779 if (isprint(c = code[3])) printf(" %c{", c);
2780 else printf(" \\x%02x{", c);
Guido van Rossum557dea11997-12-22 22:46:52 +00002781 if (*code != OP_EXACT) printf("0,");
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002782 printf("%d}", (code[1] << 8) + code[2]);
2783 if (*code == OP_MINUPTO) printf("?");
2784 code += 3;
2785 break;
2786
2787 case OP_TYPEEXACT:
2788 case OP_TYPEUPTO:
2789 case OP_TYPEMINUPTO:
2790 printf(" %s{", OP_names[code[3]]);
2791 if (*code != OP_TYPEEXACT) printf(",");
2792 printf("%d}", (code[1] << 8) + code[2]);
2793 if (*code == OP_TYPEMINUPTO) printf("?");
2794 code += 3;
2795 break;
2796
Guido van Rossum50700601997-12-08 17:15:20 +00002797 case OP_NOT:
2798 if (isprint(c = *(++code))) printf(" [^%c]", c);
2799 else printf(" [^\\x%02x]", c);
2800 break;
2801
2802 case OP_NOTSTAR:
2803 case OP_NOTMINSTAR:
2804 case OP_NOTPLUS:
2805 case OP_NOTMINPLUS:
2806 case OP_NOTQUERY:
2807 case OP_NOTMINQUERY:
2808 if (isprint(c = code[1])) printf(" [^%c]", c);
2809 else printf(" [^\\x%02x]", c);
2810 printf("%s", OP_names[*code++]);
2811 break;
2812
2813 case OP_NOTEXACT:
2814 case OP_NOTUPTO:
2815 case OP_NOTMINUPTO:
2816 if (isprint(c = code[3])) printf(" [^%c]{", c);
2817 else printf(" [^\\x%02x]{", c);
2818 if (*code != OP_NOTEXACT) printf(",");
2819 printf("%d}", (code[1] << 8) + code[2]);
2820 if (*code == OP_NOTMINUPTO) printf("?");
2821 code += 3;
2822 break;
2823
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002824 case OP_REF:
2825 printf(" \\%d", *(++code));
Guido van Rossum557dea11997-12-22 22:46:52 +00002826 code ++;
2827 goto CLASS_REF_REPEAT;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002828
2829 case OP_CLASS:
Guido van Rossum50700601997-12-08 17:15:20 +00002830 case OP_CLASS_L:
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002831 {
2832 int i, min, max;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002833
Guido van Rossum50700601997-12-08 17:15:20 +00002834 if (*code==OP_CLASS_L)
2835 {
2836 code++;
2837 printf("Locflag = %i ", *code++);
2838 }
2839 else
2840 code++;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002841
Guido van Rossum50700601997-12-08 17:15:20 +00002842 printf(" [");
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002843
Guido van Rossum50700601997-12-08 17:15:20 +00002844 for (i = 0; i < 256; i++)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002845 {
Guido van Rossum50700601997-12-08 17:15:20 +00002846 if ((code[i/8] & (1 << (i&7))) != 0)
2847 {
2848 int j;
2849 for (j = i+1; j < 256; j++)
2850 if ((code[j/8] & (1 << (j&7))) == 0) break;
2851 if (i == '-' || i == ']') printf("\\");
2852 if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
2853 if (--j > i)
2854 {
2855 printf("-");
2856 if (j == '-' || j == ']') printf("\\");
2857 if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
2858 }
2859 i = j;
2860 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002861 }
2862 printf("]");
Guido van Rossum50700601997-12-08 17:15:20 +00002863 code += 32;
2864 /* code ++;*/
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002865
Guido van Rossum557dea11997-12-22 22:46:52 +00002866 CLASS_REF_REPEAT:
2867
Guido van Rossum50700601997-12-08 17:15:20 +00002868 switch(*code)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002869 {
2870 case OP_CRSTAR:
2871 case OP_CRMINSTAR:
2872 case OP_CRPLUS:
2873 case OP_CRMINPLUS:
2874 case OP_CRQUERY:
2875 case OP_CRMINQUERY:
2876 printf("%s", OP_names[*code]);
2877 break;
2878
2879 case OP_CRRANGE:
2880 case OP_CRMINRANGE:
2881 min = (code[1] << 8) + code[2];
2882 max = (code[3] << 8) + code[4];
2883 if (max == 0) printf("{%d,}", min);
2884 else printf("{%d,%d}", min, max);
2885 if (*code == OP_CRMINRANGE) printf("?");
2886 code += 4;
2887 break;
2888
2889 default:
2890 code--;
2891 }
2892 }
2893 break;
2894
2895 /* Anything else is just a one-node item */
2896
2897 default:
2898 printf(" %s", OP_names[*code]);
2899 break;
2900 }
2901
2902 code++;
2903 printf("\n");
2904 }
2905printf("------------------------------------------------------------------\n");
2906
2907/* This check is done here in the debugging case so that the code that
2908was compiled can be seen. */
2909
2910if (code - re->code > length)
2911 {
Guido van Rossum50700601997-12-08 17:15:20 +00002912 printf("length=%i, code length=%i\n", length, code-re->code);
2913 *errorptr = ERR23;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002914 (pcre_free)(re);
2915 *erroroffset = ptr - (uschar *)pattern;
2916 return NULL;
2917 }
2918#endif
2919
2920return (pcre *)re;
2921}
2922
2923
2924
2925/*************************************************
2926* Match a character type *
2927*************************************************/
2928
2929/* Not used in all the places it might be as it's sometimes faster
2930to put the code inline.
2931
2932Arguments:
2933 type the character type
2934 c the character
Guido van Rossum50700601997-12-08 17:15:20 +00002935 dotall the dotall flag
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002936
2937Returns: TRUE if character is of the type
2938*/
2939
2940static BOOL
2941match_type(int type, int c, BOOL dotall)
2942{
2943
2944#ifdef DEBUG
2945if (isprint(c)) printf("matching subject %c against ", c);
2946 else printf("matching subject \\x%02x against ", c);
2947printf("%s\n", OP_names[type]);
2948#endif
2949
2950switch(type)
2951 {
2952 case OP_ANY: return dotall || c != '\n';
2953 case OP_NOT_DIGIT: return (pcre_ctypes[c] & ctype_digit) == 0;
2954 case OP_DIGIT: return (pcre_ctypes[c] & ctype_digit) != 0;
2955 case OP_NOT_WHITESPACE: return (pcre_ctypes[c] & ctype_space) == 0;
2956 case OP_WHITESPACE: return (pcre_ctypes[c] & ctype_space) != 0;
2957 case OP_NOT_WORDCHAR: return (pcre_ctypes[c] & ctype_word) == 0;
2958 case OP_WORDCHAR: return (pcre_ctypes[c] & ctype_word) != 0;
Guido van Rossum58132c61997-12-17 00:24:13 +00002959 case OP_NOT_WORDCHAR_L: return (c!='_' && !isalnum(c));
2960 case OP_WORDCHAR_L: return (c=='_' || isalnum(c));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002961 }
2962return FALSE;
2963}
2964
2965
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002966
2967/*************************************************
2968* Match a back-reference *
2969*************************************************/
2970
2971/* If a back reference hasn't been set, the match fails.
2972
2973Arguments:
2974 number reference number
2975 eptr points into the subject
2976 length length to be matched
2977 md points to match data block
2978
2979Returns: TRUE if matched
2980*/
2981
2982static BOOL
Guido van Rossum58132c61997-12-17 00:24:13 +00002983match_ref(int number, register const uschar *eptr, int length, match_data *md)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002984{
Guido van Rossum58132c61997-12-17 00:24:13 +00002985const uschar *p = md->start_subject + md->offset_vector[number];
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002986
2987#ifdef DEBUG
2988if (eptr >= md->end_subject)
2989 printf("matching subject <null>");
2990else
2991 {
2992 printf("matching subject ");
2993 pchars(eptr, length, TRUE, md);
2994 }
2995printf(" against backref ");
2996pchars(p, length, FALSE, md);
2997printf("\n");
2998#endif
2999
3000/* Always fail if not enough characters left */
3001
3002if (length > md->end_subject - p) return FALSE;
3003
Guido van Rossum58132c61997-12-17 00:24:13 +00003004/* Separate the caseless case for speed */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003005
3006if (md->caseless)
3007 { while (length-- > 0) if (pcre_lcc[*p++] != pcre_lcc[*eptr++]) return FALSE; }
3008else
3009 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
3010
3011return TRUE;
3012}
3013
3014static int free_stack(match_data *md)
3015{
3016/* Free any stack space that was allocated by the call to match(). */
3017if (md->off_num) free(md->off_num);
3018if (md->offset_top) free(md->offset_top);
3019if (md->r1) free(md->r1);
3020if (md->r2) free(md->r2);
3021if (md->eptr) free(md->eptr);
Guido van Rossumc3861071997-10-08 02:07:40 +00003022if (md->ecode) free(md->ecode);
3023return 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003024}
3025
3026static int grow_stack(match_data *md)
3027{
Guido van Rossum50700601997-12-08 17:15:20 +00003028 if (md->length != 0)
3029 {
3030 md->length = md->length + md->length/2;
3031 }
3032 else
3033 {
3034 int string_len = md->end_subject - md->start_subject + 1;
3035 if (string_len < 80) {md->length = string_len; }
3036 else {md->length = 80;}
3037 }
3038 PyMem_RESIZE(md->offset_top, int, md->length);
Guido van Rossum58132c61997-12-17 00:24:13 +00003039 PyMem_RESIZE(md->eptr, const uschar *, md->length);
3040 PyMem_RESIZE(md->ecode, const uschar *, md->length);
Guido van Rossum50700601997-12-08 17:15:20 +00003041 PyMem_RESIZE(md->off_num, int, md->length);
3042 PyMem_RESIZE(md->r1, int, md->length);
3043 PyMem_RESIZE(md->r2, int, md->length);
3044 if (md->offset_top == NULL || md->eptr == NULL || md->ecode == NULL ||
3045 md->off_num == NULL || md->r1 == NULL || md->r2 == NULL)
3046 {
3047 PyErr_SetString(PyExc_MemoryError, "Can't increase failure stack for re operation");
3048 longjmp(md->error_env, 1);
3049 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003050 return 0;
3051}
3052
Guido van Rossum50700601997-12-08 17:15:20 +00003053
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003054/*************************************************
3055* Match from current position *
3056*************************************************/
3057
3058/* On entry ecode points to the first opcode, and eptr to the first character.
3059
3060Arguments:
3061 eptr pointer in subject
3062 ecode position in code
3063 offset_top current top pointer
3064 md pointer to "static" info for the match
3065
3066Returns: TRUE if matched
3067*/
3068
3069static BOOL
Guido van Rossum58132c61997-12-17 00:24:13 +00003070match(register const uschar *eptr, register const uschar *ecode, int offset_top,
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003071 match_data *md)
3072{
3073 int save_stack_position = md->point;
3074match_loop:
3075
3076#define SUCCEED goto succeed
3077#define FAIL goto fail
3078
3079for (;;)
3080 {
3081 int min, max, ctype;
3082 register int i;
3083 register int c;
Guido van Rossum58132c61997-12-17 00:24:13 +00003084 BOOL minimize = FALSE;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003085
3086 /* Opening bracket. Check the alternative branches in turn, failing if none
3087 match. We have to set the start offset if required and there is space
3088 in the offset vector so that it is available for subsequent back references
3089 if the bracket matches. However, if the bracket fails, we must put back the
3090 previous value of both offsets in case they were set by a previous copy of
3091 the same bracket. Don't worry about setting the flag for the error case here;
3092 that is handled in the code for KET. */
3093
3094 if ((int)*ecode >= OP_BRA)
3095 {
3096 int number = (*ecode - OP_BRA) << 1;
Guido van Rossum58132c61997-12-17 00:24:13 +00003097 int save_offset1 = 0, save_offset2 = 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003098
Guido van Rossum557dea11997-12-22 22:46:52 +00003099 DPRINTF(("start bracket %d\n", number/2));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003100
3101 if (number > 0 && number < md->offset_end)
3102 {
3103 save_offset1 = md->offset_vector[number];
3104 save_offset2 = md->offset_vector[number+1];
3105 md->offset_vector[number] = eptr - md->start_subject;
3106
Guido van Rossum557dea11997-12-22 22:46:52 +00003107 DPRINTF(("saving %d %d\n", save_offset1, save_offset2));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003108 }
3109
3110 /* Recurse for all the alternatives. */
3111
3112 do
3113 {
3114 if (match(eptr, ecode+3, offset_top, md)) SUCCEED;
3115 ecode += (ecode[1] << 8) + ecode[2];
3116 }
3117 while (*ecode == OP_ALT);
3118
Guido van Rossum557dea11997-12-22 22:46:52 +00003119 DPRINTF(("bracket %d failed\n", number/2));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003120
3121 if (number > 0 && number < md->offset_end)
3122 {
3123 md->offset_vector[number] = save_offset1;
3124 md->offset_vector[number+1] = save_offset2;
3125 }
3126
3127 FAIL;
3128 }
3129
3130 /* Other types of node can be handled by a switch */
3131
3132 switch(*ecode)
3133 {
3134 case OP_END:
3135 md->end_match_ptr = eptr; /* Record where we ended */
3136 md->end_offset_top = offset_top; /* and how many extracts were taken */
3137 SUCCEED;
3138
Guido van Rossum50700601997-12-08 17:15:20 +00003139 /* The equivalent of Prolog's "cut" - if the rest doesn't match, the
3140 whole thing doesn't match, so we have to get out via a longjmp(). */
3141
3142 case OP_CUT:
3143 if (match(eptr, ecode+1, offset_top, md)) SUCCEED;
3144 longjmp(md->fail_env, 1);
3145
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003146 /* Assertion brackets. Check the alternative branches in turn - the
3147 matching won't pass the KET for an assertion. If any one branch matches,
3148 the assertion is true. */
3149
3150 case OP_ASSERT:
3151 do
3152 {
3153 if (match(eptr, ecode+3, offset_top, md)) break;
3154 ecode += (ecode[1] << 8) + ecode[2];
3155 }
3156 while (*ecode == OP_ALT);
3157 if (*ecode == OP_KET) FAIL;
3158
3159 /* Continue from after the assertion, updating the offsets high water
3160 mark, since extracts may have been taken during the assertion. */
3161
3162 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3163 ecode += 3;
3164 offset_top = md->end_offset_top;
3165 continue;
3166
3167 /* Negative assertion: all branches must fail to match */
3168
3169 case OP_ASSERT_NOT:
3170 do
3171 {
3172 if (match(eptr, ecode+3, offset_top, md)) FAIL;
3173 ecode += (ecode[1] << 8) + ecode[2];
3174 }
3175 while (*ecode == OP_ALT);
3176 ecode += 3;
3177 continue;
3178
Guido van Rossum50700601997-12-08 17:15:20 +00003179 /* "Once" brackets are like assertion brackets except that after a match,
3180 the point in the subject string is not moved back. Thus there can never be
3181 a move back into the brackets. Check the alternative branches in turn - the
3182 matching won't pass the KET for this kind of subpattern. If any one branch
3183 matches, we carry on, leaving the subject pointer. */
3184
3185 case OP_ONCE:
3186 do
3187 {
3188 if (match(eptr, ecode+3, offset_top, md)) break;
3189 ecode += (ecode[1] << 8) + ecode[2];
3190 }
3191 while (*ecode == OP_ALT);
Guido van Rossum557dea11997-12-22 22:46:52 +00003192 if (*ecode == OP_KET) FAIL;
Guido van Rossum50700601997-12-08 17:15:20 +00003193
3194 /* Continue as from after the assertion, updating the offsets high water
3195 mark, since extracts may have been taken. */
3196
3197 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3198 ecode += 3;
3199 offset_top = md->end_offset_top;
3200 eptr = md->end_match_ptr;
3201 continue;
3202
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003203 /* An alternation is the end of a branch; scan along to find the end of the
3204 bracketed group and go to there. */
3205
3206 case OP_ALT:
3207 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3208 break;
3209
3210 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
3211 that it may occur zero times. It may repeat infinitely, or not at all -
3212 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
3213 repeat limits are compiled as a number of copies, with the optional ones
3214 preceded by BRAZERO or BRAMINZERO. */
3215
3216 case OP_BRAZERO:
3217 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003218 const uschar *next = ecode+1;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003219 if (match(eptr, next, offset_top, md)) SUCCEED;
3220 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3221 ecode = next + 3;
3222 }
3223 break;
3224
3225 case OP_BRAMINZERO:
3226 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003227 const uschar *next = ecode+1;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003228 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3229 if (match(eptr, next+3, offset_top, md)) SUCCEED;
3230 ecode++;
3231 }
3232 break;;
3233
3234 /* End of a group, repeated or non-repeating. If we are at the end of
3235 an assertion "group", stop matching and SUCCEED, but record the
3236 current high water mark for use by positive assertions. */
3237
3238 case OP_KET:
3239 case OP_KETRMIN:
3240 case OP_KETRMAX:
3241 {
Guido van Rossum50700601997-12-08 17:15:20 +00003242 int number;
Guido van Rossum58132c61997-12-17 00:24:13 +00003243 const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003244
Guido van Rossum50700601997-12-08 17:15:20 +00003245 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT || *prev == OP_ONCE)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003246 {
Guido van Rossum50700601997-12-08 17:15:20 +00003247 md->end_match_ptr = eptr; /* For ONCE */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003248 md->end_offset_top = offset_top;
3249 SUCCEED;
3250 }
3251
3252 /* In all other cases we have to check the group number back at the
3253 start and if necessary complete handling an extraction by setting the
3254 final offset and bumping the high water mark. */
3255
3256 number = (*prev - OP_BRA) << 1;
3257
Guido van Rossum557dea11997-12-22 22:46:52 +00003258 DPRINTF(("end bracket %d\n", number/2));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003259
3260 if (number > 0)
3261 {
3262 if (number >= md->offset_end) md->offset_overflow = TRUE; else
3263 {
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003264 md->offset_vector[number+1] = eptr - md->start_subject;
3265 if (offset_top <= number) offset_top = number + 2;
3266 }
3267 }
3268
3269 /* For a non-repeating ket, just advance to the next node and continue at
3270 this level. */
3271
3272 if (*ecode == OP_KET)
3273 {
3274 ecode += 3;
3275 break;
3276 }
3277
3278 /* The repeating kets try the rest of the pattern or restart from the
3279 preceding bracket, in the appropriate order. */
3280
3281 if (*ecode == OP_KETRMIN)
3282 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003283 const uschar *ptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003284 if (match(eptr, ecode+3, offset_top, md)) goto succeed;
3285 /* Handle alternation inside the BRA...KET; push the additional
Guido van Rossum58132c61997-12-17 00:24:13 +00003286 alternatives onto the stack */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003287 ptr=prev;
3288 do {
3289 ptr += (ptr[1]<<8)+ ptr[2];
3290 if (*ptr==OP_ALT)
3291 {
Guido van Rossum50700601997-12-08 17:15:20 +00003292 if (md->length == md->point)
3293 {
3294 grow_stack(md);
3295 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003296 md->offset_top[md->point] = offset_top;
3297 md->eptr[md->point] = eptr;
3298 md->ecode[md->point] = ptr+3;
3299 md->r1[md->point] = 0;
3300 md->r2[md->point] = 0;
3301 md->off_num[md->point] = 0;
3302 md->point++;
3303 }
3304 } while (*ptr==OP_ALT);
3305 ecode=prev+3; goto match_loop;
3306 }
3307 else /* OP_KETRMAX */
3308 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003309 const uschar *ptr;
3310 /*int points_pushed=0;*/
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003311
3312 /* Push one failure point, that will resume matching at the code after
3313 the KETRMAX opcode. */
Guido van Rossum50700601997-12-08 17:15:20 +00003314 if (md->length == md->point)
3315 {
3316 grow_stack(md);
3317 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003318 md->offset_top[md->point] = offset_top;
3319 md->eptr[md->point] = eptr;
3320 md->ecode[md->point] = ecode+3;
3321 md->r1[md->point] = md->offset_vector[number];
3322 md->r2[md->point] = md->offset_vector[number+1];
3323 md->off_num[md->point] = number;
3324 md->point++;
3325
3326 md->offset_vector[number] = eptr - md->start_subject;
3327 /* Handle alternation inside the BRA...KET; push each of the
Guido van Rossum58132c61997-12-17 00:24:13 +00003328 additional alternatives onto the stack */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003329 ptr=prev;
3330 do {
3331 ptr += (ptr[1]<<8)+ ptr[2];
3332 if (*ptr==OP_ALT)
3333 {
Guido van Rossum50700601997-12-08 17:15:20 +00003334 if (md->length == md->point)
3335 if (md->length == md->point)
3336 {
3337 grow_stack(md);
3338 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003339 md->offset_top[md->point] = offset_top;
3340 md->eptr[md->point] = eptr;
3341 md->ecode[md->point] = ptr+3;
3342 md->r1[md->point] = 0;
3343 md->r2[md->point] = 0;
3344 md->off_num[md->point] = 0;
3345 md->point++;
Guido van Rossum58132c61997-12-17 00:24:13 +00003346 /*points_pushed++;*/
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003347 }
3348 } while (*ptr==OP_ALT);
3349 /* Jump to the first (or only) alternative and resume trying to match */
3350 ecode=prev+3; goto match_loop;
3351 }
3352 }
Guido van Rossum58132c61997-12-17 00:24:13 +00003353 break;
3354
Guido van Rossum50700601997-12-08 17:15:20 +00003355 /* Start of subject unless notbol, or after internal newline if multiline */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003356
3357 case OP_CIRC:
Guido van Rossum50700601997-12-08 17:15:20 +00003358 if (md->notbol && eptr == md->start_subject) FAIL;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003359 if (md->multiline)
3360 {
3361 if (eptr != md->start_subject && eptr[-1] != '\n') FAIL;
3362 ecode++;
3363 break;
3364 }
3365 /* ... else fall through */
3366
3367 /* Start of subject assertion */
3368
3369 case OP_SOD:
3370 if (eptr != md->start_subject) FAIL;
3371 ecode++;
3372 break;
3373
Guido van Rossum50700601997-12-08 17:15:20 +00003374 /* Assert before internal newline if multiline, or before
3375 a terminating newline unless endonly is set, else end of subject unless
3376 noteol is set. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003377
3378 case OP_DOLL:
Guido van Rossum50700601997-12-08 17:15:20 +00003379 if (md->noteol && eptr >= md->end_subject) FAIL;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003380 if (md->multiline)
3381 {
3382 if (eptr < md->end_subject && *eptr != '\n') FAIL;
3383 ecode++;
3384 break;
3385 }
Guido van Rossum50700601997-12-08 17:15:20 +00003386 else if (!md->endonly)
3387 {
3388 if (eptr < md->end_subject - 1 ||
3389 (eptr == md->end_subject - 1 && *eptr != '\n')) FAIL;
3390 ecode++;
3391 break;
3392 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003393 /* ... else fall through */
3394
3395 /* End of subject assertion */
3396
3397 case OP_EOD:
3398 if (eptr < md->end_subject) FAIL;
3399 ecode++;
3400 break;
3401
3402 /* Word boundary assertions */
3403
3404 case OP_NOT_WORD_BOUNDARY:
3405 case OP_WORD_BOUNDARY:
3406 {
3407 BOOL prev_is_word = (eptr != md->start_subject) &&
3408 ((pcre_ctypes[eptr[-1]] & ctype_word) != 0);
3409 BOOL cur_is_word = (eptr < md->end_subject) &&
3410 ((pcre_ctypes[*eptr] & ctype_word) != 0);
3411 if ((*ecode++ == OP_WORD_BOUNDARY)?
3412 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3413 FAIL;
3414 }
3415 break;
3416
Guido van Rossum50700601997-12-08 17:15:20 +00003417 case OP_NOT_WORD_BOUNDARY_L:
3418 case OP_WORD_BOUNDARY_L:
3419 {
3420 BOOL prev_is_word = (eptr != md->start_subject) &&
Guido van Rossum58132c61997-12-17 00:24:13 +00003421 (isalnum(eptr[-1]) || eptr[-1]=='_');
Guido van Rossum50700601997-12-08 17:15:20 +00003422 BOOL cur_is_word = (eptr < md->end_subject) &&
Guido van Rossum58132c61997-12-17 00:24:13 +00003423 (isalnum(*eptr) || *eptr=='_');
Guido van Rossum50700601997-12-08 17:15:20 +00003424 if ((*ecode++ == OP_WORD_BOUNDARY_L)?
3425 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3426 FAIL;
3427 }
3428 break;
3429
3430
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003431 /* Match a single character type; inline for speed */
3432
3433 case OP_ANY:
3434 if (!md->dotall && eptr < md->end_subject && *eptr == '\n') FAIL;
3435 if (eptr++ >= md->end_subject) FAIL;
3436 ecode++;
3437 break;
3438
3439 case OP_NOT_DIGIT:
3440 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) != 0)
3441 FAIL;
3442 ecode++;
3443 break;
3444
3445 case OP_DIGIT:
3446 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) == 0)
3447 FAIL;
3448 ecode++;
3449 break;
3450
3451 case OP_NOT_WHITESPACE:
3452 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) != 0)
3453 FAIL;
3454 ecode++;
3455 break;
3456
3457 case OP_WHITESPACE:
3458 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) == 0)
3459 FAIL;
3460 ecode++;
3461 break;
3462
3463 case OP_NOT_WORDCHAR:
3464 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) != 0)
3465 FAIL;
3466 ecode++;
3467 break;
3468
3469 case OP_WORDCHAR:
3470 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) == 0)
3471 FAIL;
3472 ecode++;
3473 break;
3474
Guido van Rossum50700601997-12-08 17:15:20 +00003475 case OP_NOT_WORDCHAR_L:
Guido van Rossum58132c61997-12-17 00:24:13 +00003476 if (eptr >= md->end_subject || (*eptr=='_' || isalnum(*eptr) ))
Guido van Rossum557dea11997-12-22 22:46:52 +00003477 FAIL;
Guido van Rossum50700601997-12-08 17:15:20 +00003478 eptr++;
3479 ecode++;
3480 break;
3481
3482 case OP_WORDCHAR_L:
Guido van Rossum58132c61997-12-17 00:24:13 +00003483 if (eptr >= md->end_subject || (*eptr!='_' && !isalnum(*eptr) ))
Guido van Rossum557dea11997-12-22 22:46:52 +00003484 FAIL;
Guido van Rossum50700601997-12-08 17:15:20 +00003485 eptr++;
3486 ecode++;
3487 break;
3488
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003489 /* Match a back reference, possibly repeatedly. Look past the end of the
3490 item to see if there is repeat information following. The code is similar
3491 to that for character classes, but repeated for efficiency. Then obey
3492 similar code to character type repeats - written out again for speed.
3493 However, if the referenced string is the empty string, always treat
3494 it as matched, any number of times (otherwise there could be infinite
3495 loops). */
3496
3497 case OP_REF:
3498 {
3499 int length;
3500 int number = ecode[1] << 1; /* Doubled reference number */
3501 ecode += 2; /* Advance past the item */
3502
3503 if (number >= offset_top || md->offset_vector[number] < 0)
3504 {
3505 md->errorcode = PCRE_ERROR_BADREF;
3506 FAIL;
3507 }
3508
3509 length = md->offset_vector[number+1] - md->offset_vector[number];
3510
3511 switch (*ecode)
3512 {
3513 case OP_CRSTAR:
3514 case OP_CRMINSTAR:
3515 case OP_CRPLUS:
3516 case OP_CRMINPLUS:
3517 case OP_CRQUERY:
3518 case OP_CRMINQUERY:
3519 c = *ecode++ - OP_CRSTAR;
3520 minimize = (c & 1) != 0;
3521 min = rep_min[c]; /* Pick up values from tables; */
3522 max = rep_max[c]; /* zero for max => infinity */
3523 if (max == 0) max = INT_MAX;
3524 break;
3525
3526 case OP_CRRANGE:
3527 case OP_CRMINRANGE:
3528 minimize = (*ecode == OP_CRMINRANGE);
3529 min = (ecode[1] << 8) + ecode[2];
3530 max = (ecode[3] << 8) + ecode[4];
3531 if (max == 0) max = INT_MAX;
3532 ecode += 5;
3533 break;
3534
3535 default: /* No repeat follows */
3536 if (!match_ref(number, eptr, length, md)) FAIL;
3537 eptr += length;
3538 continue; /* With the main loop */
3539 }
3540
3541 /* If the length of the reference is zero, just continue with the
3542 main loop. */
3543
3544 if (length == 0) continue;
3545
3546 /* First, ensure the minimum number of matches are present. We get back
3547 the length of the reference string explicitly rather than passing the
3548 address of eptr, so that eptr can be a register variable. */
3549
3550 for (i = 1; i <= min; i++)
3551 {
3552 if (!match_ref(number, eptr, length, md)) FAIL;
3553 eptr += length;
3554 }
3555
3556 /* If min = max, continue at the same level without recursion.
3557 They are not both allowed to be zero. */
3558
3559 if (min == max) continue;
3560
3561 /* If minimizing, keep trying and advancing the pointer */
3562
3563 if (minimize)
3564 {
3565 for (i = min;; i++)
3566 {
3567 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3568 if (i >= max || !match_ref(number, eptr, length, md))
3569 FAIL;
3570 eptr += length;
3571 }
3572 /* Control never gets here */
3573 }
3574
3575 /* If maximizing, find the longest string and work backwards */
3576
3577 else
3578 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003579 const uschar *pp = eptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003580 for (i = min; i < max; i++)
3581 {
3582 if (!match_ref(number, eptr, length, md)) break;
3583 eptr += length;
3584 }
3585 while (eptr >= pp)
3586 {
3587 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3588 eptr -= length;
3589 }
3590 FAIL;
3591 }
3592 }
3593 /* Control never gets here */
3594
3595 /* Match a character class, possibly repeatedly. Look past the end of the
3596 item to see if there is repeat information following. Then obey similar
Guido van Rossum50700601997-12-08 17:15:20 +00003597 code to character type repeats - written out again for speed. If caseless
3598 matching was set at runtime but not at compile time, we have to check both
3599 versions of a character. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003600
3601 case OP_CLASS:
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003602 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003603 const uschar *data = ecode + 1; /* Save for matching */
3604 ecode += 33; /* Advance past the item */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003605
3606 switch (*ecode)
3607 {
3608 case OP_CRSTAR:
3609 case OP_CRMINSTAR:
3610 case OP_CRPLUS:
3611 case OP_CRMINPLUS:
3612 case OP_CRQUERY:
3613 case OP_CRMINQUERY:
3614 c = *ecode++ - OP_CRSTAR;
3615 minimize = (c & 1) != 0;
3616 min = rep_min[c]; /* Pick up values from tables; */
3617 max = rep_max[c]; /* zero for max => infinity */
3618 if (max == 0) max = INT_MAX;
3619 break;
3620
3621 case OP_CRRANGE:
3622 case OP_CRMINRANGE:
3623 minimize = (*ecode == OP_CRMINRANGE);
3624 min = (ecode[1] << 8) + ecode[2];
3625 max = (ecode[3] << 8) + ecode[4];
3626 if (max == 0) max = INT_MAX;
3627 ecode += 5;
3628 break;
3629
3630 default: /* No repeat follows */
Guido van Rossum50700601997-12-08 17:15:20 +00003631 if (eptr >= md->end_subject) FAIL;
3632 c = *eptr++;
3633 if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */
3634 if (md->runtime_caseless)
3635 {
3636 c = pcre_fcc[c];
3637 if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */
3638 }
3639 FAIL;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003640 }
3641
3642 /* First, ensure the minimum number of matches are present. */
3643
3644 for (i = 1; i <= min; i++)
Guido van Rossum50700601997-12-08 17:15:20 +00003645 {
3646 if (eptr >= md->end_subject) FAIL;
3647 c = *eptr++;
3648 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3649 if (md->runtime_caseless)
3650 {
3651 c = pcre_fcc[c];
3652 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3653 }
3654 FAIL;
3655 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003656
3657 /* If max == min we can continue with the main loop without the
3658 need to recurse. */
3659
3660 if (min == max) continue;
3661
3662 /* If minimizing, keep testing the rest of the expression and advancing
3663 the pointer while it matches the class. */
3664
3665 if (minimize)
3666 {
3667 for (i = min;; i++)
3668 {
3669 if (match(eptr, ecode, offset_top, md)) SUCCEED;
Guido van Rossum50700601997-12-08 17:15:20 +00003670 if (i >= max || eptr >= md->end_subject) FAIL;
3671 c = *eptr++;
3672 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3673 if (md->runtime_caseless)
3674 {
3675 c = pcre_fcc[c];
3676 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3677 }
3678 FAIL;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003679 }
3680 /* Control never gets here */
3681 }
3682
3683 /* If maximizing, find the longest possible run, then work backwards. */
3684
3685 else
3686 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003687 const uschar *pp = eptr;
Guido van Rossum50700601997-12-08 17:15:20 +00003688 for (i = min; i < max; eptr++, i++)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003689 {
Guido van Rossum50700601997-12-08 17:15:20 +00003690 if (eptr >= md->end_subject) break;
3691 c = *eptr;
3692 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3693 if (md->runtime_caseless)
3694 {
3695 c = pcre_fcc[c];
3696 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3697 }
3698 break;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003699 }
Guido van Rossum50700601997-12-08 17:15:20 +00003700
3701 while (eptr >= pp)
3702 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
3703 FAIL;
3704 }
3705 }
3706 /* Control never gets here */
3707
3708 /* OP_CLASS_L opcode: handles localized character classes */
3709
3710 case OP_CLASS_L:
3711 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003712 const uschar *data = ecode + 1; /* Save for matching */
3713 const uschar locale_flag = *data;
Guido van Rossum50700601997-12-08 17:15:20 +00003714 ecode++; data++; /* The localization support adds an extra byte */
3715
3716 ecode += 33; /* Advance past the item */
3717
3718 switch (*ecode)
3719 {
3720 case OP_CRSTAR:
3721 case OP_CRMINSTAR:
3722 case OP_CRPLUS:
3723 case OP_CRMINPLUS:
3724 case OP_CRQUERY:
3725 case OP_CRMINQUERY:
3726 c = *ecode++ - OP_CRSTAR;
3727 minimize = (c & 1) != 0;
3728 min = rep_min[c]; /* Pick up values from tables; */
3729 max = rep_max[c]; /* zero for max => infinity */
3730 if (max == 0) max = INT_MAX;
3731 break;
3732
3733 case OP_CRRANGE:
3734 case OP_CRMINRANGE:
3735 minimize = (*ecode == OP_CRMINRANGE);
3736 min = (ecode[1] << 8) + ecode[2];
3737 max = (ecode[3] << 8) + ecode[4];
3738 if (max == 0) max = INT_MAX;
3739 ecode += 5;
3740 break;
3741
3742 default: /* No repeat follows */
3743 if (eptr >= md->end_subject) FAIL;
3744 c = *eptr++;
3745 if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */
Guido van Rossum58132c61997-12-17 00:24:13 +00003746 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3747 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003748#if 0
3749 if ( (locale_flag & 4) && isdigit(c) ) continue; /* Locale \d */
3750 if ( (locale_flag & 8) && !isdigit(c) ) continue; /* Locale \D */
3751 if ( (locale_flag & 16) && isspace(c) ) continue; /* Locale \s */
3752 if ( (locale_flag & 32) && !isspace(c) ) continue; /* Locale \S */
3753#endif
3754
3755 if (md->runtime_caseless)
3756 {
3757 c = pcre_fcc[c];
3758 if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */
3759
Guido van Rossum58132c61997-12-17 00:24:13 +00003760 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3761 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003762 }
3763 FAIL;
3764 }
3765
3766 /* First, ensure the minimum number of matches are present. */
3767
3768 for (i = 1; i <= min; i++)
3769 {
3770 if (eptr >= md->end_subject) FAIL;
3771 c = *eptr++;
3772 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum58132c61997-12-17 00:24:13 +00003773 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3774 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003775
3776 if (md->runtime_caseless)
3777 {
3778 c = pcre_fcc[c];
3779 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum58132c61997-12-17 00:24:13 +00003780 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3781 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003782 }
3783 FAIL;
3784 }
3785
3786 /* If max == min we can continue with the main loop without the
3787 need to recurse. */
3788
3789 if (min == max) continue;
3790
3791 /* If minimizing, keep testing the rest of the expression and advancing
3792 the pointer while it matches the class. */
3793
3794 if (minimize)
3795 {
3796 for (i = min;; i++)
3797 {
3798 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3799 if (i >= max || eptr >= md->end_subject) FAIL;
3800 c = *eptr++;
3801 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum58132c61997-12-17 00:24:13 +00003802 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3803 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003804
3805 if (md->runtime_caseless)
3806 {
3807 c = pcre_fcc[c];
3808 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum58132c61997-12-17 00:24:13 +00003809 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3810 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003811 }
3812 FAIL;
3813 }
3814 /* Control never gets here */
3815 }
3816
3817 /* If maximizing, find the longest possible run, then work backwards. */
3818
3819 else
3820 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003821 const uschar *pp = eptr;
Guido van Rossum50700601997-12-08 17:15:20 +00003822 for (i = min; i < max; eptr++, i++)
3823 {
3824 if (eptr >= md->end_subject) break;
3825 c = *eptr;
3826 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum58132c61997-12-17 00:24:13 +00003827 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3828 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003829 if (md->runtime_caseless)
3830 {
3831 c = pcre_fcc[c];
3832 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum58132c61997-12-17 00:24:13 +00003833 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3834 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003835 }
3836 break;
3837 }
3838
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003839 while (eptr >= pp)
3840 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
3841 FAIL;
3842 }
3843 }
3844 /* Control never gets here */
3845
3846 /* Match a run of characters */
3847
3848 case OP_CHARS:
3849 {
3850 register int length = ecode[1];
3851 ecode += 2;
3852
Guido van Rossum557dea11997-12-22 22:46:52 +00003853#ifdef DEBUG /* Sigh. Some compilers never learn. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003854 if (eptr >= md->end_subject)
3855 printf("matching subject <null> against pattern ");
3856 else
3857 {
3858 printf("matching subject ");
3859 pchars(eptr, length, TRUE, md);
3860 printf(" against pattern ");
3861 }
3862 pchars(ecode, length, FALSE, md);
3863 printf("\n");
Guido van Rossum57ba4f31997-12-02 20:40:28 +00003864#endif
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003865
3866 if (length > md->end_subject - eptr) FAIL;
3867 if (md->caseless)
3868 {
3869 while (length-- > 0) if (pcre_lcc[*ecode++] != pcre_lcc[*eptr++]) FAIL;
3870 }
3871 else
3872 {
3873 while (length-- > 0) if (*ecode++ != *eptr++) FAIL;
3874 }
3875 }
3876 break;
3877
3878 /* Match a single character repeatedly; different opcodes share code. */
3879
3880 case OP_EXACT:
3881 min = max = (ecode[1] << 8) + ecode[2];
3882 ecode += 3;
3883 goto REPEATCHAR;
3884
3885 case OP_UPTO:
3886 case OP_MINUPTO:
3887 min = 0;
3888 max = (ecode[1] << 8) + ecode[2];
3889 minimize = *ecode == OP_MINUPTO;
3890 ecode += 3;
3891 goto REPEATCHAR;
3892
3893 case OP_STAR:
3894 case OP_MINSTAR:
3895 case OP_PLUS:
3896 case OP_MINPLUS:
3897 case OP_QUERY:
3898 case OP_MINQUERY:
3899 c = *ecode++ - OP_STAR;
3900 minimize = (c & 1) != 0;
3901 min = rep_min[c]; /* Pick up values from tables; */
3902 max = rep_max[c]; /* zero for max => infinity */
3903 if (max == 0) max = INT_MAX;
3904
3905 /* Common code for all repeated single-character matches. We can give
3906 up quickly if there are fewer than the minimum number of characters left in
3907 the subject. */
3908
3909 REPEATCHAR:
3910 if (min > md->end_subject - eptr) FAIL;
3911 c = *ecode++;
3912
3913 /* The code is duplicated for the caseless and caseful cases, for speed,
3914 since matching characters is likely to be quite common. First, ensure the
3915 minimum number of matches are present. If min = max, continue at the same
3916 level without recursing. Otherwise, if minimizing, keep trying the rest of
3917 the expression and advancing one matching character if failing, up to the
3918 maximum. Alternatively, if maximizing, find the maximum number of
3919 characters and work backwards. */
3920
Guido van Rossum557dea11997-12-22 22:46:52 +00003921 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
3922 max, eptr));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003923
3924 if (md->caseless)
3925 {
3926 c = pcre_lcc[c];
3927 for (i = 1; i <= min; i++) if (c != pcre_lcc[*eptr++]) FAIL;
3928 if (min == max) continue;
3929 if (minimize)
3930 {
3931 for (i = min;; i++)
3932 {
3933 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3934 if (i >= max || eptr >= md->end_subject || c != pcre_lcc[*eptr++])
3935 FAIL;
3936 }
3937 /* Control never gets here */
3938 }
3939 else
3940 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003941 const uschar *pp = eptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003942 for (i = min; i < max; i++)
3943 {
3944 if (eptr >= md->end_subject || c != pcre_lcc[*eptr]) break;
3945 eptr++;
3946 }
3947 while (eptr >= pp)
3948 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
3949 FAIL;
3950 }
Guido van Rossum50700601997-12-08 17:15:20 +00003951 /* Control never gets here */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003952 }
3953
3954 /* Caseful comparisons */
3955
3956 else
3957 {
3958 for (i = 1; i <= min; i++) if (c != *eptr++) FAIL;
3959 if (min == max) continue;
3960 if (minimize)
3961 {
3962 for (i = min;; i++)
3963 {
3964 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3965 if (i >= max || eptr >= md->end_subject || c != *eptr++) FAIL;
3966 }
3967 /* Control never gets here */
3968 }
3969 else
3970 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003971 const uschar *pp = eptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003972 for (i = min; i < max; i++)
3973 {
3974 if (eptr >= md->end_subject || c != *eptr) break;
3975 eptr++;
3976 }
3977 while (eptr >= pp)
3978 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
3979 FAIL;
3980 }
3981 }
3982 /* Control never gets here */
3983
Guido van Rossum50700601997-12-08 17:15:20 +00003984 /* Match a negated single character */
3985
3986 case OP_NOT:
Guido van Rossum557dea11997-12-22 22:46:52 +00003987 if (eptr >= md->end_subject) FAIL;
Guido van Rossum50700601997-12-08 17:15:20 +00003988 ecode++;
3989 if (md->caseless)
3990 {
3991 if (pcre_lcc[*ecode++] == pcre_lcc[*eptr++]) FAIL;
3992 }
3993 else
3994 {
3995 if (*ecode++ == *eptr++) FAIL;
3996 }
3997 break;
3998
3999 /* Match a negated single character repeatedly. This is almost a repeat of
4000 the code for a repeated single character, but I haven't found a nice way of
4001 commoning these up that doesn't require a test of the positive/negative
4002 option for each character match. Maybe that wouldn't add very much to the
4003 time taken, but character matching *is* what this is all about... */
4004
4005 case OP_NOTEXACT:
4006 min = max = (ecode[1] << 8) + ecode[2];
4007 ecode += 3;
4008 goto REPEATNOTCHAR;
4009
4010 case OP_NOTUPTO:
4011 case OP_NOTMINUPTO:
4012 min = 0;
4013 max = (ecode[1] << 8) + ecode[2];
4014 minimize = *ecode == OP_NOTMINUPTO;
4015 ecode += 3;
4016 goto REPEATNOTCHAR;
4017
4018 case OP_NOTSTAR:
4019 case OP_NOTMINSTAR:
4020 case OP_NOTPLUS:
4021 case OP_NOTMINPLUS:
4022 case OP_NOTQUERY:
4023 case OP_NOTMINQUERY:
4024 c = *ecode++ - OP_NOTSTAR;
4025 minimize = (c & 1) != 0;
4026 min = rep_min[c]; /* Pick up values from tables; */
4027 max = rep_max[c]; /* zero for max => infinity */
4028 if (max == 0) max = INT_MAX;
4029
4030 /* Common code for all repeated single-character matches. We can give
4031 up quickly if there are fewer than the minimum number of characters left in
4032 the subject. */
4033
4034 REPEATNOTCHAR:
4035 if (min > md->end_subject - eptr) FAIL;
4036 c = *ecode++;
4037
4038 /* The code is duplicated for the caseless and caseful cases, for speed,
4039 since matching characters is likely to be quite common. First, ensure the
4040 minimum number of matches are present. If min = max, continue at the same
4041 level without recursing. Otherwise, if minimizing, keep trying the rest of
4042 the expression and advancing one matching character if failing, up to the
4043 maximum. Alternatively, if maximizing, find the maximum number of
4044 characters and work backwards. */
4045
Guido van Rossum557dea11997-12-22 22:46:52 +00004046 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
4047 max, eptr));
Guido van Rossum50700601997-12-08 17:15:20 +00004048
4049 if (md->caseless)
4050 {
4051 c = pcre_lcc[c];
4052 for (i = 1; i <= min; i++) if (c == pcre_lcc[*eptr++]) FAIL;
4053 if (min == max) continue;
4054 if (minimize)
4055 {
4056 for (i = min;; i++)
4057 {
4058 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4059 if (i >= max || eptr >= md->end_subject || c == pcre_lcc[*eptr++])
4060 FAIL;
4061 }
4062 /* Control never gets here */
4063 }
4064 else
4065 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004066 const uschar *pp = eptr;
Guido van Rossum50700601997-12-08 17:15:20 +00004067 for (i = min; i < max; i++)
4068 {
4069 if (eptr >= md->end_subject || c == pcre_lcc[*eptr]) break;
4070 eptr++;
4071 }
4072 while (eptr >= pp)
4073 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4074 FAIL;
4075 }
4076 /* Control never gets here */
4077 }
4078
4079 /* Caseful comparisons */
4080
4081 else
4082 {
4083 for (i = 1; i <= min; i++) if (c == *eptr++) FAIL;
4084 if (min == max) continue;
4085 if (minimize)
4086 {
4087 for (i = min;; i++)
4088 {
4089 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4090 if (i >= max || eptr >= md->end_subject || c == *eptr++) FAIL;
4091 }
4092 /* Control never gets here */
4093 }
4094 else
4095 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004096 const uschar *pp = eptr;
Guido van Rossum50700601997-12-08 17:15:20 +00004097 for (i = min; i < max; i++)
4098 {
4099 if (eptr >= md->end_subject || c == *eptr) break;
4100 eptr++;
4101 }
4102 while (eptr >= pp)
4103 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4104 FAIL;
4105 }
4106 }
4107 /* Control never gets here */
4108
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004109 /* Match a single character type repeatedly; several different opcodes
4110 share code. This is very similar to the code for single characters, but we
4111 repeat it in the interests of efficiency. */
4112
4113 case OP_TYPEEXACT:
4114 min = max = (ecode[1] << 8) + ecode[2];
4115 minimize = TRUE;
4116 ecode += 3;
4117 goto REPEATTYPE;
4118
4119 case OP_TYPEUPTO:
4120 case OP_TYPEMINUPTO:
4121 min = 0;
4122 max = (ecode[1] << 8) + ecode[2];
4123 minimize = *ecode == OP_TYPEMINUPTO;
4124 ecode += 3;
4125 goto REPEATTYPE;
4126
4127 case OP_TYPESTAR:
4128 case OP_TYPEMINSTAR:
4129 case OP_TYPEPLUS:
4130 case OP_TYPEMINPLUS:
4131 case OP_TYPEQUERY:
4132 case OP_TYPEMINQUERY:
4133 c = *ecode++ - OP_TYPESTAR;
4134 minimize = (c & 1) != 0;
4135 min = rep_min[c]; /* Pick up values from tables; */
4136 max = rep_max[c]; /* zero for max => infinity */
4137 if (max == 0) max = INT_MAX;
4138
4139 /* Common code for all repeated single character type matches */
4140
4141 REPEATTYPE:
4142 ctype = *ecode++; /* Code for the character type */
4143
4144 /* First, ensure the minimum number of matches are present. Use inline
4145 code for maximizing the speed, and do the type test once at the start
4146 (i.e. keep it out of the loop). Also test that there are at least the
4147 minimum number of characters before we start. */
4148
4149 if (min > md->end_subject - eptr) FAIL;
4150 if (min > 0) switch(ctype)
4151 {
4152 case OP_ANY:
4153 if (!md->dotall)
4154 { for (i = 1; i <= min; i++) if (*eptr++ == '\n') FAIL; }
4155 else eptr += min;
4156 break;
4157
4158 case OP_NOT_DIGIT:
4159 for (i = 1; i <= min; i++)
4160 if ((pcre_ctypes[*eptr++] & ctype_digit) != 0) FAIL;
4161 break;
4162
4163 case OP_DIGIT:
4164 for (i = 1; i <= min; i++)
4165 if ((pcre_ctypes[*eptr++] & ctype_digit) == 0) FAIL;
4166 break;
4167
4168 case OP_NOT_WHITESPACE:
4169 for (i = 1; i <= min; i++)
4170 if ((pcre_ctypes[*eptr++] & ctype_space) != 0) FAIL;
4171 break;
4172
4173 case OP_WHITESPACE:
4174 for (i = 1; i <= min; i++)
4175 if ((pcre_ctypes[*eptr++] & ctype_space) == 0) FAIL;
4176 break;
4177
4178 case OP_NOT_WORDCHAR:
4179 for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) != 0)
4180 FAIL;
4181 break;
4182
4183 case OP_WORDCHAR:
4184 for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) == 0)
4185 FAIL;
4186 break;
Guido van Rossum50700601997-12-08 17:15:20 +00004187
4188 case OP_NOT_WORDCHAR_L:
Guido van Rossum58132c61997-12-17 00:24:13 +00004189 for (i = 1; i <= min; i++, eptr++) if (*eptr=='_' || isalnum(*eptr))
Guido van Rossum557dea11997-12-22 22:46:52 +00004190 FAIL;
Guido van Rossum50700601997-12-08 17:15:20 +00004191 break;
4192
4193 case OP_WORDCHAR_L:
Guido van Rossum58132c61997-12-17 00:24:13 +00004194 for (i = 1; i <= min; i++, eptr++) if (*eptr!='_' && !isalnum(*eptr))
Guido van Rossum557dea11997-12-22 22:46:52 +00004195 FAIL;
Guido van Rossum50700601997-12-08 17:15:20 +00004196 break;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004197 }
4198
4199 /* If min = max, continue at the same level without recursing */
4200
4201 if (min == max) continue;
4202
4203 /* If minimizing, we have to test the rest of the pattern before each
4204 subsequent match, so inlining isn't much help; just use the function. */
4205
4206 if (minimize)
4207 {
4208 for (i = min;; i++)
4209 {
4210 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4211 if (i >= max || eptr >= md->end_subject ||
4212 !match_type(ctype, *eptr++, md->dotall))
4213 FAIL;
4214 }
4215 /* Control never gets here */
4216 }
4217
4218 /* If maximizing it is worth using inline code for speed, doing the type
4219 test once at the start (i.e. keep it out of the loop). */
4220
4221 else
4222 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004223 const uschar *pp = eptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004224 switch(ctype)
4225 {
4226 case OP_ANY:
4227 if (!md->dotall)
4228 {
4229 for (i = min; i < max; i++)
4230 {
4231 if (eptr >= md->end_subject || *eptr == '\n') break;
4232 eptr++;
4233 }
4234 }
4235 else
4236 {
4237 c = max - min;
4238 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
4239 eptr += c;
4240 }
4241 break;
4242
4243 case OP_NOT_DIGIT:
4244 for (i = min; i < max; i++)
4245 {
4246 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) != 0)
4247 break;
4248 eptr++;
4249 }
4250 break;
4251
4252 case OP_DIGIT:
4253 for (i = min; i < max; i++)
4254 {
4255 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) == 0)
4256 break;
4257 eptr++;
4258 }
4259 break;
4260
4261 case OP_NOT_WHITESPACE:
4262 for (i = min; i < max; i++)
4263 {
4264 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) != 0)
4265 break;
4266 eptr++;
4267 }
4268 break;
4269
4270 case OP_WHITESPACE:
4271 for (i = min; i < max; i++)
4272 {
4273 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) == 0)
4274 break;
4275 eptr++;
4276 }
4277 break;
4278
4279 case OP_NOT_WORDCHAR:
4280 for (i = min; i < max; i++)
4281 {
4282 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) != 0)
4283 break;
4284 eptr++;
4285 }
4286 break;
4287
4288 case OP_WORDCHAR:
4289 for (i = min; i < max; i++)
4290 {
Guido van Rossum50700601997-12-08 17:15:20 +00004291 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) == 0)
4292 break;
4293 eptr++;
4294 }
4295 break;
4296 case OP_NOT_WORDCHAR_L:
4297 for (i = min; i < max; i++)
4298 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004299 if (eptr >= md->end_subject || (*eptr=='_' || isalnum(*eptr) ) )
Guido van Rossum50700601997-12-08 17:15:20 +00004300 break;
4301 eptr++;
4302 }
4303 break;
4304
4305 case OP_WORDCHAR_L:
4306 for (i = min; i < max; i++)
4307 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004308 if (eptr >= md->end_subject || (*eptr!='_' && !isalnum(*eptr) ) )
Guido van Rossum50700601997-12-08 17:15:20 +00004309 break;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004310 eptr++;
4311 }
4312 break;
4313 }
4314
4315 while (eptr >= pp)
4316 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4317 FAIL;
4318 }
4319 /* Control never gets here */
4320
4321 /* There's been some horrible disaster. */
4322
4323 default:
Guido van Rossum557dea11997-12-22 22:46:52 +00004324 DPRINTF(("Unknown opcode %d\n", *ecode));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004325 md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
4326 FAIL;
4327 }
4328
4329 /* Do not stick any code in here without much thought; it is assumed
4330 that "continue" in the code above comes out to here to repeat the main
4331 loop. */
4332
4333 } /* End of main loop */
4334/* Control never reaches here */
4335
4336fail:
4337 if (md->point > save_stack_position)
4338 {
4339 /* If there are still points remaining on the stack, pop the next one off */
Guido van Rossumc3861071997-10-08 02:07:40 +00004340 int off_num;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004341
4342 md->point--;
4343 offset_top = md->offset_top[md->point];
4344 eptr = md->eptr[md->point];
4345 ecode = md->ecode[md->point];
4346 off_num = md->off_num[md->point];
4347 md->offset_vector[off_num] = md->r1[md->point];
4348 md->offset_vector[off_num+1] = md->r2[md->point];
4349 goto match_loop;
4350 }
4351 /* Failure, and nothing left on the stack, so end this function call */
4352
4353 /* Restore the top of the stack to where it was before this function
4354 call. This lets us use one stack for everything; recursive calls
4355 can push and pop information, and may increase the stack. When
4356 the call returns, the parent function can resume pushing and
4357 popping wherever it was. */
4358
4359 md->point = save_stack_position;
4360 return FALSE;
4361
4362succeed:
4363 return TRUE;
4364}
4365
4366
Guido van Rossum50700601997-12-08 17:15:20 +00004367
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004368/*************************************************
Guido van Rossum557dea11997-12-22 22:46:52 +00004369* Segregate setjmp() *
4370*************************************************/
4371
4372/* The -Wall option of gcc gives warnings for all local variables when setjmp()
4373is used, even if the coding conforms to the rules of ANSI C. To avoid this, we
4374hide it in a separate function. This is called only when PCRE_EXTRA is set,
4375since it's needed only for the extension \X option, and with any luck, a good
4376compiler will spot the tail recursion and compile it efficiently.
4377
4378Arguments:
4379 eptr pointer in subject
4380 ecode position in code
4381 offset_top current top pointer
4382 md pointer to "static" info for the match
4383
4384Returns: TRUE if matched
4385*/
4386
4387static BOOL
4388match_with_setjmp(const uschar *eptr, const uschar *ecode, int offset_top,
4389 match_data *match_block)
4390{
4391return setjmp(match_block->fail_env) == 0 &&
4392 match(eptr, ecode, offset_top, match_block);
4393}
4394
4395
4396
4397/*************************************************
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004398* Execute a Regular Expression *
4399*************************************************/
4400
4401/* This function applies a compiled re to a subject string and picks out
4402portions of the string if it matches. Two elements in the vector are set for
4403each substring: the offsets to the start and end of the substring.
4404
4405Arguments:
Guido van Rossum50700601997-12-08 17:15:20 +00004406 external_re points to the compiled expression
4407 external_extra points to "hints" from pcre_study() or is NULL
4408 subject points to the subject string
4409 length length of subject string (may contain binary zeros)
4410 options option bits
4411 offsets points to a vector of ints to be filled in with offsets
4412 offsetcount the number of elements in the vector
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004413
Guido van Rossum50700601997-12-08 17:15:20 +00004414Returns: > 0 => success; value is the number of elements filled in
4415 = 0 => success, but offsets is not big enough
4416 -1 => failed to match
4417 < -1 => some kind of unexpected problem
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004418*/
4419
4420int
Guido van Rossum50700601997-12-08 17:15:20 +00004421pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
4422 const char *subject, int length, int options, int *offsets, int offsetcount)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004423{
Guido van Rossum58132c61997-12-17 00:24:13 +00004424 /* The "volatile" directives are to make gcc -Wall stop complaining
4425 that these variables can be clobbered by the longjmp. Hopefully
4426 they won't cost too much performance. */
Guido van Rossum557dea11997-12-22 22:46:52 +00004427int resetcount, ocount;
4428int first_char = -1;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004429match_data match_block;
Guido van Rossum557dea11997-12-22 22:46:52 +00004430const uschar *start_bits = NULL;
4431const uschar *start_match = (const uschar *)subject;
Guido van Rossum58132c61997-12-17 00:24:13 +00004432const uschar *end_subject;
4433const real_pcre *re = (const real_pcre *)external_re;
4434const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
Guido van Rossum557dea11997-12-22 22:46:52 +00004435BOOL using_temporary_offsets = FALSE;
4436BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4437BOOL startline = (re->options & PCRE_STARTLINE) != 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004438
4439if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4440
4441if (re == NULL || subject == NULL ||
4442 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4443if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
4444
Guido van Rossum58132c61997-12-17 00:24:13 +00004445match_block.start_subject = (const uschar *)subject;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004446match_block.end_subject = match_block.start_subject + length;
4447end_subject = match_block.end_subject;
4448
Guido van Rossum50700601997-12-08 17:15:20 +00004449match_block.caseless = ((re->options | options) & PCRE_CASELESS) != 0;
4450match_block.runtime_caseless = match_block.caseless &&
4451 (re->options & PCRE_CASELESS) == 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004452
Guido van Rossum50700601997-12-08 17:15:20 +00004453match_block.multiline = ((re->options | options) & PCRE_MULTILINE) != 0;
4454match_block.dotall = ((re->options | options) & PCRE_DOTALL) != 0;
4455match_block.endonly = ((re->options | options) & PCRE_DOLLAR_ENDONLY) != 0;
4456
4457match_block.notbol = (options & PCRE_NOTBOL) != 0;
4458match_block.noteol = (options & PCRE_NOTEOL) != 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004459
4460match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
4461
4462/* Set the stack state to empty */
4463 match_block.off_num = match_block.offset_top = NULL;
4464 match_block.r1 = match_block.r2 = NULL;
4465 match_block.eptr = match_block.ecode = NULL;
4466 match_block.point = match_block.length = 0;
4467
Guido van Rossum50700601997-12-08 17:15:20 +00004468/* If the expression has got more back references than the offsets supplied can
4469hold, we get a temporary bit of working store to use during the matching.
Guido van Rossum557dea11997-12-22 22:46:52 +00004470Otherwise, we can use the vector supplied, rounding down its size to a multiple
4471of 2. */
Guido van Rossum50700601997-12-08 17:15:20 +00004472
Guido van Rossum557dea11997-12-22 22:46:52 +00004473ocount = offsetcount & (-2);
4474if (re->top_backref > 0 && re->top_backref >= ocount/2)
Guido van Rossum50700601997-12-08 17:15:20 +00004475 {
4476 ocount = re->top_backref * 2 + 2;
4477 match_block.offset_vector = (pcre_malloc)(ocount * sizeof(int));
4478 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
Guido van Rossum557dea11997-12-22 22:46:52 +00004479 using_temporary_offsets = TRUE;
4480 DPRINTF(("Got memory to hold back references\n"));
Guido van Rossum50700601997-12-08 17:15:20 +00004481 }
4482else match_block.offset_vector = offsets;
4483
4484match_block.offset_end = ocount;
4485match_block.offset_overflow = FALSE;
4486
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004487/* Compute the minimum number of offsets that we need to reset each time. Doing
4488this makes a huge difference to execution time when there aren't many brackets
4489in the pattern. */
4490
4491resetcount = 2 + re->top_bracket * 2;
Guido van Rossum50700601997-12-08 17:15:20 +00004492if (resetcount > offsetcount) resetcount = ocount;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004493
4494/* If MULTILINE is set at exec time but was not set at compile time, and the
4495anchored flag is set, we must re-check because a setting provoked by ^ in the
4496pattern is not right in multi-line mode. Calling is_anchored() again here does
4497the right check, because multiline is now set. If it now yields FALSE, the
4498expression must have had ^ starting some of its branches. Check to see if
4499that is true for *all* branches, and if so, set the startline flag. */
4500
Guido van Rossum557dea11997-12-22 22:46:52 +00004501if (match_block.multiline && anchored && (re->options & PCRE_MULTILINE) == 0 &&
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004502 !is_anchored(re->code, match_block.multiline))
4503 {
4504 anchored = FALSE;
4505 if (is_startline(re->code)) startline = TRUE;
4506 }
4507
4508/* Set up the first character to match, if available. The first_char value is
4509never set for an anchored regular expression, but the anchoring may be forced
4510at run time, so we have to test for anchoring. The first char may be unset for
4511an unanchored pattern, of course. If there's no first char and the pattern was
4512studied, the may be a bitmap of possible first characters. However, we can
4513use this only if the caseless state of the studying was correct. */
4514
4515if (!anchored)
4516 {
4517 if ((re->options & PCRE_FIRSTSET) != 0)
4518 {
4519 first_char = re->first_char;
4520 if (match_block.caseless) first_char = pcre_lcc[first_char];
4521 }
4522 else
4523 if (!startline && extra != NULL &&
4524 (extra->options & PCRE_STUDY_MAPPED) != 0 &&
4525 ((extra->options & PCRE_STUDY_CASELESS) != 0) == match_block.caseless)
4526 start_bits = extra->start_bits;
4527 }
4528
4529/* Loop for unanchored matches; for anchored regexps the loop runs just once. */
4530
4531do
4532 {
Guido van Rossum557dea11997-12-22 22:46:52 +00004533 int rc;
Guido van Rossum50700601997-12-08 17:15:20 +00004534 register int *iptr = match_block.offset_vector;
4535 register int *iend = iptr + resetcount;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004536
4537 /* Reset the maximum number of extractions we might see. */
4538
4539 while (iptr < iend) *iptr++ = -1;
4540
4541 /* Advance to a unique first char if possible */
4542
4543 if (first_char >= 0)
4544 {
4545 if (match_block.caseless)
4546 while (start_match < end_subject && pcre_lcc[*start_match] != first_char)
4547 start_match++;
4548 else
4549 while (start_match < end_subject && *start_match != first_char)
4550 start_match++;
4551 }
4552
4553 /* Or to just after \n for a multiline match if possible */
4554
4555 else if (startline)
4556 {
4557 if (start_match > match_block.start_subject)
4558 {
4559 while (start_match < end_subject && start_match[-1] != '\n')
4560 start_match++;
4561 }
4562 }
4563
4564 /* Or to a non-unique first char */
4565
4566 else if (start_bits != NULL)
4567 {
4568 while (start_match < end_subject)
4569 {
4570 register int c = *start_match;
Guido van Rossum50700601997-12-08 17:15:20 +00004571 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004572 }
4573 }
4574
Guido van Rossum557dea11997-12-22 22:46:52 +00004575#ifdef DEBUG /* Sigh. Some compilers never learn. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004576 printf(">>>> Match against: ");
4577 pchars(start_match, end_subject - start_match, TRUE, &match_block);
4578 printf("\n");
Guido van Rossum57ba4f31997-12-02 20:40:28 +00004579#endif
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004580
4581 /* When a match occurs, substrings will be set for all internal extractions;
4582 we just need to set up the whole thing as substring 0 before returning. If
Guido van Rossum50700601997-12-08 17:15:20 +00004583 there were too many extractions, set the return code to zero. In the case
4584 where we had to get some local store to hold offsets for backreferences, copy
4585 those back references that we can. In this case there need not be overflow
4586 if certain parts of the pattern were not used.
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004587
Guido van Rossum50700601997-12-08 17:15:20 +00004588 Before starting the match, we have to set up a longjmp() target to enable
Guido van Rossum557dea11997-12-22 22:46:52 +00004589 the "cut" operation to fail a match completely without backtracking. This
4590 is done in a separate function to avoid compiler warnings. We need not do
4591 it unless PCRE_EXTRA is set, since only in that case is the "cut" operation
4592 enabled. */
Guido van Rossum50700601997-12-08 17:15:20 +00004593
4594 /* To handle errors such as running out of memory for the failure
4595 stack, we need to save this location via setjmp(), so
4596 error-handling code can call longjmp() to jump out of deeply-nested code. */
4597 if (setjmp(match_block.error_env)==0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004598 {
Guido van Rossum50700601997-12-08 17:15:20 +00004599
Guido van Rossum557dea11997-12-22 22:46:52 +00004600 if ((re->options & PCRE_EXTRA) != 0)
Guido van Rossum50700601997-12-08 17:15:20 +00004601 {
Guido van Rossum557dea11997-12-22 22:46:52 +00004602 if (!match_with_setjmp(start_match, re->code, 2, &match_block))
4603 continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004604 }
Guido van Rossum557dea11997-12-22 22:46:52 +00004605 else if (!match(start_match, re->code, 2, &match_block)) continue;
4606
4607 /* Copy the offset information from temporary store if necessary */
4608
4609 if (using_temporary_offsets)
4610 {
4611 if (offsetcount >= 4)
4612 {
4613 memcpy(offsets + 2, match_block.offset_vector + 2,
4614 (offsetcount - 2) * sizeof(int));
4615 DPRINTF(("Copied offsets from temporary memory\n"));
4616 }
4617 if (match_block.end_offset_top > offsetcount)
4618 match_block.offset_overflow = TRUE;
4619
4620 DPRINTF(("Freeing temporary memory\n"));
4621 (pcre_free)(match_block.offset_vector);
4622 }
4623
4624 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
4625
4626 if (match_block.offset_end < 2) rc = 0; else
4627 {
4628 offsets[0] = start_match - match_block.start_subject;
4629 offsets[1] = match_block.end_match_ptr - match_block.start_subject;
4630 }
4631
4632 DPRINTF((">>>> returning %d\n", rc));
4633 free_stack(&match_block);
4634 return rc;
Guido van Rossum50700601997-12-08 17:15:20 +00004635 } /* End of (if setjmp(match_block.error_env)...) */
4636 /* Return an error code; pcremodule.c will preserve the exception */
4637 if (PyErr_Occurred()) return PCRE_ERROR_NOMEMORY;
4638
4639 free_stack(&match_block);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004640 }
4641while (!anchored &&
4642 match_block.errorcode == PCRE_ERROR_NOMATCH &&
4643 start_match++ < end_subject);
4644
Guido van Rossum557dea11997-12-22 22:46:52 +00004645if (using_temporary_offsets)
4646 {
4647 DPRINTF(("Freeing temporary memory\n"));
4648 (pcre_free)(match_block.offset_vector);
4649 }
4650
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004651#ifdef DEBUG
4652printf(">>>> returning %d\n", match_block.errorcode);
4653#endif
Guido van Rossum50700601997-12-08 17:15:20 +00004654
Guido van Rossum557dea11997-12-22 22:46:52 +00004655 return match_block.errorcode;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004656}
4657
4658/* End of pcre.c */