blob: 69d6c221b1d68f55384cfdbac370eb4b3f643ff7 [file] [log] [blame]
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001
2/*************************************************
3* Perl-Compatible Regular Expressions *
4*************************************************/
5
6/* DO NOT EDIT THIS FILE! */
7
8/* This file is automatically written by the merge-files.py script
9included with the PCRE distribution for Python; it's produced from
10several C files, and code is removed in the process. If you want to
11modify the code or track down bugs, it will be much easier to work
12with the code in its original, multiple-file form. Don't edit this
13file by hand, or submit patches to it.
14
15The Python-specific PCRE distribution can be retrieved from
16 http://starship.skyport.net/crew/amk/regex/
17
Guido van Rossum58132c61997-12-17 00:24:13 +000018The unmodified original PCRE distribution is available at
19ftp://ftp.cus.cam.ac.uk/pub/software/programs/pcre/, and is originally
20written by: Philip Hazel <ph10@cam.ac.uk>
Guido van Rossum51b3aa31997-10-06 14:43:11 +000021
22Extensively modified by the Python String-SIG: <string-sig@python.org>
23Send bug reports to: <string-sig@python.org>
24(They'll figure out if it's a bug in PCRE or in the Python-specific
25changes.)
26
27 Copyright (c) 1997 University of Cambridge
28
29-----------------------------------------------------------------------------
30Permission is granted to anyone to use this software for any purpose on any
31computer system, and to redistribute it freely, subject to the following
32restrictions:
33
341. This software is distributed in the hope that it will be useful,
35 but WITHOUT ANY WARRANTY; without even the implied warranty of
36 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
37
382. The origin of this software must not be misrepresented, either by
39 explicit claim or by omission.
40
413. Altered versions must be plainly marked as such, and must not be
42 misrepresented as being the original software.
43-----------------------------------------------------------------------------
44*/
45
46
47#define FOR_PYTHON
Guido van Rossum58132c61997-12-17 00:24:13 +000048#include "pcre-int.h"
Guido van Rossum51b3aa31997-10-06 14:43:11 +000049#include "Python.h"
Guido van Rossum50700601997-12-08 17:15:20 +000050#include "mymalloc.h"
51#include <ctype.h>
Guido van Rossum51b3aa31997-10-06 14:43:11 +000052#include "graminit.h"
53
54/*************************************************
55* Perl-Compatible Regular Expressions *
56*************************************************/
57
58/* This file is automatically written by the makechartables auxiliary
59program. If you edit it by hand, you might like to edit the Makefile to
60prevent its ever being regenerated. */
61
62/* This table is a lower casing table. */
63
64unsigned char pcre_lcc[] = {
65 0, 1, 2, 3, 4, 5, 6, 7,
66 8, 9, 10, 11, 12, 13, 14, 15,
67 16, 17, 18, 19, 20, 21, 22, 23,
68 24, 25, 26, 27, 28, 29, 30, 31,
69 32, 33, 34, 35, 36, 37, 38, 39,
70 40, 41, 42, 43, 44, 45, 46, 47,
71 48, 49, 50, 51, 52, 53, 54, 55,
72 56, 57, 58, 59, 60, 61, 62, 63,
73 64, 97, 98, 99,100,101,102,103,
74 104,105,106,107,108,109,110,111,
75 112,113,114,115,116,117,118,119,
76 120,121,122, 91, 92, 93, 94, 95,
77 96, 97, 98, 99,100,101,102,103,
78 104,105,106,107,108,109,110,111,
79 112,113,114,115,116,117,118,119,
80 120,121,122,123,124,125,126,127,
81 128,129,130,131,132,133,134,135,
82 136,137,138,139,140,141,142,143,
83 144,145,146,147,148,149,150,151,
84 152,153,154,155,156,157,158,159,
85 160,161,162,163,164,165,166,167,
86 168,169,170,171,172,173,174,175,
87 176,177,178,179,180,181,182,183,
88 184,185,186,187,188,189,190,191,
89 192,193,194,195,196,197,198,199,
90 200,201,202,203,204,205,206,207,
91 208,209,210,211,212,213,214,215,
92 216,217,218,219,220,221,222,223,
93 224,225,226,227,228,229,230,231,
94 232,233,234,235,236,237,238,239,
95 240,241,242,243,244,245,246,247,
96 248,249,250,251,252,253,254,255 };
97
Guido van Rossum50700601997-12-08 17:15:20 +000098/* This table is a case flipping table. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +000099
Guido van Rossum50700601997-12-08 17:15:20 +0000100unsigned char pcre_fcc[] = {
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000101 0, 1, 2, 3, 4, 5, 6, 7,
102 8, 9, 10, 11, 12, 13, 14, 15,
103 16, 17, 18, 19, 20, 21, 22, 23,
104 24, 25, 26, 27, 28, 29, 30, 31,
105 32, 33, 34, 35, 36, 37, 38, 39,
106 40, 41, 42, 43, 44, 45, 46, 47,
107 48, 49, 50, 51, 52, 53, 54, 55,
108 56, 57, 58, 59, 60, 61, 62, 63,
Guido van Rossum50700601997-12-08 17:15:20 +0000109 64, 97, 98, 99,100,101,102,103,
110 104,105,106,107,108,109,110,111,
111 112,113,114,115,116,117,118,119,
112 120,121,122, 91, 92, 93, 94, 95,
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000113 96, 65, 66, 67, 68, 69, 70, 71,
114 72, 73, 74, 75, 76, 77, 78, 79,
115 80, 81, 82, 83, 84, 85, 86, 87,
116 88, 89, 90,123,124,125,126,127,
117 128,129,130,131,132,133,134,135,
118 136,137,138,139,140,141,142,143,
119 144,145,146,147,148,149,150,151,
120 152,153,154,155,156,157,158,159,
121 160,161,162,163,164,165,166,167,
122 168,169,170,171,172,173,174,175,
123 176,177,178,179,180,181,182,183,
124 184,185,186,187,188,189,190,191,
125 192,193,194,195,196,197,198,199,
126 200,201,202,203,204,205,206,207,
127 208,209,210,211,212,213,214,215,
128 216,217,218,219,220,221,222,223,
129 224,225,226,227,228,229,230,231,
130 232,233,234,235,236,237,238,239,
131 240,241,242,243,244,245,246,247,
132 248,249,250,251,252,253,254,255 };
133
Guido van Rossum50700601997-12-08 17:15:20 +0000134/* This table contains bit maps for digits, letters, 'word' chars, and
135white space. Each map is 32 bytes long and the bits run from the least
136significant end of each byte. */
137
138unsigned char pcre_cbits[] = {
139 0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
140 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
141 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
142 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
143
144 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
145 0xfe,0xff,0xff,0x07,0xfe,0xff,0xff,0x07,
146 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
147 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
148
149 0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
150 0xfe,0xff,0xff,0x87,0xfe,0xff,0xff,0x07,
151 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
152 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
153
154 0x00,0x3e,0x00,0x00,0x01,0x00,0x00,0x00,
155 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
156 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
157 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 };
158
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000159/* This table identifies various classes of character by individual bits:
Guido van Rossum50700601997-12-08 17:15:20 +0000160 0x01 white space character
161 0x02 letter
162 0x04 decimal digit
163 0x08 hexadecimal digit
164 0x10 alphanumeric or '_'
165 0x80 regular expression metacharacter or binary zero
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000166*/
167
168unsigned char pcre_ctypes[] = {
169 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
170 0x00,0x01,0x01,0x01,0x01,0x01,0x00,0x00, /* 8- 15 */
171 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
172 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
173 0x01,0x00,0x00,0x00,0x80,0x00,0x00,0x00, /* - ' */
174 0x80,0x80,0x80,0x80,0x00,0x00,0x80,0x00, /* ( - / */
Guido van Rossum50700601997-12-08 17:15:20 +0000175 0x3c,0x3c,0x3c,0x3c,0x3c,0x3c,0x3c,0x3c, /* 0 - 7 */
176 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x80, /* 8 - ? */
177 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* @ - G */
178 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* H - O */
179 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* P - W */
180 0x12,0x12,0x12,0x80,0x00,0x00,0x80,0x10, /* X - _ */
181 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* ` - g */
182 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* h - o */
183 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* p - w */
184 0x12,0x12,0x12,0x80,0x80,0x00,0x00,0x00, /* x -127 */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000185 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
186 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
187 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
188 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
189 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
190 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
191 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
192 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
193 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
194 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
195 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
196 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
197 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
198 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
199 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
200 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
201
Guido van Rossum50700601997-12-08 17:15:20 +0000202/* End of chartables.c */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000203/*************************************************
204* Perl-Compatible Regular Expressions *
205*************************************************/
206
207/*
208This is a library of functions to support regular expressions whose syntax
209and semantics are as close as possible to those of the Perl 5 language. See
210the file Tech.Notes for some information on the internals.
211
212Written by: Philip Hazel <ph10@cam.ac.uk>
213
214 Copyright (c) 1997 University of Cambridge
215
216-----------------------------------------------------------------------------
217Permission is granted to anyone to use this software for any purpose on any
218computer system, and to redistribute it freely, subject to the following
219restrictions:
220
2211. This software is distributed in the hope that it will be useful,
222 but WITHOUT ANY WARRANTY; without even the implied warranty of
223 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
224
2252. The origin of this software must not be misrepresented, either by
226 explicit claim or by omission.
227
2283. Altered versions must be plainly marked as such, and must not be
229 misrepresented as being the original software.
230-----------------------------------------------------------------------------
231*/
232
233
234/* Include the internals header, which itself includes Standard C headers plus
235the external pcre header. */
236
237
238
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000239
240/*************************************************
241* Create bitmap of starting chars *
242*************************************************/
243
244/* This function scans a compiled unanchored expression and attempts to build a
245bitmap of the set of initial characters. If it can't, it returns FALSE. As time
246goes by, we may be able to get more clever at doing this.
247
248Arguments:
249 code points to an expression
250 start_bits points to a 32-byte table, initialized to 0
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000251
252Returns: TRUE if table built, FALSE otherwise
253*/
254
255static BOOL
Guido van Rossum58132c61997-12-17 00:24:13 +0000256set_start_bits(const uschar *code, uschar *start_bits)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000257{
Guido van Rossum50700601997-12-08 17:15:20 +0000258register int c;
259
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000260do
261 {
Guido van Rossum58132c61997-12-17 00:24:13 +0000262 const uschar *tcode = code + 3;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000263 BOOL try_next = TRUE;
264
265 while (try_next)
266 {
267 try_next = FALSE;
268
269 if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
270 {
Guido van Rossum50700601997-12-08 17:15:20 +0000271 if (!set_start_bits(tcode, start_bits)) return FALSE;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000272 }
273
274 else switch(*tcode)
275 {
276 default:
277 return FALSE;
278
279 /* BRAZERO does the bracket, but carries on. */
280
281 case OP_BRAZERO:
282 case OP_BRAMINZERO:
Guido van Rossum50700601997-12-08 17:15:20 +0000283 if (!set_start_bits(++tcode, start_bits)) return FALSE;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000284 do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
285 tcode += 3;
286 try_next = TRUE;
287 break;
288
289 /* Single-char * or ? sets the bit and tries the next item */
290
291 case OP_STAR:
292 case OP_MINSTAR:
293 case OP_QUERY:
294 case OP_MINQUERY:
Guido van Rossum50700601997-12-08 17:15:20 +0000295 start_bits[tcode[1]/8] |= (1 << (tcode[1]&7));
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000296 tcode += 2;
297 try_next = TRUE;
298 break;
299
300 /* Single-char upto sets the bit and tries the next */
301
302 case OP_UPTO:
303 case OP_MINUPTO:
Guido van Rossum50700601997-12-08 17:15:20 +0000304 start_bits[tcode[3]/8] |= (1 << (tcode[3]&7));
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000305 tcode += 4;
306 try_next = TRUE;
307 break;
308
309 /* At least one single char sets the bit and stops */
310
311 case OP_EXACT: /* Fall through */
312 tcode++;
313
314 case OP_CHARS: /* Fall through */
315 tcode++;
316
317 case OP_PLUS:
318 case OP_MINPLUS:
Guido van Rossum50700601997-12-08 17:15:20 +0000319 start_bits[tcode[1]/8] |= (1 << (tcode[1]&7));
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000320 break;
321
322 /* Single character type sets the bits and stops */
323
324 case OP_NOT_DIGIT:
Guido van Rossum50700601997-12-08 17:15:20 +0000325 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_digit];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000326 break;
327
328 case OP_DIGIT:
Guido van Rossum50700601997-12-08 17:15:20 +0000329 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_digit];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000330 break;
331
332 case OP_NOT_WHITESPACE:
Guido van Rossum50700601997-12-08 17:15:20 +0000333 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_space];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000334 break;
335
336 case OP_WHITESPACE:
Guido van Rossum50700601997-12-08 17:15:20 +0000337 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_space];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000338 break;
339
340 case OP_NOT_WORDCHAR:
Guido van Rossum50700601997-12-08 17:15:20 +0000341 for (c = 0; c < 32; c++)
342 start_bits[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000343 break;
344
345 case OP_WORDCHAR:
Guido van Rossum50700601997-12-08 17:15:20 +0000346 for (c = 0; c < 32; c++)
347 start_bits[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000348 break;
349
350 /* One or more character type fudges the pointer and restarts, knowing
351 it will hit a single character type and stop there. */
352
353 case OP_TYPEPLUS:
354 case OP_TYPEMINPLUS:
355 tcode++;
356 try_next = TRUE;
357 break;
358
359 case OP_TYPEEXACT:
360 tcode += 3;
361 try_next = TRUE;
362 break;
363
364 /* Zero or more repeats of character types set the bits and then
365 try again. */
366
367 case OP_TYPEUPTO:
368 case OP_TYPEMINUPTO:
369 tcode += 2; /* Fall through */
370
371 case OP_TYPESTAR:
372 case OP_TYPEMINSTAR:
373 case OP_TYPEQUERY:
374 case OP_TYPEMINQUERY:
375 switch(tcode[1])
376 {
377 case OP_NOT_DIGIT:
Guido van Rossum50700601997-12-08 17:15:20 +0000378 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_digit];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000379 break;
380
381 case OP_DIGIT:
Guido van Rossum50700601997-12-08 17:15:20 +0000382 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_digit];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000383 break;
384
385 case OP_NOT_WHITESPACE:
Guido van Rossum50700601997-12-08 17:15:20 +0000386 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_space];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000387 break;
388
389 case OP_WHITESPACE:
Guido van Rossum50700601997-12-08 17:15:20 +0000390 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_space];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000391 break;
392
393 case OP_NOT_WORDCHAR:
Guido van Rossum50700601997-12-08 17:15:20 +0000394 for (c = 0; c < 32; c++)
395 start_bits[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000396 break;
397
398 case OP_WORDCHAR:
Guido van Rossum50700601997-12-08 17:15:20 +0000399 for (c = 0; c < 32; c++)
400 start_bits[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000401 break;
402 }
403
404 tcode += 2;
405 try_next = TRUE;
406 break;
407
408 /* Character class: set the bits and either carry on or not,
409 according to the repeat count. */
410
411 case OP_CLASS:
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000412 {
Guido van Rossum50700601997-12-08 17:15:20 +0000413 tcode++;
414 for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
415 tcode += 32;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000416 switch (*tcode)
417 {
418 case OP_CRSTAR:
419 case OP_CRMINSTAR:
420 case OP_CRQUERY:
421 case OP_CRMINQUERY:
422 tcode++;
423 try_next = TRUE;
424 break;
425
426 case OP_CRRANGE:
427 case OP_CRMINRANGE:
428 if (((tcode[1] << 8) + tcode[2]) == 0)
429 {
430 tcode += 5;
431 try_next = TRUE;
432 }
433 break;
434 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000435 }
436 break; /* End of class handling */
437
438 } /* End of switch */
439 } /* End of try_next loop */
440
441 code += (code[1] << 8) + code[2]; /* Advance to next branch */
442 }
443while (*code == OP_ALT);
444return TRUE;
445}
446
447
448
449/*************************************************
450* Study a compiled expression *
451*************************************************/
452
453/* This function is handed a compiled expression that it must study to produce
454information that will speed up the matching. It returns a pcre_extra block
455which then gets handed back to pcre_exec().
456
457Arguments:
458 re points to the compiled expression
459 options contains option bits
460 errorptr points to where to place error messages;
461 set NULL unless error
462
463Returns: pointer to a pcre_extra block,
464 NULL on error or if no optimization possible
465*/
466
467pcre_extra *
Guido van Rossum58132c61997-12-17 00:24:13 +0000468pcre_study(const pcre *external_re, int options, const char **errorptr)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000469{
470BOOL caseless;
471uschar start_bits[32];
472real_pcre_extra *extra;
Guido van Rossum58132c61997-12-17 00:24:13 +0000473const real_pcre *re = (const real_pcre *)external_re;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000474
475*errorptr = NULL;
476
477if (re == NULL || re->magic_number != MAGIC_NUMBER)
478 {
479 *errorptr = "argument is not a compiled regular expression";
480 return NULL;
481 }
482
483if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
484 {
485 *errorptr = "unknown or incorrect option bit(s) set";
486 return NULL;
487 }
488
Guido van Rossum50700601997-12-08 17:15:20 +0000489/* Caseless can either be from the compiled regex or from options. */
490
491caseless = ((re->options | options) & PCRE_CASELESS) != 0;
492
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000493/* For an anchored pattern, or an unchored pattern that has a first char, or a
494multiline pattern that matches only at "line starts", no further processing at
495present. */
496
497if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
498 return NULL;
499
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000500/* See if we can find a fixed set of initial characters for the pattern. */
501
Guido van Rossum50700601997-12-08 17:15:20 +0000502memset(start_bits, 0, 32 * sizeof(uschar));
503if (!set_start_bits(re->code, start_bits)) return NULL;
504
505/* If this studying is caseless, scan the created bit map and duplicate the
506bits for any letters. */
507
508if (caseless)
509 {
510 register int c;
511 for (c = 0; c < 256; c++)
512 {
513 if ((start_bits[c/8] & (1 << (c&7))) != 0 &&
514 (pcre_ctypes[c] & ctype_letter) != 0)
515 {
516 int d = pcre_fcc[c];
517 start_bits[d/8] |= (1 << (d&7));
518 }
519 }
520 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000521
522/* Get an "extra" block and put the information therein. */
523
524extra = (real_pcre_extra *)(pcre_malloc)(sizeof(real_pcre_extra));
525
526if (extra == NULL)
527 {
528 *errorptr = "failed to get memory";
529 return NULL;
530 }
Guido van Rossum50700601997-12-08 17:15:20 +0000531
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000532extra->options = PCRE_STUDY_MAPPED | (caseless? PCRE_STUDY_CASELESS : 0);
Guido van Rossum50700601997-12-08 17:15:20 +0000533memcpy(extra->start_bits, start_bits, sizeof(start_bits));
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000534
535return (pcre_extra *)extra;
536}
537
Guido van Rossum50700601997-12-08 17:15:20 +0000538/* End of study.c */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000539/*************************************************
540* Perl-Compatible Regular Expressions *
541*************************************************/
542
543/*
544This is a library of functions to support regular expressions whose syntax
545and semantics are as close as possible to those of the Perl 5 language. See
546the file Tech.Notes for some information on the internals.
547
548Written by: Philip Hazel <ph10@cam.ac.uk>
549
550 Copyright (c) 1997 University of Cambridge
551
552-----------------------------------------------------------------------------
553Permission is granted to anyone to use this software for any purpose on any
554computer system, and to redistribute it freely, subject to the following
555restrictions:
556
5571. This software is distributed in the hope that it will be useful,
558 but WITHOUT ANY WARRANTY; without even the implied warranty of
559 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
560
5612. The origin of this software must not be misrepresented, either by
562 explicit claim or by omission.
563
5643. Altered versions must be plainly marked as such, and must not be
565 misrepresented as being the original software.
566-----------------------------------------------------------------------------
567*/
568
569
570/* Define DEBUG to get debugging output on stdout. */
571
572/* #define DEBUG */
573
Guido van Rossum557dea11997-12-22 22:46:52 +0000574/* Use a macro for debugging printing, 'cause that eliminates the the use
575of #ifdef inline, and there are *still* stupid compilers about that don't like
576indented pre-processor statements. I suppose it's only been 10 years... */
577
578#ifdef DEBUG
579#define DPRINTF(p) printf p
580#else
581#define DPRINTF(p) /*nothing*/
582#endif
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000583
584/* Include the internals header, which itself includes Standard C headers plus
585the external pcre header. */
586
587
Guido van Rossum50700601997-12-08 17:15:20 +0000588
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000589#ifndef Py_eval_input
590/* For Python 1.4, graminit.h has to be explicitly included */
591#define Py_eval_input eval_input
Guido van Rossum50700601997-12-08 17:15:20 +0000592
593#endif /* FOR_PYTHON */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000594
595/* Min and max values for the common repeats; for the maxima, 0 => infinity */
596
597static char rep_min[] = { 0, 0, 1, 1, 0, 0 };
598static char rep_max[] = { 0, 0, 0, 0, 1, 1 };
599
600/* Text forms of OP_ values and things, for debugging */
601
602#ifdef DEBUG
Guido van Rossum58132c61997-12-17 00:24:13 +0000603static const char *OP_names[] = {
604 "End", "\\A", "\\B", "\\b", "\\D", "\\d",
Guido van Rossum50700601997-12-08 17:15:20 +0000605 "\\S", "\\s", "\\W", "\\w", "Cut", "\\Z",
606 "localized \\B", "localized \\b", "localized \\W", "localized \\w",
607 "^", "$", "Any", "chars",
608 "not",
609 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000610 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
611 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
612 "*", "*?", "+", "+?", "?", "??", "{", "{",
Guido van Rossum50700601997-12-08 17:15:20 +0000613 "class", "classL", "Ref",
614 "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", "Once",
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000615 "Brazero", "Braminzero", "Bra"
616};
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000617#endif
618
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000619/* Table for handling escaped characters in the range '0'-'z'. Positive returns
620are simple data values; negative values are for special things like \d and so
621on. Zero means further processing is needed (for things like \x), or the escape
622is invalid. */
623
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000624static short int escapes[] = {
625 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
626 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
627 '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
628 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
629 0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
630 0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
631 '`', 7, -ESC_b, 0, -ESC_d, 0, '\f', 0, /* ` - g */
632 0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
633 0, 0, '\r', -ESC_s, '\t', 0, '\v', -ESC_w, /* p - w */
634 0, 0, 0 /* x - z */
635};
636
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000637/* Definition to allow mutual recursion */
638
Guido van Rossum58132c61997-12-17 00:24:13 +0000639static BOOL compile_regex(int, int *, uschar **, const uschar **,
640 const char **, PyObject *);
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000641
642/* Structure for passing "static" information around between the functions
643doing the matching, so that they are thread-safe. */
644
645typedef struct match_data {
646 int errorcode; /* As it says */
647 int *offset_vector; /* Offset vector */
648 int offset_end; /* One past the end */
649 BOOL offset_overflow; /* Set if too many extractions */
650 BOOL caseless; /* Case-independent flag */
Guido van Rossum50700601997-12-08 17:15:20 +0000651 BOOL runtime_caseless; /* Caseless forced at run time */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000652 BOOL multiline; /* Multiline flag */
Guido van Rossum50700601997-12-08 17:15:20 +0000653 BOOL notbol; /* NOTBOL flag */
654 BOOL noteol; /* NOTEOL flag */
655 BOOL dotall; /* Dot matches any char */
656 BOOL endonly; /* Dollar not before final \n */
Guido van Rossum58132c61997-12-17 00:24:13 +0000657 const uschar *start_subject; /* Start of the subject string */
658 const uschar *end_subject; /* End of the subject string */
Guido van Rossum50700601997-12-08 17:15:20 +0000659 jmp_buf fail_env; /* Environment for longjump() break out */
Guido van Rossum58132c61997-12-17 00:24:13 +0000660 const uschar *end_match_ptr; /* Subject position at end match */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000661 int end_offset_top; /* Highwater mark at end of match */
Guido van Rossum50700601997-12-08 17:15:20 +0000662 jmp_buf error_env; /* For longjmp() if an error occurs deep inside a
663 matching operation */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000664 int length; /* Length of the allocated stacks */
665 int point; /* Point to add next item pushed onto stacks */
666 /* Pointers to the 6 stacks */
667 int *off_num, *offset_top, *r1, *r2;
Guido van Rossum58132c61997-12-17 00:24:13 +0000668 const uschar **eptr, **ecode;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000669} match_data;
670
671
672
673/*************************************************
Guido van Rossum50700601997-12-08 17:15:20 +0000674* Global variables *
675*************************************************/
676
677/* PCRE is thread-clean and doesn't use any global variables in the normal
678sense. However, it calls memory allocation and free functions via the two
679indirections below, which are can be changed by the caller, but are shared
680between all threads. */
681
682void *(*pcre_malloc)(size_t) = malloc;
683void (*pcre_free)(void *) = free;
684
685
686
687
688/*************************************************
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000689* Return version string *
690*************************************************/
691
Guido van Rossum58132c61997-12-17 00:24:13 +0000692const char *
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000693pcre_version(void)
694{
695return PCRE_VERSION;
696}
697
698
699
700
701/*************************************************
702* Return info about a compiled pattern *
703*************************************************/
704
705/* This function picks potentially useful data out of the private
706structure.
707
708Arguments:
709 external_re points to compiled code
710 optptr where to pass back the options
711 first_char where to pass back the first character,
712 or -1 if multiline and all branches start ^,
713 or -2 otherwise
714
715Returns: number of identifying extraction brackets
716 or negative values on error
717*/
718
719int
Guido van Rossum50700601997-12-08 17:15:20 +0000720pcre_info(const pcre *external_re, int *optptr, int *first_char)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000721{
Guido van Rossum58132c61997-12-17 00:24:13 +0000722const real_pcre *re = (real_pcre *)external_re;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000723if (re == NULL) return PCRE_ERROR_NULL;
724if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
725if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);
726if (first_char != NULL)
727 *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
728 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
729return re->top_bracket;
730}
731
732
733
734
735#ifdef DEBUG
736/*************************************************
737* Debugging function to print chars *
738*************************************************/
739
740/* Print a sequence of chars in printable format, stopping at the end of the
741subject if the requested.
742
743Arguments:
744 p points to characters
745 length number to print
746 is_subject TRUE if printing from within md->start_subject
747 md pointer to matching data block, if is_subject is TRUE
748
749Returns: nothing
750*/
751
Guido van Rossum557dea11997-12-22 22:46:52 +0000752static void
753pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000754{
755int c;
756if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
757while (length-- > 0)
758 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
759}
760#endif
761
762
763
764
765/*************************************************
766* Check subpattern for empty operand *
767*************************************************/
768
769/* This function checks a bracketed subpattern to see if any of the paths
770through it could match an empty string. This is used to diagnose an error if
771such a subpattern is followed by a quantifier with an unlimited upper bound.
772
773Argument:
774 code points to the opening bracket
775
776Returns: TRUE or FALSE
777*/
778
779static BOOL
780could_be_empty(uschar *code)
781{
782do {
783 uschar *cc = code + 3;
784
785 /* Scan along the opcodes for this branch; as soon as we find something
786 that matches a non-empty string, break out and advance to test the next
787 branch. If we get to the end of the branch, return TRUE for the whole
788 sub-expression. */
789
790 for (;;)
791 {
792 /* Test an embedded subpattern; if it could not be empty, break the
793 loop. Otherwise carry on in the branch. */
794
Guido van Rossum50700601997-12-08 17:15:20 +0000795 if ((int)(*cc) >= OP_BRA || (int)(*cc) == OP_ONCE)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000796 {
797 if (!could_be_empty(cc)) break;
798 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
799 cc += 3;
800 }
801
802 else switch (*cc)
803 {
804 /* Reached end of a branch: the subpattern may match the empty string */
805
806 case OP_ALT:
807 case OP_KET:
808 case OP_KETRMAX:
809 case OP_KETRMIN:
810 return TRUE;
811
Guido van Rossumd0f432b1998-02-20 21:45:14 +0000812 /* Skip over entire bracket groups with zero lower bound */
813
814 case OP_BRAZERO:
815 case OP_BRAMINZERO:
816 cc++;
817 /* Fall through */
818
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000819 /* Skip over assertive subpatterns */
820
821 case OP_ASSERT:
822 case OP_ASSERT_NOT:
823 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
824 cc += 3;
825 break;
826
827 /* Skip over things that don't match chars */
828
829 case OP_SOD:
830 case OP_EOD:
831 case OP_CIRC:
832 case OP_DOLL:
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000833 case OP_NOT_WORD_BOUNDARY:
834 case OP_WORD_BOUNDARY:
Guido van Rossum50700601997-12-08 17:15:20 +0000835 case OP_NOT_WORD_BOUNDARY_L:
836 case OP_WORD_BOUNDARY_L:
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000837 cc++;
838 break;
839
840 /* Skip over simple repeats with zero lower bound */
841
842 case OP_STAR:
843 case OP_MINSTAR:
844 case OP_QUERY:
845 case OP_MINQUERY:
Guido van Rossum50700601997-12-08 17:15:20 +0000846 case OP_NOTSTAR:
847 case OP_NOTMINSTAR:
848 case OP_NOTQUERY:
849 case OP_NOTMINQUERY:
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000850 case OP_TYPESTAR:
851 case OP_TYPEMINSTAR:
852 case OP_TYPEQUERY:
853 case OP_TYPEMINQUERY:
854 cc += 2;
855 break;
856
857 /* Skip over UPTOs (lower bound is zero) */
858
859 case OP_UPTO:
860 case OP_MINUPTO:
861 case OP_TYPEUPTO:
862 case OP_TYPEMINUPTO:
863 cc += 4;
864 break;
865
866 /* Check a class or a back reference for a zero minimum */
867
868 case OP_CLASS:
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000869 case OP_REF:
Guido van Rossum50700601997-12-08 17:15:20 +0000870 case OP_CLASS_L:
871 switch(*cc)
872 {
873 case (OP_REF): cc += 2; break;
874 case (OP_CLASS): cc += 1+32; break;
875 case (OP_CLASS_L): cc += 1+1+32; break;
876 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000877
878 switch (*cc)
879 {
880 case OP_CRSTAR:
881 case OP_CRMINSTAR:
882 case OP_CRQUERY:
883 case OP_CRMINQUERY:
884 cc++;
885 break;
886
887 case OP_CRRANGE:
888 case OP_CRMINRANGE:
889 if ((cc[1] << 8) + cc[2] != 0) goto NEXT_BRANCH;
890 cc += 3;
891 break;
892
893 default:
894 goto NEXT_BRANCH;
895 }
896 break;
897
898 /* Anything else matches at least one character */
899
900 default:
901 goto NEXT_BRANCH;
902 }
903 }
904
905 NEXT_BRANCH:
906 code += (code[1] << 8) + code[2];
907 }
908while (*code == OP_ALT);
909
910/* No branches match the empty string */
911
912return FALSE;
913}
914
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000915/* Determine the length of a group ID in an expression like
916 (?P<foo_123>...)
917Arguments:
918 ptr pattern position pointer (say that 3 times fast)
919 finalchar the character that will mark the end of the ID
920 errorptr points to the pointer to the error message
921*/
922
923static int
Guido van Rossum58132c61997-12-17 00:24:13 +0000924get_group_id(const uschar *ptr, char finalchar, const char **errorptr)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000925{
Guido van Rossum58132c61997-12-17 00:24:13 +0000926 const uschar *start = ptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000927
928 /* If the first character is not in \w, or is in \w but is a digit,
929 report an error */
930 if (!(pcre_ctypes[*ptr] & ctype_word) ||
931 (pcre_ctypes[*ptr++] & ctype_digit))
932 {
933 *errorptr = "(?P identifier must start with a letter or underscore";
934 return 0;
935 }
936
937 /* Increment ptr until we either hit a null byte, the desired
938 final character, or a non-word character */
939 for(; (*ptr != 0) && (*ptr != finalchar) &&
940 (pcre_ctypes[*ptr] & ctype_word); ptr++)
941 {
Guido van Rossumc3861071997-10-08 02:07:40 +0000942 /* Empty loop body */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000943 }
944 if (*ptr==finalchar)
945 return ptr-start;
946 if (*ptr==0)
947 {
948 *errorptr = "unterminated (?P identifier";
949 return 0;
950 }
951 *errorptr = "illegal character in (?P identifier";
952 return 0;
953}
954
955/*************************************************
956* Handle escapes *
957*************************************************/
958
959/* This function is called when a \ has been encountered. It either returns a
960positive value for a simple escape such as \n, or a negative value which
961encodes one of the more complicated things such as \d. On entry, ptr is
962pointing at the \. On exit, it is on the final character of the escape
963sequence.
964
965Arguments:
966 ptrptr points to the pattern position pointer
967 errorptr points to the pointer to the error message
Guido van Rossum50700601997-12-08 17:15:20 +0000968 bracount number of previous extracting brackets
969 options the options bits
970 isclass TRUE if inside a character class
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000971
972Returns: zero or positive => a data character
973 negative => a special escape sequence
974 on error, errorptr is set
975*/
976
977static int
Guido van Rossum58132c61997-12-17 00:24:13 +0000978check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
979 int options, BOOL isclass)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000980{
Guido van Rossum58132c61997-12-17 00:24:13 +0000981const uschar *ptr = *ptrptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000982int c = *(++ptr) & 255; /* Ensure > 0 on signed-char systems */
983int i;
984
Guido van Rossum50700601997-12-08 17:15:20 +0000985if (c == 0) *errorptr = ERR1;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000986
987/* Digits or letters may have special meaning; all others are literals. */
988
989else if (c < '0' || c > 'z') {}
990
991/* Do an initial lookup in a table. A non-zero result is something that can be
992returned immediately. Otherwise further processing may be required. */
993
994else if ((i = escapes[c - '0']) != 0) c = i;
995
996/* Escapes that need further processing, or are illegal. */
997
Guido van Rossum50700601997-12-08 17:15:20 +0000998else
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000999 {
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001000
Guido van Rossum50700601997-12-08 17:15:20 +00001001 switch (c)
1002 {
1003 /* The handling of escape sequences consisting of a string of digits
1004 starting with one that is not zero is not straightforward. By experiment,
1005 the way Perl works seems to be as follows:
1006
1007 Outside a character class, the digits are read as a decimal number. If the
1008 number is less than 10, or if there are that many previous extracting
1009 left brackets, then it is a back reference. Otherwise, up to three octal
1010 digits are read to form an escaped byte. Thus \123 is likely to be octal
1011 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
1012 value is greater than 377, the least significant 8 bits are taken. Inside a
1013 character class, \ followed by a digit is always an octal number. */
1014
1015 case '1': case '2': case '3': case '4': case '5':
1016 case '6': case '7': case '8': case '9':
1017
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001018 {
1019 /* PYTHON: Try to compute an octal value for a character */
1020 for(c=0, i=0; c!=-1 && ptr[i]!=0 && i<3; i++)
1021 {
1022 if (( pcre_ctypes[ ptr[i] ] & ctype_odigit) != 0)
1023 c = c * 8 + ptr[i]-'0';
1024 else
1025 c = -1; /* Non-octal character */
1026 }
1027 /* Aha! There were 3 octal digits, so it must be a character */
1028 if (c != -1 && i == 3)
1029 {
1030 ptr += i-1;
1031 break;
1032 }
1033 c = ptr[0]; /* Restore the first character after the \ */
1034 c -= '0'; i = 1;
1035 while (i<2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0)
1036 {
1037 c = c * 10 + ptr[1] - '0';
1038 ptr++; i++;
1039 }
1040 if (c > 255 - ESC_REF) *errorptr = "back reference too big";
1041 c = -(ESC_REF + c);
1042 }
1043 break;
1044
Guido van Rossum50700601997-12-08 17:15:20 +00001045 /* \0 always starts an octal number, but we may drop through to here with a
1046 larger first octal digit */
1047
1048 case '0':
1049 c -= '0';
1050 while(i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0 &&
1051 ptr[1] != '8' && ptr[1] != '9')
1052 c = c * 8 + *(++ptr) - '0';
1053 break;
1054
1055 /* Special escapes not starting with a digit are straightforward */
1056
1057 case 'x':
1058 c = 0;
1059 while ( (pcre_ctypes[ptr[1]] & ctype_xdigit) != 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001060 {
Guido van Rossum50700601997-12-08 17:15:20 +00001061 ptr++;
1062 c = c * 16 + pcre_lcc[*ptr] -
1063 (((pcre_ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
1064 c &= 255;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001065 }
1066 break;
1067
1068
Guido van Rossum50700601997-12-08 17:15:20 +00001069 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1070 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
1071 for Perl compatibility, it is a literal. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001072
Guido van Rossum50700601997-12-08 17:15:20 +00001073 default:
1074 if ((options & PCRE_EXTRA) != 0) switch(c)
1075 {
1076 case 'X':
1077 c = -ESC_X; /* This could be a lookup if it ever got into Perl */
1078 break;
1079
1080 default:
1081 *errorptr = ERR3;
1082 break;
1083 }
1084 break;
1085 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001086 }
1087
1088*ptrptr = ptr;
1089return c;
1090}
1091
1092
1093
1094/*************************************************
Guido van Rossum50700601997-12-08 17:15:20 +00001095* Check for counted repeat *
1096*************************************************/
1097
1098/* This function is called when a '{' is encountered in a place where it might
1099start a quantifier. It looks ahead to see if it really is a quantifier or not.
1100It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
1101where the ddds are digits.
1102
1103Arguments:
1104 p pointer to the first char after '{'
1105
1106Returns: TRUE or FALSE
1107*/
1108
1109static BOOL
Guido van Rossum58132c61997-12-17 00:24:13 +00001110is_counted_repeat(const uschar *p)
Guido van Rossum50700601997-12-08 17:15:20 +00001111{
1112if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
1113while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
1114if (*p == '}') return TRUE;
1115
1116if (*p++ != ',') return FALSE;
1117if (*p == '}') return TRUE;
1118
1119if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
1120while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
1121return (*p == '}');
1122}
1123
1124
1125
1126/*************************************************
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001127* Read repeat counts *
1128*************************************************/
1129
Guido van Rossum50700601997-12-08 17:15:20 +00001130/* Read an item of the form {n,m} and return the values. This is called only
1131after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1132so the syntax is guaranteed to be correct, but we need to check the values.
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001133
1134Arguments:
1135 p pointer to first char after '{'
1136 minp pointer to int for min
1137 maxp pointer to int for max
1138 returned as -1 if no max
1139 errorptr points to pointer to error message
1140
1141Returns: pointer to '}' on success;
1142 current ptr on error, with errorptr set
1143*/
1144
Guido van Rossum58132c61997-12-17 00:24:13 +00001145static const uschar *
1146read_repeat_counts(const uschar *p, int *minp, int *maxp, const char **errorptr)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001147{
1148int min = 0;
1149int max = -1;
1150
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001151while ((pcre_ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
1152
1153if (*p == '}') max = min; else
1154 {
Guido van Rossum50700601997-12-08 17:15:20 +00001155 if (*(++p) != '}')
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001156 {
1157 max = 0;
1158 while((pcre_ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001159 if (max < min)
1160 {
Guido van Rossum50700601997-12-08 17:15:20 +00001161 *errorptr = ERR4;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001162 return p;
1163 }
1164 }
1165 }
1166
1167/* Do paranoid checks, then fill in the required variables, and pass back the
1168pointer to the terminating '}'. */
1169
Guido van Rossum50700601997-12-08 17:15:20 +00001170if (min > 65535 || max > 65535)
1171 *errorptr = ERR5;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001172else
1173 {
1174 *minp = min;
1175 *maxp = max;
1176 }
1177return p;
1178}
1179
1180
1181
1182/*************************************************
1183* Compile one branch *
1184*************************************************/
1185
1186/* Scan the pattern, compiling it into the code vector.
1187
1188Arguments:
Guido van Rossum50700601997-12-08 17:15:20 +00001189 options the option bits
1190 bracket points to number of brackets used
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001191 code points to the pointer to the current code point
1192 ptrptr points to the current pattern pointer
1193 errorptr points to pointer to error message
1194
1195Returns: TRUE on success
1196 FALSE, with *errorptr set on error
1197*/
1198
1199static BOOL
Guido van Rossum50700601997-12-08 17:15:20 +00001200compile_branch(int options, int *brackets, uschar **codeptr,
Guido van Rossum58132c61997-12-17 00:24:13 +00001201 const uschar **ptrptr, const char **errorptr, PyObject *dictionary)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001202{
1203int repeat_type, op_type;
1204int repeat_min, repeat_max;
1205int bravalue, length;
1206register int c;
1207register uschar *code = *codeptr;
Guido van Rossum58132c61997-12-17 00:24:13 +00001208const uschar *ptr = *ptrptr;
1209const uschar *oldptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001210uschar *previous = NULL;
Guido van Rossum50700601997-12-08 17:15:20 +00001211uschar class[32];
1212uschar *class_flag; /* Pointer to the single-byte flag for OP_CLASS_L */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001213
1214/* Switch on next character until the end of the branch */
1215
1216for (;; ptr++)
1217 {
Guido van Rossum50700601997-12-08 17:15:20 +00001218 BOOL negate_class;
1219 int class_charcount;
1220 int class_lastchar;
1221
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001222 c = *ptr;
Guido van Rossum50700601997-12-08 17:15:20 +00001223 if ((options & PCRE_EXTENDED) != 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001224 {
1225 if ((pcre_ctypes[c] & ctype_space) != 0) continue;
1226 if (c == '#')
1227 {
1228 while ((c = *(++ptr)) != 0 && c != '\n');
1229 continue;
1230 }
1231 }
1232
1233 switch(c)
1234 {
1235 /* The branch terminates at end of string, |, or ). */
1236
1237 case 0:
1238 case '|':
1239 case ')':
1240 *codeptr = code;
1241 *ptrptr = ptr;
1242 return TRUE;
1243
1244 /* Handle single-character metacharacters */
1245
1246 case '^':
1247 previous = NULL;
1248 *code++ = OP_CIRC;
1249 break;
1250
1251 case '$':
1252 previous = NULL;
1253 *code++ = OP_DOLL;
1254 break;
1255
1256 case '.':
1257 previous = code;
1258 *code++ = OP_ANY;
1259 break;
1260
Guido van Rossum50700601997-12-08 17:15:20 +00001261 /* Character classes. These always build a 32-byte bitmap of the permitted
1262 characters, except in the special case where there is only one character.
1263 For negated classes, we build the map as usual, then invert it at the end.
1264 */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001265
1266 case '[':
Guido van Rossum50700601997-12-08 17:15:20 +00001267 previous = code;
1268 if (options & PCRE_LOCALE)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001269 {
Guido van Rossum50700601997-12-08 17:15:20 +00001270 *code++ = OP_CLASS_L;
1271 /* Set the flag for localized classes (like \w) to 0 */
1272 class_flag = code;
1273 *class_flag = 0;
1274 }
1275 else
1276 {
1277 *code++ = OP_CLASS;
1278 class_flag = NULL;
1279 }
1280
1281 /* If the first character is '^', set the negation flag */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001282
Guido van Rossum50700601997-12-08 17:15:20 +00001283 if ((c = *(++ptr)) == '^')
1284 {
1285 negate_class = TRUE;
1286 c = *(++ptr);
1287 }
1288 else negate_class = FALSE;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001289
Guido van Rossum50700601997-12-08 17:15:20 +00001290 /* Keep a count of chars so that we can optimize the case of just a single
1291 character. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001292
Guido van Rossum50700601997-12-08 17:15:20 +00001293 class_charcount = 0;
1294 class_lastchar = -1;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001295
Guido van Rossum50700601997-12-08 17:15:20 +00001296 /* Initialize the 32-char bit map to all zeros. We have to build the
1297 map in a temporary bit of store, in case the class contains only 1
1298 character, because in that case the compiled code doesn't use the
1299 bit map. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001300
Guido van Rossum50700601997-12-08 17:15:20 +00001301 memset(class, 0, 32 * sizeof(uschar));
1302
1303 /* Process characters until ] is reached. By writing this as a "do" it
1304 means that an initial ] is taken as a data character. */
1305
1306 do
1307 {
1308 if (c == 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001309 {
Guido van Rossum50700601997-12-08 17:15:20 +00001310 *errorptr = ERR6;
1311 goto FAILED;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001312 }
1313
Guido van Rossum50700601997-12-08 17:15:20 +00001314 /* Backslash may introduce a single character, or it may introduce one
1315 of the specials, which just set a flag. Escaped items are checked for
1316 validity in the pre-compiling pass. The sequence \b is a special case.
Guido van Rossum58132c61997-12-17 00:24:13 +00001317 Inside a class (and only there) it is treated as backspace. Elsewhere
Guido van Rossum50700601997-12-08 17:15:20 +00001318 it marks a word boundary. Other escapes have preset maps ready to
1319 or into the one we are building. We assume they have more than one
1320 character in them, so set class_count bigger than one. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001321
Guido van Rossum50700601997-12-08 17:15:20 +00001322 if (c == '\\')
1323 {
1324 c = check_escape(&ptr, errorptr, *brackets, options, TRUE);
1325 if (-c == ESC_b) c = '\b';
1326 else if (c < 0)
1327 {
1328 class_charcount = 10;
1329 switch (-c)
1330 {
1331 case ESC_d:
Guido van Rossum50700601997-12-08 17:15:20 +00001332 {
1333 for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_digit];
1334 }
1335 continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001336
Guido van Rossum50700601997-12-08 17:15:20 +00001337 case ESC_D:
Guido van Rossum50700601997-12-08 17:15:20 +00001338 {
1339 for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_digit];
1340 }
1341 continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001342
Guido van Rossum50700601997-12-08 17:15:20 +00001343 case ESC_w:
1344 if (options & PCRE_LOCALE)
1345 {
1346 *class_flag |= 1;
1347 }
1348 else
1349 {
1350 for (c = 0; c < 32; c++)
1351 class[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
1352 }
1353 continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001354
Guido van Rossum50700601997-12-08 17:15:20 +00001355 case ESC_W:
1356 if (options & PCRE_LOCALE)
1357 {
1358 *class_flag |= 2;
1359 }
1360 else
1361 {
1362 for (c = 0; c < 32; c++)
1363 class[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
1364 }
1365 continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001366
Guido van Rossum50700601997-12-08 17:15:20 +00001367 case ESC_s:
Guido van Rossum50700601997-12-08 17:15:20 +00001368 {
1369 for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_space];
1370 }
1371 continue;
1372
1373 case ESC_S:
Guido van Rossum50700601997-12-08 17:15:20 +00001374 {
1375 for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_space];
1376 }
1377 continue;
1378
1379 default:
1380 *errorptr = ERR7;
1381 goto FAILED;
1382 }
1383 }
1384 /* Fall through if single character */
1385 }
1386
1387 /* A single character may be followed by '-' to form a range. However,
1388 Perl does not permit ']' to be the end of the range. A '-' character
1389 here is treated as a literal. */
1390
1391 if (ptr[1] == '-' && ptr[2] != ']')
1392 {
1393 int d;
1394 ptr += 2;
1395 d = *ptr;
1396
1397 if (d == 0)
1398 {
1399 *errorptr = ERR6;
1400 goto FAILED;
1401 }
1402
1403 /* The second part of a range can be a single-character escape, but
1404 not any of the other escapes. */
1405
1406 if (d == '\\')
1407 {
1408 d = check_escape(&ptr, errorptr, *brackets, options, TRUE);
1409 if (d < 0)
1410 {
1411 if (d == -ESC_b) d = '\b'; else
1412 {
1413 *errorptr = ERR7;
1414 goto FAILED;
1415 }
1416 }
1417 }
1418
1419 if (d < c)
1420 {
1421 *errorptr = ERR8;
1422 goto FAILED;
1423 }
1424
1425 for (; c <= d; c++)
1426 {
1427 class[c/8] |= (1 << (c&7));
1428 if ((options & PCRE_CASELESS) != 0)
1429 {
1430 int uc = pcre_fcc[c]; /* flip case */
1431 class[uc/8] |= (1 << (uc&7));
1432 }
1433 class_charcount++; /* in case a one-char range */
1434 class_lastchar = c;
1435 }
1436 continue; /* Go get the next char in the class */
1437 }
1438
1439 /* Handle a lone single character - we can get here for a normal
1440 non-escape char, or after \ that introduces a single character. */
1441
1442 class [c/8] |= (1 << (c&7));
1443 if ((options & PCRE_CASELESS) != 0)
1444 {
1445 c = pcre_fcc[c]; /* flip case */
1446 class[c/8] |= (1 << (c&7));
1447 }
1448 class_charcount++;
1449 class_lastchar = c;
1450 }
1451
1452 /* Loop until ']' reached; the check for end of string happens inside the
1453 loop. This "while" is the end of the "do" above. */
1454
1455 while ((c = *(++ptr)) != ']');
1456
1457 /* If class_charcount is 1 and class_lastchar is not negative, we saw
1458 precisely one character. This doesn't need the whole 32-byte bit map.
1459 We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
1460 it's negative. */
1461
1462 if (class_charcount == 1 && class_lastchar >= 0)
1463 {
1464 if (negate_class)
1465 {
1466 code[-1] = OP_NOT;
1467 }
1468 else
1469 {
1470 code[-1] = OP_CHARS;
1471 *code++ = 1;
1472 }
1473 *code++ = class_lastchar;
1474 }
1475
1476 /* Otherwise, negate the 32-byte map if necessary, and copy it into
1477 the code vector. */
1478
1479 else
1480 {
1481 /* If this is a localized opcode, bump the code pointer up */
1482 if (class_flag) code++;
1483 if (negate_class)
1484 {
1485 if (class_flag) *class_flag = (*class_flag) ^ 63;
1486 for (c = 0; c < 32; c++) code[c] = ~class[c];
1487 }
1488 else
1489 memcpy(code, class, 32);
1490 code += 32;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001491 }
1492 break;
1493
1494 /* Various kinds of repeat */
1495
1496 case '{':
Guido van Rossum50700601997-12-08 17:15:20 +00001497 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001498 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
1499 if (*errorptr != NULL) goto FAILED;
1500 goto REPEAT;
1501
1502 case '*':
1503 repeat_min = 0;
1504 repeat_max = -1;
1505 goto REPEAT;
1506
1507 case '+':
1508 repeat_min = 1;
1509 repeat_max = -1;
1510 goto REPEAT;
1511
1512 case '?':
1513 repeat_min = 0;
1514 repeat_max = 1;
1515
1516 REPEAT:
1517 if (previous == NULL)
1518 {
Guido van Rossum50700601997-12-08 17:15:20 +00001519 *errorptr = ERR9;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001520 goto FAILED;
1521 }
1522
1523 /* If the next character is '?' this is a minimizing repeat. Advance to the
1524 next character. */
1525
1526 if (ptr[1] == '?') { repeat_type = 1; ptr++; } else repeat_type = 0;
1527
Guido van Rossum50700601997-12-08 17:15:20 +00001528 /* If the maximum is zero then the minimum must also be zero; Perl allows
1529 this case, so we do too - by simply omitting the item altogether. */
1530
1531 if (repeat_max == 0) code = previous;
1532
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001533 /* If previous was a string of characters, chop off the last one and use it
1534 as the subject of the repeat. If there was only one character, we can
1535 abolish the previous item altogether. */
1536
Guido van Rossum50700601997-12-08 17:15:20 +00001537 else if (*previous == OP_CHARS)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001538 {
1539 int len = previous[1];
1540 if (len == 1)
1541 {
1542 c = previous[2];
1543 code = previous;
1544 }
1545 else
1546 {
1547 c = previous[len+1];
1548 previous[1]--;
1549 code--;
1550 }
1551 op_type = 0; /* Use single-char op codes */
1552 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
1553 }
1554
Guido van Rossum50700601997-12-08 17:15:20 +00001555 /* If previous was a single negated character ([^a] or similar), we use
1556 one of the special opcodes, replacing it. The code is shared with single-
1557 character repeats by adding a suitable offset into repeat_type. */
1558
1559 else if ((int)*previous == OP_NOT)
1560 {
1561 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
1562 c = previous[1];
1563 code = previous;
1564 goto OUTPUT_SINGLE_REPEAT;
1565 }
1566
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001567 /* If previous was a character type match (\d or similar), abolish it and
1568 create a suitable repeat item. The code is shared with single-character
1569 repeats by adding a suitable offset into repeat_type. */
1570
Guido van Rossum50700601997-12-08 17:15:20 +00001571 else if ((int)*previous < OP_CIRC || *previous == OP_ANY)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001572 {
1573 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
1574 c = *previous;
1575 code = previous;
1576
1577 OUTPUT_SINGLE_REPEAT:
1578 repeat_type += op_type; /* Combine both values for many cases */
1579
1580 /* A minimum of zero is handled either as the special case * or ?, or as
1581 an UPTO, with the maximum given. */
1582
1583 if (repeat_min == 0)
1584 {
1585 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
1586 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
1587 else
1588 {
1589 *code++ = OP_UPTO + repeat_type;
1590 *code++ = repeat_max >> 8;
1591 *code++ = (repeat_max & 255);
1592 }
1593 }
1594
1595 /* The case {1,} is handled as the special case + */
1596
1597 else if (repeat_min == 1 && repeat_max == -1)
1598 *code++ = OP_PLUS + repeat_type;
1599
1600 /* The case {n,n} is just an EXACT, while the general case {n,m} is
1601 handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
1602
1603 else
1604 {
1605 if (repeat_min != 1)
1606 {
1607 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
1608 *code++ = repeat_min >> 8;
1609 *code++ = (repeat_min & 255);
1610 }
1611
1612 /* If the mininum is 1 and the previous item was a character string,
1613 we either have to put back the item that got cancelled if the string
1614 length was 1, or add the character back onto the end of a longer
1615 string. For a character type nothing need be done; it will just get put
1616 back naturally. */
1617
1618 else if (*previous == OP_CHARS)
1619 {
1620 if (code == previous) code += 2; else previous[1]++;
1621 }
1622
Guido van Rossum557dea11997-12-22 22:46:52 +00001623 /* If the maximum is unlimited, insert an OP_STAR. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001624
Guido van Rossum557dea11997-12-22 22:46:52 +00001625 if (repeat_max < 0)
1626 {
1627 *code++ = c;
1628 *code++ = OP_STAR + repeat_type;
1629 }
1630
1631 /* Else insert an UPTO if the max is greater than the min. */
1632
1633 else if (repeat_max != repeat_min)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001634 {
1635 *code++ = c;
1636 repeat_max -= repeat_min;
1637 *code++ = OP_UPTO + repeat_type;
1638 *code++ = repeat_max >> 8;
1639 *code++ = (repeat_max & 255);
1640 }
1641 }
1642
1643 /* The character or character type itself comes last in all cases. */
1644
1645 *code++ = c;
1646 }
1647
1648 /* If previous was a character class or a back reference, we put the repeat
1649 stuff after it. */
1650
Guido van Rossum50700601997-12-08 17:15:20 +00001651 else if (*previous == OP_CLASS || *previous==OP_CLASS_L || *previous == OP_REF)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001652 {
1653 if (repeat_min == 0 && repeat_max == -1)
1654 *code++ = OP_CRSTAR + repeat_type;
1655 else if (repeat_min == 1 && repeat_max == -1)
1656 *code++ = OP_CRPLUS + repeat_type;
1657 else if (repeat_min == 0 && repeat_max == 1)
1658 *code++ = OP_CRQUERY + repeat_type;
1659 else
1660 {
1661 *code++ = OP_CRRANGE + repeat_type;
1662 *code++ = repeat_min >> 8;
1663 *code++ = repeat_min & 255;
1664 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
1665 *code++ = repeat_max >> 8;
1666 *code++ = repeat_max & 255;
1667 }
1668 }
1669
1670 /* If previous was a bracket group, we may have to replicate it in certain
1671 cases. If the maximum repeat count is unlimited, check that the bracket
1672 group cannot match the empty string, and diagnose an error if it can. */
1673
1674 else if ((int)*previous >= OP_BRA)
1675 {
1676 int i;
Guido van Rossum557dea11997-12-22 22:46:52 +00001677 int len = code - previous;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001678
1679 if (repeat_max == -1 && could_be_empty(previous))
1680 {
Guido van Rossum50700601997-12-08 17:15:20 +00001681 *errorptr = ERR10;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001682 goto FAILED;
1683 }
1684
1685 /* If the minimum is greater than zero, and the maximum is unlimited or
1686 equal to the minimum, the first copy remains where it is, and is
1687 replicated up to the minimum number of times. This case includes the +
1688 repeat, but of course no replication is needed in that case. */
1689
1690 if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))
1691 {
1692 for (i = 1; i < repeat_min; i++)
1693 {
Guido van Rossum557dea11997-12-22 22:46:52 +00001694 memcpy(code, previous, len);
1695 code += len;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001696 }
1697 }
1698
1699 /* If the minimum is zero, stick BRAZERO in front of the first copy.
1700 Then, if there is a fixed upper limit, replicated up to that many times,
1701 sticking BRAZERO in front of all the optional ones. */
1702
1703 else
1704 {
1705 if (repeat_min == 0)
1706 {
Guido van Rossum557dea11997-12-22 22:46:52 +00001707 memmove(previous+1, previous, len);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001708 code++;
1709 *previous++ = OP_BRAZERO + repeat_type;
1710 }
1711
1712 for (i = 1; i < repeat_min; i++)
1713 {
Guido van Rossum557dea11997-12-22 22:46:52 +00001714 memcpy(code, previous, len);
1715 code += len;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001716 }
1717
1718 for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)
1719 {
1720 *code++ = OP_BRAZERO + repeat_type;
Guido van Rossum557dea11997-12-22 22:46:52 +00001721 memcpy(code, previous, len);
1722 code += len;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001723 }
1724 }
1725
1726 /* If the maximum is unlimited, set a repeater in the final copy. */
1727
1728 if (repeat_max == -1) code[-3] = OP_KETRMAX + repeat_type;
1729 }
1730
1731 /* Else there's some kind of shambles */
1732
1733 else
1734 {
Guido van Rossum50700601997-12-08 17:15:20 +00001735 *errorptr = ERR11;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001736 goto FAILED;
1737 }
1738
1739 /* In all case we no longer have a previous item. */
1740
1741 previous = NULL;
1742 break;
1743
1744
1745 /* Start of nested bracket sub-expression, or comment or lookahead.
1746 First deal with special things that can come after a bracket; all are
1747 introduced by ?, and the appearance of any of them means that this is not a
1748 referencing group. They were checked for validity in the first pass over
1749 the string, so we don't have to check for syntax errors here. */
1750
1751 case '(':
1752 previous = code; /* Only real brackets can be repeated */
1753 if (*(++ptr) == '?')
1754 {
1755 bravalue = OP_BRA;
1756
1757 switch (*(++ptr))
1758 {
1759 case '#':
1760 case 'i':
Guido van Rossumbd49ac41997-12-10 23:05:53 +00001761 case 'L':
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001762 case 'm':
1763 case 's':
1764 case 'x':
1765 ptr++;
1766 while (*ptr != ')') ptr++;
1767 previous = NULL;
1768 continue;
1769
1770 case ':': /* Non-extracting bracket */
1771 ptr++;
1772 break;
1773
1774 case '=': /* Assertions can't be repeated */
1775 bravalue = OP_ASSERT;
1776 ptr++;
1777 previous = NULL;
1778 break;
1779
1780 case '!':
1781 bravalue = OP_ASSERT_NOT;
1782 ptr++;
1783 previous = NULL;
1784 break;
1785
1786 case ('P'):
1787 ptr++;
1788 if (*ptr=='<')
1789 {
1790 /* (?P<groupname>...) */
1791 int idlen;
1792 PyObject *string, *intobj;
1793
1794 ptr++;
1795 idlen = get_group_id(ptr, '>', errorptr);
1796 if (*errorptr) {
1797 goto FAILED;
1798 }
Guido van Rossum57ba4f31997-12-02 20:40:28 +00001799 string = PyString_FromStringAndSize((char*)ptr, idlen);
Guido van Rossum50700601997-12-08 17:15:20 +00001800 intobj = PyInt_FromLong( brackets[0] + 1 );
Guido van Rossum58132c61997-12-17 00:24:13 +00001801 if (intobj == NULL || string == NULL)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001802 {
1803 Py_XDECREF(string);
1804 Py_XDECREF(intobj);
1805 *errorptr = "exception raised";
1806 goto FAILED;
1807 }
1808 PyDict_SetItem(dictionary, string, intobj);
Guido van Rossum58132c61997-12-17 00:24:13 +00001809 Py_DECREF(string); Py_DECREF(intobj); /* XXX DECREF commented out! */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001810 ptr += idlen+1; /* Point to rest of expression */
1811 goto do_grouping_bracket;
1812 }
1813 if (*ptr=='=')
1814 {
1815 /* (?P=groupname) */
1816 int idlen, refnum;
1817 PyObject *string, *intobj;
1818
1819 ptr++;
1820 idlen = get_group_id(ptr, ')', errorptr);
1821 if (*errorptr) {
1822 goto FAILED;
1823 }
Guido van Rossum50700601997-12-08 17:15:20 +00001824 string = PyString_FromStringAndSize((char *)ptr, idlen);
Guido van Rossumc3861071997-10-08 02:07:40 +00001825 if (string==NULL) {
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001826 *errorptr = "exception raised";
1827 goto FAILED;
1828 }
1829 intobj = PyDict_GetItem(dictionary, string);
1830 if (intobj==NULL) {
Guido van Rossumc3861071997-10-08 02:07:40 +00001831 Py_DECREF(string);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001832 *errorptr = "?P= group identifier isn't defined";
1833 goto FAILED;
1834 }
1835
1836 refnum = PyInt_AsLong(intobj);
Guido van Rossum1eadb411997-12-15 17:33:24 +00001837 Py_DECREF(string);
Guido van Rossum58132c61997-12-17 00:24:13 +00001838 /* The caller doesn't own the reference to the value
1839 returned from PyDict_GetItem, so intobj is not
1840 DECREF'ed. */
1841
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001842 *code++ = OP_REF;
1843 *code++ = refnum;
1844 /* The continue will cause the top-level for() loop to
1845 be resumed, so ptr will be immediately incremented.
1846 Therefore, the following line adds just idlen, not
1847 idlen+1 */
1848 ptr += idlen;
1849 continue;
1850 }
1851 /* The character after ?P is neither < nor =, so
1852 report an error. Add more Python-extensions here. */
1853 *errorptr="unknown after (?P";
1854 goto FAILED;
1855 break;
Guido van Rossum50700601997-12-08 17:15:20 +00001856
1857 case '>': /* "Match once" brackets */
1858 if ((options & PCRE_EXTRA) != 0) /* Not yet standard */
1859 {
1860 bravalue = OP_ONCE;
1861 ptr++;
1862 previous = NULL;
1863 break;
1864 }
1865 /* Else fall through */
1866
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001867 default:
Guido van Rossum50700601997-12-08 17:15:20 +00001868 *errorptr = ERR12;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001869 goto FAILED;
1870 }
1871 }
1872
1873 /* Else we have a referencing group */
1874
1875 else
1876 {
1877 do_grouping_bracket:
Guido van Rossum50700601997-12-08 17:15:20 +00001878 if (++(*brackets) > EXTRACT_MAX)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001879 {
Guido van Rossum50700601997-12-08 17:15:20 +00001880 *errorptr = ERR13;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001881 goto FAILED;
1882 }
Guido van Rossum50700601997-12-08 17:15:20 +00001883 bravalue = OP_BRA + *brackets;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001884 }
1885
1886 /* Process nested bracketed re; at end pointer is on the bracket. We copy
1887 code into a non-register variable in order to be able to pass its address
1888 because some compilers complain otherwise. */
1889
1890 *code = bravalue;
1891 {
1892 uschar *mcode = code;
Guido van Rossum50700601997-12-08 17:15:20 +00001893 if (!compile_regex(options, brackets, &mcode, &ptr, errorptr, dictionary))
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001894 goto FAILED;
1895 code = mcode;
1896 }
1897
1898 if (*ptr != ')')
1899 {
Guido van Rossum50700601997-12-08 17:15:20 +00001900 *errorptr = ERR14;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001901 goto FAILED;
1902 }
1903 break;
1904
1905 /* Check \ for being a real metacharacter; if not, fall through and handle
1906 it as a data character at the start of a string. Escape items are checked
1907 for validity in the pre-compiling pass. */
1908
1909 case '\\':
1910 oldptr = ptr;
Guido van Rossum50700601997-12-08 17:15:20 +00001911 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001912
1913 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1914 are arranged to be the negation of the corresponding OP_values. For the
1915 back references, the values are ESC_REF plus the reference number. Only
1916 back references and those types that consume a character may be repeated.
1917 We can test for values between ESC_b and ESC_Z for the latter; this may
1918 have to change if any new ones are ever created. */
1919
1920 if (c < 0)
1921 {
1922 if (-c >= ESC_REF)
1923 {
Guido van Rossum50700601997-12-08 17:15:20 +00001924 int refnum = -c - ESC_REF;
1925 if (*brackets < refnum)
1926 {
1927 *errorptr = ERR15;
1928 goto FAILED;
1929 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001930 previous = code;
1931 *code++ = OP_REF;
1932 *code++ = refnum;
1933 }
1934 else
1935 {
Guido van Rossum50700601997-12-08 17:15:20 +00001936 previous = (-c > ESC_b && -c < ESC_X)? code : NULL;
1937 if ( (options & PCRE_LOCALE) != 0)
1938 {
1939 switch (c)
1940 {
1941 case (-ESC_b): c = -OP_WORD_BOUNDARY_L; break;
1942 case (-ESC_B): c = -OP_NOT_WORD_BOUNDARY_L; break;
1943 case (-ESC_w): c = -OP_WORDCHAR_L; break;
1944 case (-ESC_W): c = -OP_NOT_WORDCHAR_L; break;
1945 }
1946 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001947 *code++ = -c;
1948 }
1949 continue;
1950 }
1951
Guido van Rossum58132c61997-12-17 00:24:13 +00001952 /* Data character: Reset and fall through */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001953
1954 ptr = oldptr;
1955 c = '\\';
1956
1957 /* Handle a run of data characters until a metacharacter is encountered.
1958 The first character is guaranteed not to be whitespace or # when the
1959 extended flag is set. */
1960
Guido van Rossum50700601997-12-08 17:15:20 +00001961 NORMAL_CHAR:
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001962 default:
1963 previous = code;
1964 *code = OP_CHARS;
1965 code += 2;
1966 length = 0;
1967
1968 do
1969 {
Guido van Rossum50700601997-12-08 17:15:20 +00001970 if ((options & PCRE_EXTENDED) != 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001971 {
1972 if ((pcre_ctypes[c] & ctype_space) != 0) continue;
1973 if (c == '#')
1974 {
1975 while ((c = *(++ptr)) != 0 && c != '\n');
1976 if (c == 0) break;
1977 continue;
1978 }
1979 }
1980
1981 /* Backslash may introduce a data char or a metacharacter. Escaped items
1982 are checked for validity in the pre-compiling pass. Stop the string
1983 before a metaitem. */
1984
1985 if (c == '\\')
1986 {
1987 oldptr = ptr;
Guido van Rossum50700601997-12-08 17:15:20 +00001988 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001989 if (c < 0) { ptr = oldptr; break; }
1990 }
1991
1992 /* Ordinary character or single-char escape */
1993
1994 *code++ = c;
1995 length++;
1996 }
1997
1998 /* This "while" is the end of the "do" above. */
1999
2000 while (length < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
2001
2002 /* Compute the length and set it in the data vector, and advance to
2003 the next state. */
2004
2005 previous[1] = length;
2006 ptr--;
2007 break;
2008 }
2009 } /* end of big loop */
2010
2011/* Control never reaches here by falling through, only by a goto for all the
2012error states. Pass back the position in the pattern so that it can be displayed
2013to the user for diagnosing the error. */
2014
2015FAILED:
2016*ptrptr = ptr;
2017return FALSE;
2018}
2019
2020
2021
2022
2023/*************************************************
2024* Compile sequence of alternatives *
2025*************************************************/
2026
2027/* On entry, ptr is pointing past the bracket character, but on return
2028it points to the closing bracket, or vertical bar, or end of string.
2029The code variable is pointing at the byte into which the BRA operator has been
2030stored.
2031
2032Argument:
Guido van Rossum50700601997-12-08 17:15:20 +00002033 options the option bits
2034 brackets -> int containing the number of extracting brackets used
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002035 codeptr -> the address of the current code pointer
2036 ptrptr -> the address of the current pattern pointer
2037 errorptr -> pointer to error message
2038
2039Returns: TRUE on success
2040*/
2041
2042static BOOL
Guido van Rossum50700601997-12-08 17:15:20 +00002043compile_regex(int options, int *brackets, uschar **codeptr,
Guido van Rossum58132c61997-12-17 00:24:13 +00002044 const uschar **ptrptr, const char **errorptr, PyObject *dictionary)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002045{
Guido van Rossum58132c61997-12-17 00:24:13 +00002046const uschar *ptr = *ptrptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002047uschar *code = *codeptr;
2048uschar *start_bracket = code;
2049
2050for (;;)
2051 {
2052 int length;
2053 uschar *last_branch = code;
2054
2055 code += 3;
Guido van Rossum50700601997-12-08 17:15:20 +00002056 if (!compile_branch(options, brackets, &code, &ptr, errorptr, dictionary))
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002057 {
2058 *ptrptr = ptr;
2059 return FALSE;
2060 }
2061
2062 /* Fill in the length of the last branch */
2063
2064 length = code - last_branch;
2065 last_branch[1] = length >> 8;
2066 last_branch[2] = length & 255;
2067
2068 /* Reached end of expression, either ')' or end of pattern. Insert a
2069 terminating ket and the length of the whole bracketed item, and return,
2070 leaving the pointer at the terminating char. */
2071
2072 if (*ptr != '|')
2073 {
2074 length = code - start_bracket;
2075 *code++ = OP_KET;
2076 *code++ = length >> 8;
2077 *code++ = length & 255;
2078 *codeptr = code;
2079 *ptrptr = ptr;
2080 return TRUE;
2081 }
2082
2083 /* Another branch follows; insert an "or" node and advance the pointer. */
2084
2085 *code = OP_ALT;
2086 ptr++;
2087 }
2088/* Control never reaches here */
2089}
2090
2091
2092
2093/*************************************************
2094* Check for anchored expression *
2095*************************************************/
2096
2097/* Try to find out if this is an anchored regular expression. Consider each
2098alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
2099all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
2100it's anchored. However, if this is a multiline pattern, then only OP_SOD
2101counts, since OP_CIRC can match in the middle.
2102
2103A branch is also implicitly anchored if it starts with .* because that will try
2104the rest of the pattern at all possible matching points, so there is no point
2105trying them again.
2106
2107Argument: points to start of expression (the bracket)
2108Returns: TRUE or FALSE
2109*/
2110
2111static BOOL
Guido van Rossum58132c61997-12-17 00:24:13 +00002112is_anchored(register const uschar *code, BOOL multiline)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002113{
2114do {
2115 int op = (int)code[3];
Guido van Rossum50700601997-12-08 17:15:20 +00002116 if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002117 { if (!is_anchored(code+3, multiline)) return FALSE; }
2118 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
2119 { if (code[4] != OP_ANY) return FALSE; }
2120 else if (op != OP_SOD && (multiline || op != OP_CIRC)) return FALSE;
2121 code += (code[1] << 8) + code[2];
2122 }
2123while (*code == OP_ALT);
2124return TRUE;
2125}
2126
2127
2128
2129/*************************************************
2130* Check for start with \n line expression *
2131*************************************************/
2132
2133/* This is called for multiline expressions to try to find out if every branch
2134starts with ^ so that "first char" processing can be done to speed things up.
2135
2136Argument: points to start of expression (the bracket)
2137Returns: TRUE or FALSE
2138*/
2139
2140static BOOL
Guido van Rossum58132c61997-12-17 00:24:13 +00002141is_startline(const uschar *code)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002142{
2143do {
2144 if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
2145 { if (!is_startline(code+3)) return FALSE; }
2146 else if (code[3] != OP_CIRC) return FALSE;
2147 code += (code[1] << 8) + code[2];
2148 }
2149while (*code == OP_ALT);
2150return TRUE;
2151}
2152
2153
2154
2155/*************************************************
2156* Check for fixed first char *
2157*************************************************/
2158
2159/* Try to find out if there is a fixed first character. This is called for
2160unanchored expressions, as it speeds up their processing quite considerably.
2161Consider each alternative branch. If they all start with the same char, or with
2162a bracket all of whose alternatives start with the same char (recurse ad lib),
2163then we return that char, otherwise -1.
2164
2165Argument: points to start of expression (the bracket)
2166Returns: -1 or the fixed first char
2167*/
2168
2169static int
2170find_firstchar(uschar *code)
2171{
2172register int c = -1;
2173do
2174 {
2175 register int charoffset = 4;
2176
2177 if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
2178 {
2179 register int d;
2180 if ((d = find_firstchar(code+3)) < 0) return -1;
2181 if (c < 0) c = d; else if (c != d) return -1;
2182 }
2183
2184 else switch(code[3])
2185 {
2186 default:
2187 return -1;
2188
2189 case OP_EXACT: /* Fall through */
2190 charoffset++;
2191
2192 case OP_CHARS: /* Fall through */
2193 charoffset++;
2194
2195 case OP_PLUS:
2196 case OP_MINPLUS:
2197 if (c < 0) c = code[charoffset]; else if (c != code[charoffset]) return -1;
2198 break;
2199 }
2200 code += (code[1] << 8) + code[2];
2201 }
2202while (*code == OP_ALT);
2203return c;
2204}
2205
2206
2207
2208/*************************************************
2209* Compile a Regular Expression *
2210*************************************************/
2211
2212/* This function takes a string and returns a pointer to a block of store
2213holding a compiled version of the expression.
2214
2215Arguments:
2216 pattern the regular expression
2217 options various option bits
2218 errorptr pointer to pointer to error text
2219 erroroffset ptr offset in pattern where error was detected
2220
2221Returns: pointer to compiled data block, or NULL on error,
2222 with errorptr and erroroffset set
2223*/
2224
2225pcre *
Guido van Rossum58132c61997-12-17 00:24:13 +00002226pcre_compile(const char *pattern, int options, const char **errorptr,
Guido van Rossum50700601997-12-08 17:15:20 +00002227 int *erroroffset, PyObject *dictionary)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002228{
2229real_pcre *re;
2230int spaces = 0;
2231int length = 3; /* For initial BRA plus length */
2232int runlength;
2233int c, size;
Guido van Rossum50700601997-12-08 17:15:20 +00002234int bracount = 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002235int brastack[200];
Guido van Rossum50700601997-12-08 17:15:20 +00002236int top_backref = 0;
Guido van Rossum58132c61997-12-17 00:24:13 +00002237unsigned int brastackptr = 0;
2238uschar *code;
2239const uschar *ptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002240
2241#ifdef DEBUG
2242uschar *code_base, *code_end;
2243#endif
2244
Guido van Rossum50700601997-12-08 17:15:20 +00002245/* We can't pass back an error message if errorptr is NULL; I guess the best we
2246can do is just return NULL. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002247
Guido van Rossum50700601997-12-08 17:15:20 +00002248if (errorptr == NULL) return NULL;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002249*errorptr = NULL;
Guido van Rossum50700601997-12-08 17:15:20 +00002250
2251/* However, we can give a message for this error */
2252
2253if (erroroffset == NULL)
2254 {
2255 *errorptr = ERR16;
2256 return NULL;
2257 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002258*erroroffset = 0;
2259
2260if ((options & ~PUBLIC_OPTIONS) != 0)
2261 {
Guido van Rossum50700601997-12-08 17:15:20 +00002262 *errorptr = ERR17;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002263 return NULL;
2264 }
2265
Guido van Rossum557dea11997-12-22 22:46:52 +00002266DPRINTF(("------------------------------------------------------------------\n"));
2267DPRINTF(("%s\n", pattern));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002268
2269/* The first thing to do is to make a pass over the pattern to compute the
2270amount of store required to hold the compiled code. This does not have to be
2271perfect as long as errors are overestimates. At the same time we can detect any
2272internal flag settings. Make an attempt to correct for any counted white space
2273if an "extended" flag setting appears late in the pattern. We can't be so
2274clever for #-comments. */
2275
Guido van Rossum58132c61997-12-17 00:24:13 +00002276ptr = (const uschar *)(pattern - 1);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002277while ((c = *(++ptr)) != 0)
2278 {
Guido van Rossum50700601997-12-08 17:15:20 +00002279 int min, max;
2280 int class_charcount;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002281
2282 if ((pcre_ctypes[c] & ctype_space) != 0)
2283 {
Guido van Rossum50700601997-12-08 17:15:20 +00002284 if ((options & PCRE_EXTENDED) != 0) continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002285 spaces++;
2286 }
2287
Guido van Rossum50700601997-12-08 17:15:20 +00002288 if (c == '#' && (options & PCRE_EXTENDED) != 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002289 {
2290 while ((c = *(++ptr)) != 0 && c != '\n');
2291 continue;
2292 }
2293
2294 switch(c)
2295 {
2296 /* A backslashed item may be an escaped "normal" character or a
2297 character type. For a "normal" character, put the pointers and
2298 character back so that tests for whitespace etc. in the input
2299 are done correctly. */
2300
2301 case '\\':
2302 {
Guido van Rossum58132c61997-12-17 00:24:13 +00002303 const uschar *save_ptr = ptr;
Guido van Rossum50700601997-12-08 17:15:20 +00002304 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002305 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2306 if (c >= 0)
2307 {
2308 ptr = save_ptr;
2309 c = '\\';
2310 goto NORMAL_CHAR;
2311 }
2312 }
2313 length++;
2314
2315 /* A back reference needs an additional char, plus either one or 5
Guido van Rossum50700601997-12-08 17:15:20 +00002316 bytes for a repeat. We also need to keep the value of the highest
2317 back reference. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002318
2319 if (c <= -ESC_REF)
2320 {
Guido van Rossum50700601997-12-08 17:15:20 +00002321 int refnum = -c - ESC_REF;
2322 if (refnum > top_backref) top_backref = refnum;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002323 length++; /* For single back reference */
Guido van Rossum50700601997-12-08 17:15:20 +00002324 if (ptr[1] == '{' && is_counted_repeat(ptr+2))
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002325 {
2326 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
2327 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2328 if ((min == 0 && (max == 1 || max == -1)) ||
2329 (min == 1 && max == -1))
2330 length++;
2331 else length += 5;
2332 if (ptr[1] == '?') ptr++;
2333 }
2334 }
2335 continue;
2336
2337 case '^':
2338 case '.':
2339 case '$':
2340 case '*': /* These repeats won't be after brackets; */
2341 case '+': /* those are handled separately */
2342 case '?':
2343 length++;
2344 continue;
2345
2346 /* This covers the cases of repeats after a single char, metachar, class,
2347 or back reference. */
2348
2349 case '{':
Guido van Rossum50700601997-12-08 17:15:20 +00002350 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002351 ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);
2352 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2353 if ((min == 0 && (max == 1 || max == -1)) ||
2354 (min == 1 && max == -1))
2355 length++;
2356 else
2357 {
2358 length--; /* Uncount the original char or metachar */
2359 if (min == 1) length++; else if (min > 0) length += 4;
2360 if (max > 0) length += 4; else length += 2;
2361 }
2362 if (ptr[1] == '?') ptr++;
2363 continue;
2364
2365 /* An alternation contains an offset to the next branch or ket. */
2366 case '|':
2367 length += 3;
2368 continue;
2369
Guido van Rossum50700601997-12-08 17:15:20 +00002370 /* A character class uses 33 characters. Don't worry about character types
2371 that aren't allowed in classes - they'll get picked up during the compile.
2372 A character class that contains only one character uses 2 or 3 bytes,
2373 depending on whether it is negated or not. Notice this where we can. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002374
2375 case '[':
Guido van Rossum50700601997-12-08 17:15:20 +00002376 class_charcount = 0;
2377 if (*(++ptr) == '^') ptr++;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002378 do
2379 {
Guido van Rossum50700601997-12-08 17:15:20 +00002380 if (*ptr == '\\')
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002381 {
Guido van Rossum557dea11997-12-22 22:46:52 +00002382 int ch = check_escape(&ptr, errorptr, bracount, options, TRUE);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002383 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
Guido van Rossum557dea11997-12-22 22:46:52 +00002384 if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002385 }
Guido van Rossum50700601997-12-08 17:15:20 +00002386 else class_charcount++;
2387 ptr++;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002388 }
2389 while (*ptr != 0 && *ptr != ']');
2390
Guido van Rossum50700601997-12-08 17:15:20 +00002391 /* Repeats for negated single chars are handled by the general code */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002392
Guido van Rossum50700601997-12-08 17:15:20 +00002393 if (class_charcount == 1) length += 3; else
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002394 {
Guido van Rossum50700601997-12-08 17:15:20 +00002395 length += 33;
2396 if (options & PCRE_LOCALE) length++; /* Add a byte for the localization flag */
2397
2398 /* A repeat needs either 1 or 5 bytes. */
2399
Guido van Rossum557dea11997-12-22 22:46:52 +00002400 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))
Guido van Rossum50700601997-12-08 17:15:20 +00002401 {
2402 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
2403 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2404 if ((min == 0 && (max == 1 || max == -1)) ||
2405 (min == 1 && max == -1))
2406 length++;
2407 else length += 5;
2408 if (ptr[1] == '?') ptr++;
2409 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002410 }
2411 continue;
2412
2413 /* Brackets may be genuine groups or special things */
2414
2415 case '(':
2416
2417 /* Handle special forms of bracket, which all start (? */
2418
2419 if (ptr[1] == '?') switch (c = ptr[2])
2420 {
2421 /* Skip over comments entirely */
2422 case '#':
2423 ptr += 3;
2424 while (*ptr != 0 && *ptr != ')') ptr++;
2425 if (*ptr == 0)
2426 {
Guido van Rossum50700601997-12-08 17:15:20 +00002427 *errorptr = ERR18;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002428 goto PCRE_ERROR_RETURN;
2429 }
2430 continue;
2431
2432 /* Non-referencing groups and lookaheads just move the pointer on, and
Guido van Rossum50700601997-12-08 17:15:20 +00002433 then behave like a non-special bracket, except that they don't increment
2434 the count of extracting brackets. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002435
2436 case ':':
2437 case '=':
2438 case '!':
2439 ptr += 2;
2440 break;
2441
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002442 case ('P'):
2443 {
2444 int idlen;
2445 switch (*ptr++) {
2446 case ('<'):
2447 idlen = get_group_id(ptr++, '>', errorptr);
2448 if (*errorptr) goto PCRE_ERROR_RETURN;
2449 ptr += idlen+1;
2450 break;
2451 case ('='):
2452 idlen = get_group_id(ptr++, ')', errorptr);
2453 if (*errorptr) goto PCRE_ERROR_RETURN;
2454 ptr += idlen+1;
2455 length++;
2456 break;
2457 }
2458 }
2459 break;
2460
Guido van Rossum50700601997-12-08 17:15:20 +00002461 /* Ditto for the "once only" bracket, allowed only if the extra bit
2462 is set. */
2463
2464 case '>':
2465 if ((options & PCRE_EXTRA) != 0)
2466 {
2467 ptr += 2;
2468 break;
2469 }
2470 /* Else fall thourh */
2471
2472 /* Else loop setting valid options until ) is met. Anything else is an
2473 error. */
2474
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002475 default:
2476 ptr += 2;
2477 for (;; ptr++)
2478 {
2479 if ((c = *ptr) == 'i')
2480 {
2481 options |= PCRE_CASELESS;
2482 continue;
2483 }
Guido van Rossumbd49ac41997-12-10 23:05:53 +00002484 else if ((c = *ptr) == 'L')
Guido van Rossum50700601997-12-08 17:15:20 +00002485 {
2486 options |= PCRE_LOCALE;
2487 continue;
2488 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002489 else if ((c = *ptr) == 'm')
2490 {
2491 options |= PCRE_MULTILINE;
2492 continue;
2493 }
Guido van Rossum50700601997-12-08 17:15:20 +00002494 else if (c == 's')
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002495 {
2496 options |= PCRE_DOTALL;
2497 continue;
2498 }
2499 else if (c == 'x')
2500 {
2501 options |= PCRE_EXTENDED;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002502 length -= spaces; /* Already counted spaces */
2503 continue;
2504 }
2505 else if (c == ')') break;
2506
Guido van Rossum50700601997-12-08 17:15:20 +00002507 *errorptr = ERR12;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002508 goto PCRE_ERROR_RETURN;
2509 }
2510 continue; /* End of this bracket handling */
2511 }
2512
Guido van Rossum50700601997-12-08 17:15:20 +00002513 /* Extracting brackets must be counted so we can process escapes in a
2514 Perlish way. */
2515
2516 else bracount++;
2517
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002518 /* Non-special forms of bracket. Save length for computing whole length
2519 at end if there's a repeat that requires duplication of the group. */
2520
2521 if (brastackptr >= sizeof(brastack)/sizeof(int))
2522 {
Guido van Rossum50700601997-12-08 17:15:20 +00002523 *errorptr = ERR19;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002524 goto PCRE_ERROR_RETURN;
2525 }
2526
2527 brastack[brastackptr++] = length;
2528 length += 3;
2529 continue;
2530
2531 /* Handle ket. Look for subsequent max/min; for certain sets of values we
Guido van Rossum557dea11997-12-22 22:46:52 +00002532 have to replicate this bracket up to that many times. If brastackptr is
2533 0 this is an unmatched bracket which will generate an error, but take care
2534 not to try to access brastack[-1]. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002535
2536 case ')':
2537 length += 3;
2538 {
Guido van Rossum557dea11997-12-22 22:46:52 +00002539 int minval = 1;
2540 int maxval = 1;
2541 int duplength = (brastackptr > 0)? length - brastack[--brastackptr] : 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002542
2543 /* Leave ptr at the final char; for read_repeat_counts this happens
2544 automatically; for the others we need an increment. */
2545
Guido van Rossum50700601997-12-08 17:15:20 +00002546 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002547 {
Guido van Rossum557dea11997-12-22 22:46:52 +00002548 ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002549 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2550 }
Guido van Rossum557dea11997-12-22 22:46:52 +00002551 else if (c == '*') { minval = 0; maxval = -1; ptr++; }
2552 else if (c == '+') { maxval = -1; ptr++; }
2553 else if (c == '?') { minval = 0; ptr++; }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002554
Guido van Rossum557dea11997-12-22 22:46:52 +00002555 /* If there is a minimum > 1 we have to replicate up to minval-1 times;
2556 if there is a limited maximum we have to replicate up to maxval-1 times
2557 and allow for a BRAZERO item before each optional copy, as we also have
2558 to do before the first copy if the minimum is zero. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002559
Guido van Rossum557dea11997-12-22 22:46:52 +00002560 if (minval == 0) length++;
2561 else if (minval > 1) length += (minval - 1) * duplength;
2562 if (maxval > minval) length += (maxval - minval) * (duplength + 1);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002563 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002564 continue;
2565
2566 /* Non-special character. For a run of such characters the length required
2567 is the number of characters + 2, except that the maximum run length is 255.
2568 We won't get a skipped space or a non-data escape or the start of a #
2569 comment as the first character, so the length can't be zero. */
2570
2571 NORMAL_CHAR:
2572 default:
2573 length += 2;
2574 runlength = 0;
2575 do
2576 {
2577 if ((pcre_ctypes[c] & ctype_space) != 0)
2578 {
Guido van Rossum50700601997-12-08 17:15:20 +00002579 if ((options & PCRE_EXTENDED) != 0) continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002580 spaces++;
2581 }
2582
Guido van Rossum50700601997-12-08 17:15:20 +00002583 if (c == '#' && (options & PCRE_EXTENDED) != 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002584 {
2585 while ((c = *(++ptr)) != 0 && c != '\n');
2586 continue;
2587 }
2588
2589 /* Backslash may introduce a data char or a metacharacter; stop the
2590 string before the latter. */
2591
2592 if (c == '\\')
2593 {
Guido van Rossum58132c61997-12-17 00:24:13 +00002594 const uschar *saveptr = ptr;
Guido van Rossum50700601997-12-08 17:15:20 +00002595 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002596 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2597 if (c < 0) { ptr = saveptr; break; }
2598 }
2599
2600 /* Ordinary character or single-char escape */
2601
2602 runlength++;
2603 }
2604
2605 /* This "while" is the end of the "do" above. */
2606
2607 while (runlength < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
2608
2609 ptr--;
2610 length += runlength;
2611 continue;
2612 }
2613 }
2614
2615length += 4; /* For final KET and END */
2616
2617if (length > 65539)
2618 {
Guido van Rossum50700601997-12-08 17:15:20 +00002619 *errorptr = ERR20;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002620 return NULL;
2621 }
2622
2623/* Compute the size of data block needed and get it, either from malloc or
Guido van Rossum557dea11997-12-22 22:46:52 +00002624externally provided function. We specify "code[0]" in the offsetof() expression
2625rather than just "code", because it has been reported that one broken compiler
2626fails on "code" because it is also an independent variable. It should make no
2627difference to the value of the offsetof(). */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002628
Guido van Rossum557dea11997-12-22 22:46:52 +00002629size = length + offsetof(real_pcre, code[0]);
Guido van Rossum50700601997-12-08 17:15:20 +00002630re = (real_pcre *)(pcre_malloc)(size+50);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002631
2632if (re == NULL)
2633 {
Guido van Rossum50700601997-12-08 17:15:20 +00002634 *errorptr = ERR21;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002635 return NULL;
2636 }
2637
Guido van Rossum557dea11997-12-22 22:46:52 +00002638/* Put in the magic number and the options. */
2639
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002640re->magic_number = MAGIC_NUMBER;
2641re->options = options;
2642
2643/* Set up a starting, non-extracting bracket, then compile the expression. On
2644error, *errorptr will be set non-NULL, so we don't need to look at the result
2645of the function here. */
2646
Guido van Rossum58132c61997-12-17 00:24:13 +00002647ptr = (const uschar *)pattern;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002648code = re->code;
2649*code = OP_BRA;
Guido van Rossum50700601997-12-08 17:15:20 +00002650bracount = 0;
2651(void)compile_regex(options, &bracount, &code, &ptr, errorptr, dictionary);
2652re->top_bracket = bracount;
2653re->top_backref = top_backref;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002654
2655/* If not reached end of pattern on success, there's an excess bracket. */
2656
Guido van Rossum50700601997-12-08 17:15:20 +00002657if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
2658
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002659/* Fill in the terminating state and check for disastrous overflow, but
2660if debugging, leave the test till after things are printed out. */
2661
2662*code++ = OP_END;
2663
Guido van Rossum50700601997-12-08 17:15:20 +00002664
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002665#ifndef DEBUG
Guido van Rossum50700601997-12-08 17:15:20 +00002666if (code - re->code > length) *errorptr = ERR23;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002667#endif
2668
2669/* Failed to compile */
2670
2671if (*errorptr != NULL)
2672 {
2673 (pcre_free)(re);
2674 PCRE_ERROR_RETURN:
Guido van Rossum58132c61997-12-17 00:24:13 +00002675 *erroroffset = ptr - (const uschar *)pattern;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002676 return NULL;
2677 }
2678
2679/* If the anchored option was not passed, set flag if we can determine that it
2680is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if
2681we can determine what the first character has to be, because that speeds up
2682unanchored matches no end. In the case of multiline matches, an alternative is
2683to set the PCRE_STARTLINE flag if all branches start with ^. */
2684
2685if ((options & PCRE_ANCHORED) == 0)
2686 {
2687 if (is_anchored(re->code, (options & PCRE_MULTILINE) != 0))
2688 re->options |= PCRE_ANCHORED;
2689 else
2690 {
Guido van Rossum557dea11997-12-22 22:46:52 +00002691 int ch = find_firstchar(re->code);
2692 if (ch >= 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002693 {
Guido van Rossum557dea11997-12-22 22:46:52 +00002694 re->first_char = ch;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002695 re->options |= PCRE_FIRSTSET;
2696 }
2697 else if (is_startline(re->code))
2698 re->options |= PCRE_STARTLINE;
2699 }
2700 }
2701
2702/* Print out the compiled data for debugging */
2703
2704#ifdef DEBUG
2705
Guido van Rossum50700601997-12-08 17:15:20 +00002706printf("Length = %d top_bracket = %d top_backref=%d\n",
2707 length, re->top_bracket, re->top_backref);
2708
2709if (re->options != 0)
2710 {
2711 printf("%s%s%s%s%s%s%s\n",
2712 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
2713 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
2714 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
2715 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
2716 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
2717 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
2718 ((re->options & PCRE_EXTRA) != 0)? "extra " : "");
2719 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002720
2721if ((re->options & PCRE_FIRSTSET) != 0)
2722 {
2723 if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
2724 else printf("First char = \\x%02x\n", re->first_char);
2725 }
2726
2727code_end = code;
2728code_base = code = re->code;
2729
2730while (code < code_end)
2731 {
2732 int charlength;
2733
2734 printf("%3d ", code - code_base);
2735
2736 if (*code >= OP_BRA)
2737 {
2738 printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
2739 code += 2;
2740 }
2741
2742 else switch(*code)
2743 {
2744 case OP_CHARS:
2745 charlength = *(++code);
2746 printf("%3d ", charlength);
2747 while (charlength-- > 0)
2748 if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
2749 break;
2750
2751 case OP_KETRMAX:
2752 case OP_KETRMIN:
2753 case OP_ALT:
2754 case OP_KET:
2755 case OP_ASSERT:
2756 case OP_ASSERT_NOT:
Guido van Rossum50700601997-12-08 17:15:20 +00002757 case OP_ONCE:
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002758 printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2759 code += 2;
2760 break;
2761
2762 case OP_STAR:
2763 case OP_MINSTAR:
2764 case OP_PLUS:
2765 case OP_MINPLUS:
2766 case OP_QUERY:
2767 case OP_MINQUERY:
2768 case OP_TYPESTAR:
2769 case OP_TYPEMINSTAR:
2770 case OP_TYPEPLUS:
2771 case OP_TYPEMINPLUS:
2772 case OP_TYPEQUERY:
2773 case OP_TYPEMINQUERY:
2774 if (*code >= OP_TYPESTAR)
2775 printf(" %s", OP_names[code[1]]);
2776 else if (isprint(c = code[1])) printf(" %c", c);
2777 else printf(" \\x%02x", c);
2778 printf("%s", OP_names[*code++]);
2779 break;
2780
2781 case OP_EXACT:
2782 case OP_UPTO:
2783 case OP_MINUPTO:
2784 if (isprint(c = code[3])) printf(" %c{", c);
2785 else printf(" \\x%02x{", c);
Guido van Rossum557dea11997-12-22 22:46:52 +00002786 if (*code != OP_EXACT) printf("0,");
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002787 printf("%d}", (code[1] << 8) + code[2]);
2788 if (*code == OP_MINUPTO) printf("?");
2789 code += 3;
2790 break;
2791
2792 case OP_TYPEEXACT:
2793 case OP_TYPEUPTO:
2794 case OP_TYPEMINUPTO:
2795 printf(" %s{", OP_names[code[3]]);
2796 if (*code != OP_TYPEEXACT) printf(",");
2797 printf("%d}", (code[1] << 8) + code[2]);
2798 if (*code == OP_TYPEMINUPTO) printf("?");
2799 code += 3;
2800 break;
2801
Guido van Rossum50700601997-12-08 17:15:20 +00002802 case OP_NOT:
2803 if (isprint(c = *(++code))) printf(" [^%c]", c);
2804 else printf(" [^\\x%02x]", c);
2805 break;
2806
2807 case OP_NOTSTAR:
2808 case OP_NOTMINSTAR:
2809 case OP_NOTPLUS:
2810 case OP_NOTMINPLUS:
2811 case OP_NOTQUERY:
2812 case OP_NOTMINQUERY:
2813 if (isprint(c = code[1])) printf(" [^%c]", c);
2814 else printf(" [^\\x%02x]", c);
2815 printf("%s", OP_names[*code++]);
2816 break;
2817
2818 case OP_NOTEXACT:
2819 case OP_NOTUPTO:
2820 case OP_NOTMINUPTO:
2821 if (isprint(c = code[3])) printf(" [^%c]{", c);
2822 else printf(" [^\\x%02x]{", c);
2823 if (*code != OP_NOTEXACT) printf(",");
2824 printf("%d}", (code[1] << 8) + code[2]);
2825 if (*code == OP_NOTMINUPTO) printf("?");
2826 code += 3;
2827 break;
2828
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002829 case OP_REF:
2830 printf(" \\%d", *(++code));
Guido van Rossum557dea11997-12-22 22:46:52 +00002831 code ++;
2832 goto CLASS_REF_REPEAT;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002833
2834 case OP_CLASS:
Guido van Rossum50700601997-12-08 17:15:20 +00002835 case OP_CLASS_L:
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002836 {
2837 int i, min, max;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002838
Guido van Rossum50700601997-12-08 17:15:20 +00002839 if (*code==OP_CLASS_L)
2840 {
2841 code++;
2842 printf("Locflag = %i ", *code++);
2843 }
2844 else
2845 code++;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002846
Guido van Rossum50700601997-12-08 17:15:20 +00002847 printf(" [");
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002848
Guido van Rossum50700601997-12-08 17:15:20 +00002849 for (i = 0; i < 256; i++)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002850 {
Guido van Rossum50700601997-12-08 17:15:20 +00002851 if ((code[i/8] & (1 << (i&7))) != 0)
2852 {
2853 int j;
2854 for (j = i+1; j < 256; j++)
2855 if ((code[j/8] & (1 << (j&7))) == 0) break;
2856 if (i == '-' || i == ']') printf("\\");
2857 if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
2858 if (--j > i)
2859 {
2860 printf("-");
2861 if (j == '-' || j == ']') printf("\\");
2862 if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
2863 }
2864 i = j;
2865 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002866 }
2867 printf("]");
Guido van Rossum50700601997-12-08 17:15:20 +00002868 code += 32;
2869 /* code ++;*/
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002870
Guido van Rossum557dea11997-12-22 22:46:52 +00002871 CLASS_REF_REPEAT:
2872
Guido van Rossum50700601997-12-08 17:15:20 +00002873 switch(*code)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002874 {
2875 case OP_CRSTAR:
2876 case OP_CRMINSTAR:
2877 case OP_CRPLUS:
2878 case OP_CRMINPLUS:
2879 case OP_CRQUERY:
2880 case OP_CRMINQUERY:
2881 printf("%s", OP_names[*code]);
2882 break;
2883
2884 case OP_CRRANGE:
2885 case OP_CRMINRANGE:
2886 min = (code[1] << 8) + code[2];
2887 max = (code[3] << 8) + code[4];
2888 if (max == 0) printf("{%d,}", min);
2889 else printf("{%d,%d}", min, max);
2890 if (*code == OP_CRMINRANGE) printf("?");
2891 code += 4;
2892 break;
2893
2894 default:
2895 code--;
2896 }
2897 }
2898 break;
2899
2900 /* Anything else is just a one-node item */
2901
2902 default:
2903 printf(" %s", OP_names[*code]);
2904 break;
2905 }
2906
2907 code++;
2908 printf("\n");
2909 }
2910printf("------------------------------------------------------------------\n");
2911
2912/* This check is done here in the debugging case so that the code that
2913was compiled can be seen. */
2914
2915if (code - re->code > length)
2916 {
Guido van Rossum50700601997-12-08 17:15:20 +00002917 printf("length=%i, code length=%i\n", length, code-re->code);
2918 *errorptr = ERR23;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002919 (pcre_free)(re);
2920 *erroroffset = ptr - (uschar *)pattern;
2921 return NULL;
2922 }
2923#endif
2924
2925return (pcre *)re;
2926}
2927
2928
2929
2930/*************************************************
2931* Match a character type *
2932*************************************************/
2933
2934/* Not used in all the places it might be as it's sometimes faster
2935to put the code inline.
2936
2937Arguments:
2938 type the character type
2939 c the character
Guido van Rossum50700601997-12-08 17:15:20 +00002940 dotall the dotall flag
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002941
2942Returns: TRUE if character is of the type
2943*/
2944
2945static BOOL
2946match_type(int type, int c, BOOL dotall)
2947{
2948
2949#ifdef DEBUG
2950if (isprint(c)) printf("matching subject %c against ", c);
2951 else printf("matching subject \\x%02x against ", c);
2952printf("%s\n", OP_names[type]);
2953#endif
2954
2955switch(type)
2956 {
2957 case OP_ANY: return dotall || c != '\n';
2958 case OP_NOT_DIGIT: return (pcre_ctypes[c] & ctype_digit) == 0;
2959 case OP_DIGIT: return (pcre_ctypes[c] & ctype_digit) != 0;
2960 case OP_NOT_WHITESPACE: return (pcre_ctypes[c] & ctype_space) == 0;
2961 case OP_WHITESPACE: return (pcre_ctypes[c] & ctype_space) != 0;
2962 case OP_NOT_WORDCHAR: return (pcre_ctypes[c] & ctype_word) == 0;
2963 case OP_WORDCHAR: return (pcre_ctypes[c] & ctype_word) != 0;
Guido van Rossum58132c61997-12-17 00:24:13 +00002964 case OP_NOT_WORDCHAR_L: return (c!='_' && !isalnum(c));
2965 case OP_WORDCHAR_L: return (c=='_' || isalnum(c));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002966 }
2967return FALSE;
2968}
2969
2970
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002971
2972/*************************************************
2973* Match a back-reference *
2974*************************************************/
2975
2976/* If a back reference hasn't been set, the match fails.
2977
2978Arguments:
2979 number reference number
2980 eptr points into the subject
2981 length length to be matched
2982 md points to match data block
2983
2984Returns: TRUE if matched
2985*/
2986
2987static BOOL
Guido van Rossum58132c61997-12-17 00:24:13 +00002988match_ref(int number, register const uschar *eptr, int length, match_data *md)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002989{
Guido van Rossum58132c61997-12-17 00:24:13 +00002990const uschar *p = md->start_subject + md->offset_vector[number];
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002991
2992#ifdef DEBUG
2993if (eptr >= md->end_subject)
2994 printf("matching subject <null>");
2995else
2996 {
2997 printf("matching subject ");
2998 pchars(eptr, length, TRUE, md);
2999 }
3000printf(" against backref ");
3001pchars(p, length, FALSE, md);
3002printf("\n");
3003#endif
3004
3005/* Always fail if not enough characters left */
3006
3007if (length > md->end_subject - p) return FALSE;
3008
Guido van Rossum58132c61997-12-17 00:24:13 +00003009/* Separate the caseless case for speed */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003010
3011if (md->caseless)
3012 { while (length-- > 0) if (pcre_lcc[*p++] != pcre_lcc[*eptr++]) return FALSE; }
3013else
3014 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
3015
3016return TRUE;
3017}
3018
3019static int free_stack(match_data *md)
3020{
3021/* Free any stack space that was allocated by the call to match(). */
3022if (md->off_num) free(md->off_num);
3023if (md->offset_top) free(md->offset_top);
3024if (md->r1) free(md->r1);
3025if (md->r2) free(md->r2);
3026if (md->eptr) free(md->eptr);
Guido van Rossumc3861071997-10-08 02:07:40 +00003027if (md->ecode) free(md->ecode);
3028return 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003029}
3030
3031static int grow_stack(match_data *md)
3032{
Guido van Rossum50700601997-12-08 17:15:20 +00003033 if (md->length != 0)
3034 {
3035 md->length = md->length + md->length/2;
3036 }
3037 else
3038 {
3039 int string_len = md->end_subject - md->start_subject + 1;
3040 if (string_len < 80) {md->length = string_len; }
3041 else {md->length = 80;}
3042 }
3043 PyMem_RESIZE(md->offset_top, int, md->length);
Guido van Rossum58132c61997-12-17 00:24:13 +00003044 PyMem_RESIZE(md->eptr, const uschar *, md->length);
3045 PyMem_RESIZE(md->ecode, const uschar *, md->length);
Guido van Rossum50700601997-12-08 17:15:20 +00003046 PyMem_RESIZE(md->off_num, int, md->length);
3047 PyMem_RESIZE(md->r1, int, md->length);
3048 PyMem_RESIZE(md->r2, int, md->length);
3049 if (md->offset_top == NULL || md->eptr == NULL || md->ecode == NULL ||
3050 md->off_num == NULL || md->r1 == NULL || md->r2 == NULL)
3051 {
3052 PyErr_SetString(PyExc_MemoryError, "Can't increase failure stack for re operation");
3053 longjmp(md->error_env, 1);
3054 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003055 return 0;
3056}
3057
Guido van Rossum50700601997-12-08 17:15:20 +00003058
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003059/*************************************************
3060* Match from current position *
3061*************************************************/
3062
3063/* On entry ecode points to the first opcode, and eptr to the first character.
3064
3065Arguments:
3066 eptr pointer in subject
3067 ecode position in code
3068 offset_top current top pointer
3069 md pointer to "static" info for the match
3070
3071Returns: TRUE if matched
3072*/
3073
3074static BOOL
Guido van Rossum58132c61997-12-17 00:24:13 +00003075match(register const uschar *eptr, register const uschar *ecode, int offset_top,
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003076 match_data *md)
3077{
3078 int save_stack_position = md->point;
3079match_loop:
3080
3081#define SUCCEED goto succeed
3082#define FAIL goto fail
3083
3084for (;;)
3085 {
3086 int min, max, ctype;
3087 register int i;
3088 register int c;
Guido van Rossum58132c61997-12-17 00:24:13 +00003089 BOOL minimize = FALSE;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003090
3091 /* Opening bracket. Check the alternative branches in turn, failing if none
3092 match. We have to set the start offset if required and there is space
3093 in the offset vector so that it is available for subsequent back references
3094 if the bracket matches. However, if the bracket fails, we must put back the
3095 previous value of both offsets in case they were set by a previous copy of
3096 the same bracket. Don't worry about setting the flag for the error case here;
3097 that is handled in the code for KET. */
3098
3099 if ((int)*ecode >= OP_BRA)
3100 {
3101 int number = (*ecode - OP_BRA) << 1;
Guido van Rossum58132c61997-12-17 00:24:13 +00003102 int save_offset1 = 0, save_offset2 = 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003103
Guido van Rossum557dea11997-12-22 22:46:52 +00003104 DPRINTF(("start bracket %d\n", number/2));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003105
3106 if (number > 0 && number < md->offset_end)
3107 {
3108 save_offset1 = md->offset_vector[number];
3109 save_offset2 = md->offset_vector[number+1];
3110 md->offset_vector[number] = eptr - md->start_subject;
3111
Guido van Rossum557dea11997-12-22 22:46:52 +00003112 DPRINTF(("saving %d %d\n", save_offset1, save_offset2));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003113 }
3114
3115 /* Recurse for all the alternatives. */
3116
3117 do
3118 {
3119 if (match(eptr, ecode+3, offset_top, md)) SUCCEED;
3120 ecode += (ecode[1] << 8) + ecode[2];
3121 }
3122 while (*ecode == OP_ALT);
3123
Guido van Rossum557dea11997-12-22 22:46:52 +00003124 DPRINTF(("bracket %d failed\n", number/2));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003125
3126 if (number > 0 && number < md->offset_end)
3127 {
3128 md->offset_vector[number] = save_offset1;
3129 md->offset_vector[number+1] = save_offset2;
3130 }
3131
3132 FAIL;
3133 }
3134
3135 /* Other types of node can be handled by a switch */
3136
3137 switch(*ecode)
3138 {
3139 case OP_END:
3140 md->end_match_ptr = eptr; /* Record where we ended */
3141 md->end_offset_top = offset_top; /* and how many extracts were taken */
3142 SUCCEED;
3143
Guido van Rossum50700601997-12-08 17:15:20 +00003144 /* The equivalent of Prolog's "cut" - if the rest doesn't match, the
3145 whole thing doesn't match, so we have to get out via a longjmp(). */
3146
3147 case OP_CUT:
3148 if (match(eptr, ecode+1, offset_top, md)) SUCCEED;
3149 longjmp(md->fail_env, 1);
3150
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003151 /* Assertion brackets. Check the alternative branches in turn - the
3152 matching won't pass the KET for an assertion. If any one branch matches,
3153 the assertion is true. */
3154
3155 case OP_ASSERT:
3156 do
3157 {
3158 if (match(eptr, ecode+3, offset_top, md)) break;
3159 ecode += (ecode[1] << 8) + ecode[2];
3160 }
3161 while (*ecode == OP_ALT);
3162 if (*ecode == OP_KET) FAIL;
3163
3164 /* Continue from after the assertion, updating the offsets high water
3165 mark, since extracts may have been taken during the assertion. */
3166
3167 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3168 ecode += 3;
3169 offset_top = md->end_offset_top;
3170 continue;
3171
3172 /* Negative assertion: all branches must fail to match */
3173
3174 case OP_ASSERT_NOT:
3175 do
3176 {
3177 if (match(eptr, ecode+3, offset_top, md)) FAIL;
3178 ecode += (ecode[1] << 8) + ecode[2];
3179 }
3180 while (*ecode == OP_ALT);
3181 ecode += 3;
3182 continue;
3183
Guido van Rossum50700601997-12-08 17:15:20 +00003184 /* "Once" brackets are like assertion brackets except that after a match,
3185 the point in the subject string is not moved back. Thus there can never be
3186 a move back into the brackets. Check the alternative branches in turn - the
3187 matching won't pass the KET for this kind of subpattern. If any one branch
3188 matches, we carry on, leaving the subject pointer. */
3189
3190 case OP_ONCE:
3191 do
3192 {
3193 if (match(eptr, ecode+3, offset_top, md)) break;
3194 ecode += (ecode[1] << 8) + ecode[2];
3195 }
3196 while (*ecode == OP_ALT);
Guido van Rossum557dea11997-12-22 22:46:52 +00003197 if (*ecode == OP_KET) FAIL;
Guido van Rossum50700601997-12-08 17:15:20 +00003198
3199 /* Continue as from after the assertion, updating the offsets high water
3200 mark, since extracts may have been taken. */
3201
3202 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3203 ecode += 3;
3204 offset_top = md->end_offset_top;
3205 eptr = md->end_match_ptr;
3206 continue;
3207
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003208 /* An alternation is the end of a branch; scan along to find the end of the
3209 bracketed group and go to there. */
3210
3211 case OP_ALT:
3212 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3213 break;
3214
3215 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
3216 that it may occur zero times. It may repeat infinitely, or not at all -
3217 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
3218 repeat limits are compiled as a number of copies, with the optional ones
3219 preceded by BRAZERO or BRAMINZERO. */
3220
3221 case OP_BRAZERO:
3222 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003223 const uschar *next = ecode+1;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003224 if (match(eptr, next, offset_top, md)) SUCCEED;
3225 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3226 ecode = next + 3;
3227 }
3228 break;
3229
3230 case OP_BRAMINZERO:
3231 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003232 const uschar *next = ecode+1;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003233 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3234 if (match(eptr, next+3, offset_top, md)) SUCCEED;
3235 ecode++;
3236 }
3237 break;;
3238
3239 /* End of a group, repeated or non-repeating. If we are at the end of
3240 an assertion "group", stop matching and SUCCEED, but record the
3241 current high water mark for use by positive assertions. */
3242
3243 case OP_KET:
3244 case OP_KETRMIN:
3245 case OP_KETRMAX:
3246 {
Guido van Rossum50700601997-12-08 17:15:20 +00003247 int number;
Guido van Rossum58132c61997-12-17 00:24:13 +00003248 const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003249
Guido van Rossum50700601997-12-08 17:15:20 +00003250 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT || *prev == OP_ONCE)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003251 {
Guido van Rossum50700601997-12-08 17:15:20 +00003252 md->end_match_ptr = eptr; /* For ONCE */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003253 md->end_offset_top = offset_top;
3254 SUCCEED;
3255 }
3256
3257 /* In all other cases we have to check the group number back at the
3258 start and if necessary complete handling an extraction by setting the
3259 final offset and bumping the high water mark. */
3260
3261 number = (*prev - OP_BRA) << 1;
3262
Guido van Rossum557dea11997-12-22 22:46:52 +00003263 DPRINTF(("end bracket %d\n", number/2));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003264
3265 if (number > 0)
3266 {
3267 if (number >= md->offset_end) md->offset_overflow = TRUE; else
3268 {
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003269 md->offset_vector[number+1] = eptr - md->start_subject;
3270 if (offset_top <= number) offset_top = number + 2;
3271 }
3272 }
3273
3274 /* For a non-repeating ket, just advance to the next node and continue at
3275 this level. */
3276
3277 if (*ecode == OP_KET)
3278 {
3279 ecode += 3;
3280 break;
3281 }
3282
3283 /* The repeating kets try the rest of the pattern or restart from the
3284 preceding bracket, in the appropriate order. */
3285
3286 if (*ecode == OP_KETRMIN)
3287 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003288 const uschar *ptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003289 if (match(eptr, ecode+3, offset_top, md)) goto succeed;
3290 /* Handle alternation inside the BRA...KET; push the additional
Guido van Rossum58132c61997-12-17 00:24:13 +00003291 alternatives onto the stack */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003292 ptr=prev;
3293 do {
3294 ptr += (ptr[1]<<8)+ ptr[2];
3295 if (*ptr==OP_ALT)
3296 {
Guido van Rossum50700601997-12-08 17:15:20 +00003297 if (md->length == md->point)
3298 {
3299 grow_stack(md);
3300 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003301 md->offset_top[md->point] = offset_top;
3302 md->eptr[md->point] = eptr;
3303 md->ecode[md->point] = ptr+3;
3304 md->r1[md->point] = 0;
3305 md->r2[md->point] = 0;
3306 md->off_num[md->point] = 0;
3307 md->point++;
3308 }
3309 } while (*ptr==OP_ALT);
3310 ecode=prev+3; goto match_loop;
3311 }
3312 else /* OP_KETRMAX */
3313 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003314 const uschar *ptr;
3315 /*int points_pushed=0;*/
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003316
3317 /* Push one failure point, that will resume matching at the code after
3318 the KETRMAX opcode. */
Guido van Rossum50700601997-12-08 17:15:20 +00003319 if (md->length == md->point)
3320 {
3321 grow_stack(md);
3322 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003323 md->offset_top[md->point] = offset_top;
3324 md->eptr[md->point] = eptr;
3325 md->ecode[md->point] = ecode+3;
3326 md->r1[md->point] = md->offset_vector[number];
3327 md->r2[md->point] = md->offset_vector[number+1];
3328 md->off_num[md->point] = number;
3329 md->point++;
3330
3331 md->offset_vector[number] = eptr - md->start_subject;
3332 /* Handle alternation inside the BRA...KET; push each of the
Guido van Rossum58132c61997-12-17 00:24:13 +00003333 additional alternatives onto the stack */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003334 ptr=prev;
3335 do {
3336 ptr += (ptr[1]<<8)+ ptr[2];
3337 if (*ptr==OP_ALT)
3338 {
Guido van Rossum50700601997-12-08 17:15:20 +00003339 if (md->length == md->point)
3340 if (md->length == md->point)
3341 {
3342 grow_stack(md);
3343 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003344 md->offset_top[md->point] = offset_top;
3345 md->eptr[md->point] = eptr;
3346 md->ecode[md->point] = ptr+3;
3347 md->r1[md->point] = 0;
3348 md->r2[md->point] = 0;
3349 md->off_num[md->point] = 0;
3350 md->point++;
Guido van Rossum58132c61997-12-17 00:24:13 +00003351 /*points_pushed++;*/
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003352 }
3353 } while (*ptr==OP_ALT);
3354 /* Jump to the first (or only) alternative and resume trying to match */
3355 ecode=prev+3; goto match_loop;
3356 }
3357 }
Guido van Rossum58132c61997-12-17 00:24:13 +00003358 break;
3359
Guido van Rossum50700601997-12-08 17:15:20 +00003360 /* Start of subject unless notbol, or after internal newline if multiline */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003361
3362 case OP_CIRC:
Guido van Rossum50700601997-12-08 17:15:20 +00003363 if (md->notbol && eptr == md->start_subject) FAIL;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003364 if (md->multiline)
3365 {
3366 if (eptr != md->start_subject && eptr[-1] != '\n') FAIL;
3367 ecode++;
3368 break;
3369 }
3370 /* ... else fall through */
3371
3372 /* Start of subject assertion */
3373
3374 case OP_SOD:
3375 if (eptr != md->start_subject) FAIL;
3376 ecode++;
3377 break;
3378
Guido van Rossum50700601997-12-08 17:15:20 +00003379 /* Assert before internal newline if multiline, or before
3380 a terminating newline unless endonly is set, else end of subject unless
3381 noteol is set. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003382
3383 case OP_DOLL:
Guido van Rossum50700601997-12-08 17:15:20 +00003384 if (md->noteol && eptr >= md->end_subject) FAIL;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003385 if (md->multiline)
3386 {
3387 if (eptr < md->end_subject && *eptr != '\n') FAIL;
3388 ecode++;
3389 break;
3390 }
Guido van Rossum50700601997-12-08 17:15:20 +00003391 else if (!md->endonly)
3392 {
3393 if (eptr < md->end_subject - 1 ||
3394 (eptr == md->end_subject - 1 && *eptr != '\n')) FAIL;
3395 ecode++;
3396 break;
3397 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003398 /* ... else fall through */
3399
3400 /* End of subject assertion */
3401
3402 case OP_EOD:
3403 if (eptr < md->end_subject) FAIL;
3404 ecode++;
3405 break;
3406
3407 /* Word boundary assertions */
3408
3409 case OP_NOT_WORD_BOUNDARY:
3410 case OP_WORD_BOUNDARY:
3411 {
3412 BOOL prev_is_word = (eptr != md->start_subject) &&
3413 ((pcre_ctypes[eptr[-1]] & ctype_word) != 0);
3414 BOOL cur_is_word = (eptr < md->end_subject) &&
3415 ((pcre_ctypes[*eptr] & ctype_word) != 0);
3416 if ((*ecode++ == OP_WORD_BOUNDARY)?
3417 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3418 FAIL;
3419 }
3420 break;
3421
Guido van Rossum50700601997-12-08 17:15:20 +00003422 case OP_NOT_WORD_BOUNDARY_L:
3423 case OP_WORD_BOUNDARY_L:
3424 {
3425 BOOL prev_is_word = (eptr != md->start_subject) &&
Guido van Rossum58132c61997-12-17 00:24:13 +00003426 (isalnum(eptr[-1]) || eptr[-1]=='_');
Guido van Rossum50700601997-12-08 17:15:20 +00003427 BOOL cur_is_word = (eptr < md->end_subject) &&
Guido van Rossum58132c61997-12-17 00:24:13 +00003428 (isalnum(*eptr) || *eptr=='_');
Guido van Rossum50700601997-12-08 17:15:20 +00003429 if ((*ecode++ == OP_WORD_BOUNDARY_L)?
3430 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3431 FAIL;
3432 }
3433 break;
3434
3435
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003436 /* Match a single character type; inline for speed */
3437
3438 case OP_ANY:
3439 if (!md->dotall && eptr < md->end_subject && *eptr == '\n') FAIL;
3440 if (eptr++ >= md->end_subject) FAIL;
3441 ecode++;
3442 break;
3443
3444 case OP_NOT_DIGIT:
3445 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) != 0)
3446 FAIL;
3447 ecode++;
3448 break;
3449
3450 case OP_DIGIT:
3451 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) == 0)
3452 FAIL;
3453 ecode++;
3454 break;
3455
3456 case OP_NOT_WHITESPACE:
3457 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) != 0)
3458 FAIL;
3459 ecode++;
3460 break;
3461
3462 case OP_WHITESPACE:
3463 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) == 0)
3464 FAIL;
3465 ecode++;
3466 break;
3467
3468 case OP_NOT_WORDCHAR:
3469 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) != 0)
3470 FAIL;
3471 ecode++;
3472 break;
3473
3474 case OP_WORDCHAR:
3475 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) == 0)
3476 FAIL;
3477 ecode++;
3478 break;
3479
Guido van Rossum50700601997-12-08 17:15:20 +00003480 case OP_NOT_WORDCHAR_L:
Guido van Rossum58132c61997-12-17 00:24:13 +00003481 if (eptr >= md->end_subject || (*eptr=='_' || isalnum(*eptr) ))
Guido van Rossum557dea11997-12-22 22:46:52 +00003482 FAIL;
Guido van Rossum50700601997-12-08 17:15:20 +00003483 eptr++;
3484 ecode++;
3485 break;
3486
3487 case OP_WORDCHAR_L:
Guido van Rossum58132c61997-12-17 00:24:13 +00003488 if (eptr >= md->end_subject || (*eptr!='_' && !isalnum(*eptr) ))
Guido van Rossum557dea11997-12-22 22:46:52 +00003489 FAIL;
Guido van Rossum50700601997-12-08 17:15:20 +00003490 eptr++;
3491 ecode++;
3492 break;
3493
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003494 /* Match a back reference, possibly repeatedly. Look past the end of the
3495 item to see if there is repeat information following. The code is similar
3496 to that for character classes, but repeated for efficiency. Then obey
3497 similar code to character type repeats - written out again for speed.
3498 However, if the referenced string is the empty string, always treat
3499 it as matched, any number of times (otherwise there could be infinite
3500 loops). */
3501
3502 case OP_REF:
3503 {
3504 int length;
3505 int number = ecode[1] << 1; /* Doubled reference number */
3506 ecode += 2; /* Advance past the item */
3507
3508 if (number >= offset_top || md->offset_vector[number] < 0)
3509 {
3510 md->errorcode = PCRE_ERROR_BADREF;
3511 FAIL;
3512 }
3513
3514 length = md->offset_vector[number+1] - md->offset_vector[number];
3515
3516 switch (*ecode)
3517 {
3518 case OP_CRSTAR:
3519 case OP_CRMINSTAR:
3520 case OP_CRPLUS:
3521 case OP_CRMINPLUS:
3522 case OP_CRQUERY:
3523 case OP_CRMINQUERY:
3524 c = *ecode++ - OP_CRSTAR;
3525 minimize = (c & 1) != 0;
3526 min = rep_min[c]; /* Pick up values from tables; */
3527 max = rep_max[c]; /* zero for max => infinity */
3528 if (max == 0) max = INT_MAX;
3529 break;
3530
3531 case OP_CRRANGE:
3532 case OP_CRMINRANGE:
3533 minimize = (*ecode == OP_CRMINRANGE);
3534 min = (ecode[1] << 8) + ecode[2];
3535 max = (ecode[3] << 8) + ecode[4];
3536 if (max == 0) max = INT_MAX;
3537 ecode += 5;
3538 break;
3539
3540 default: /* No repeat follows */
3541 if (!match_ref(number, eptr, length, md)) FAIL;
3542 eptr += length;
3543 continue; /* With the main loop */
3544 }
3545
3546 /* If the length of the reference is zero, just continue with the
3547 main loop. */
3548
3549 if (length == 0) continue;
3550
3551 /* First, ensure the minimum number of matches are present. We get back
3552 the length of the reference string explicitly rather than passing the
3553 address of eptr, so that eptr can be a register variable. */
3554
3555 for (i = 1; i <= min; i++)
3556 {
3557 if (!match_ref(number, eptr, length, md)) FAIL;
3558 eptr += length;
3559 }
3560
3561 /* If min = max, continue at the same level without recursion.
3562 They are not both allowed to be zero. */
3563
3564 if (min == max) continue;
3565
3566 /* If minimizing, keep trying and advancing the pointer */
3567
3568 if (minimize)
3569 {
3570 for (i = min;; i++)
3571 {
3572 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3573 if (i >= max || !match_ref(number, eptr, length, md))
3574 FAIL;
3575 eptr += length;
3576 }
3577 /* Control never gets here */
3578 }
3579
3580 /* If maximizing, find the longest string and work backwards */
3581
3582 else
3583 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003584 const uschar *pp = eptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003585 for (i = min; i < max; i++)
3586 {
3587 if (!match_ref(number, eptr, length, md)) break;
3588 eptr += length;
3589 }
3590 while (eptr >= pp)
3591 {
3592 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3593 eptr -= length;
3594 }
3595 FAIL;
3596 }
3597 }
3598 /* Control never gets here */
3599
3600 /* Match a character class, possibly repeatedly. Look past the end of the
3601 item to see if there is repeat information following. Then obey similar
Guido van Rossum50700601997-12-08 17:15:20 +00003602 code to character type repeats - written out again for speed. If caseless
3603 matching was set at runtime but not at compile time, we have to check both
3604 versions of a character. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003605
3606 case OP_CLASS:
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003607 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003608 const uschar *data = ecode + 1; /* Save for matching */
3609 ecode += 33; /* Advance past the item */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003610
3611 switch (*ecode)
3612 {
3613 case OP_CRSTAR:
3614 case OP_CRMINSTAR:
3615 case OP_CRPLUS:
3616 case OP_CRMINPLUS:
3617 case OP_CRQUERY:
3618 case OP_CRMINQUERY:
3619 c = *ecode++ - OP_CRSTAR;
3620 minimize = (c & 1) != 0;
3621 min = rep_min[c]; /* Pick up values from tables; */
3622 max = rep_max[c]; /* zero for max => infinity */
3623 if (max == 0) max = INT_MAX;
3624 break;
3625
3626 case OP_CRRANGE:
3627 case OP_CRMINRANGE:
3628 minimize = (*ecode == OP_CRMINRANGE);
3629 min = (ecode[1] << 8) + ecode[2];
3630 max = (ecode[3] << 8) + ecode[4];
3631 if (max == 0) max = INT_MAX;
3632 ecode += 5;
3633 break;
3634
3635 default: /* No repeat follows */
Guido van Rossum50700601997-12-08 17:15:20 +00003636 if (eptr >= md->end_subject) FAIL;
3637 c = *eptr++;
3638 if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */
3639 if (md->runtime_caseless)
3640 {
3641 c = pcre_fcc[c];
3642 if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */
3643 }
3644 FAIL;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003645 }
3646
3647 /* First, ensure the minimum number of matches are present. */
3648
3649 for (i = 1; i <= min; i++)
Guido van Rossum50700601997-12-08 17:15:20 +00003650 {
3651 if (eptr >= md->end_subject) FAIL;
3652 c = *eptr++;
3653 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3654 if (md->runtime_caseless)
3655 {
3656 c = pcre_fcc[c];
3657 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3658 }
3659 FAIL;
3660 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003661
3662 /* If max == min we can continue with the main loop without the
3663 need to recurse. */
3664
3665 if (min == max) continue;
3666
3667 /* If minimizing, keep testing the rest of the expression and advancing
3668 the pointer while it matches the class. */
3669
3670 if (minimize)
3671 {
3672 for (i = min;; i++)
3673 {
3674 if (match(eptr, ecode, offset_top, md)) SUCCEED;
Guido van Rossum50700601997-12-08 17:15:20 +00003675 if (i >= max || eptr >= md->end_subject) FAIL;
3676 c = *eptr++;
3677 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3678 if (md->runtime_caseless)
3679 {
3680 c = pcre_fcc[c];
3681 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3682 }
3683 FAIL;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003684 }
3685 /* Control never gets here */
3686 }
3687
3688 /* If maximizing, find the longest possible run, then work backwards. */
3689
3690 else
3691 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003692 const uschar *pp = eptr;
Guido van Rossum50700601997-12-08 17:15:20 +00003693 for (i = min; i < max; eptr++, i++)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003694 {
Guido van Rossum50700601997-12-08 17:15:20 +00003695 if (eptr >= md->end_subject) break;
3696 c = *eptr;
3697 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3698 if (md->runtime_caseless)
3699 {
3700 c = pcre_fcc[c];
3701 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3702 }
3703 break;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003704 }
Guido van Rossum50700601997-12-08 17:15:20 +00003705
3706 while (eptr >= pp)
3707 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
3708 FAIL;
3709 }
3710 }
3711 /* Control never gets here */
3712
3713 /* OP_CLASS_L opcode: handles localized character classes */
3714
3715 case OP_CLASS_L:
3716 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003717 const uschar *data = ecode + 1; /* Save for matching */
3718 const uschar locale_flag = *data;
Guido van Rossum50700601997-12-08 17:15:20 +00003719 ecode++; data++; /* The localization support adds an extra byte */
3720
3721 ecode += 33; /* Advance past the item */
3722
3723 switch (*ecode)
3724 {
3725 case OP_CRSTAR:
3726 case OP_CRMINSTAR:
3727 case OP_CRPLUS:
3728 case OP_CRMINPLUS:
3729 case OP_CRQUERY:
3730 case OP_CRMINQUERY:
3731 c = *ecode++ - OP_CRSTAR;
3732 minimize = (c & 1) != 0;
3733 min = rep_min[c]; /* Pick up values from tables; */
3734 max = rep_max[c]; /* zero for max => infinity */
3735 if (max == 0) max = INT_MAX;
3736 break;
3737
3738 case OP_CRRANGE:
3739 case OP_CRMINRANGE:
3740 minimize = (*ecode == OP_CRMINRANGE);
3741 min = (ecode[1] << 8) + ecode[2];
3742 max = (ecode[3] << 8) + ecode[4];
3743 if (max == 0) max = INT_MAX;
3744 ecode += 5;
3745 break;
3746
3747 default: /* No repeat follows */
3748 if (eptr >= md->end_subject) FAIL;
3749 c = *eptr++;
3750 if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */
Guido van Rossum58132c61997-12-17 00:24:13 +00003751 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3752 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003753#if 0
3754 if ( (locale_flag & 4) && isdigit(c) ) continue; /* Locale \d */
3755 if ( (locale_flag & 8) && !isdigit(c) ) continue; /* Locale \D */
3756 if ( (locale_flag & 16) && isspace(c) ) continue; /* Locale \s */
3757 if ( (locale_flag & 32) && !isspace(c) ) continue; /* Locale \S */
3758#endif
3759
3760 if (md->runtime_caseless)
3761 {
3762 c = pcre_fcc[c];
3763 if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */
3764
Guido van Rossum58132c61997-12-17 00:24:13 +00003765 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3766 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003767 }
3768 FAIL;
3769 }
3770
3771 /* First, ensure the minimum number of matches are present. */
3772
3773 for (i = 1; i <= min; i++)
3774 {
3775 if (eptr >= md->end_subject) FAIL;
3776 c = *eptr++;
3777 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum58132c61997-12-17 00:24:13 +00003778 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3779 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003780
3781 if (md->runtime_caseless)
3782 {
3783 c = pcre_fcc[c];
3784 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum58132c61997-12-17 00:24:13 +00003785 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3786 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003787 }
3788 FAIL;
3789 }
3790
3791 /* If max == min we can continue with the main loop without the
3792 need to recurse. */
3793
3794 if (min == max) continue;
3795
3796 /* If minimizing, keep testing the rest of the expression and advancing
3797 the pointer while it matches the class. */
3798
3799 if (minimize)
3800 {
3801 for (i = min;; i++)
3802 {
3803 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3804 if (i >= max || eptr >= md->end_subject) FAIL;
3805 c = *eptr++;
3806 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum58132c61997-12-17 00:24:13 +00003807 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3808 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003809
3810 if (md->runtime_caseless)
3811 {
3812 c = pcre_fcc[c];
3813 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum58132c61997-12-17 00:24:13 +00003814 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3815 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003816 }
3817 FAIL;
3818 }
3819 /* Control never gets here */
3820 }
3821
3822 /* If maximizing, find the longest possible run, then work backwards. */
3823
3824 else
3825 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003826 const uschar *pp = eptr;
Guido van Rossum50700601997-12-08 17:15:20 +00003827 for (i = min; i < max; eptr++, i++)
3828 {
3829 if (eptr >= md->end_subject) break;
3830 c = *eptr;
3831 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum58132c61997-12-17 00:24:13 +00003832 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3833 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003834 if (md->runtime_caseless)
3835 {
3836 c = pcre_fcc[c];
3837 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum58132c61997-12-17 00:24:13 +00003838 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3839 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003840 }
3841 break;
3842 }
3843
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003844 while (eptr >= pp)
3845 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
3846 FAIL;
3847 }
3848 }
3849 /* Control never gets here */
3850
3851 /* Match a run of characters */
3852
3853 case OP_CHARS:
3854 {
3855 register int length = ecode[1];
3856 ecode += 2;
3857
Guido van Rossum557dea11997-12-22 22:46:52 +00003858#ifdef DEBUG /* Sigh. Some compilers never learn. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003859 if (eptr >= md->end_subject)
3860 printf("matching subject <null> against pattern ");
3861 else
3862 {
3863 printf("matching subject ");
3864 pchars(eptr, length, TRUE, md);
3865 printf(" against pattern ");
3866 }
3867 pchars(ecode, length, FALSE, md);
3868 printf("\n");
Guido van Rossum57ba4f31997-12-02 20:40:28 +00003869#endif
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003870
3871 if (length > md->end_subject - eptr) FAIL;
3872 if (md->caseless)
3873 {
3874 while (length-- > 0) if (pcre_lcc[*ecode++] != pcre_lcc[*eptr++]) FAIL;
3875 }
3876 else
3877 {
3878 while (length-- > 0) if (*ecode++ != *eptr++) FAIL;
3879 }
3880 }
3881 break;
3882
3883 /* Match a single character repeatedly; different opcodes share code. */
3884
3885 case OP_EXACT:
3886 min = max = (ecode[1] << 8) + ecode[2];
3887 ecode += 3;
3888 goto REPEATCHAR;
3889
3890 case OP_UPTO:
3891 case OP_MINUPTO:
3892 min = 0;
3893 max = (ecode[1] << 8) + ecode[2];
3894 minimize = *ecode == OP_MINUPTO;
3895 ecode += 3;
3896 goto REPEATCHAR;
3897
3898 case OP_STAR:
3899 case OP_MINSTAR:
3900 case OP_PLUS:
3901 case OP_MINPLUS:
3902 case OP_QUERY:
3903 case OP_MINQUERY:
3904 c = *ecode++ - OP_STAR;
3905 minimize = (c & 1) != 0;
3906 min = rep_min[c]; /* Pick up values from tables; */
3907 max = rep_max[c]; /* zero for max => infinity */
3908 if (max == 0) max = INT_MAX;
3909
3910 /* Common code for all repeated single-character matches. We can give
3911 up quickly if there are fewer than the minimum number of characters left in
3912 the subject. */
3913
3914 REPEATCHAR:
3915 if (min > md->end_subject - eptr) FAIL;
3916 c = *ecode++;
3917
3918 /* The code is duplicated for the caseless and caseful cases, for speed,
3919 since matching characters is likely to be quite common. First, ensure the
3920 minimum number of matches are present. If min = max, continue at the same
3921 level without recursing. Otherwise, if minimizing, keep trying the rest of
3922 the expression and advancing one matching character if failing, up to the
3923 maximum. Alternatively, if maximizing, find the maximum number of
3924 characters and work backwards. */
3925
Guido van Rossum557dea11997-12-22 22:46:52 +00003926 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
3927 max, eptr));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003928
3929 if (md->caseless)
3930 {
3931 c = pcre_lcc[c];
3932 for (i = 1; i <= min; i++) if (c != pcre_lcc[*eptr++]) FAIL;
3933 if (min == max) continue;
3934 if (minimize)
3935 {
3936 for (i = min;; i++)
3937 {
3938 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3939 if (i >= max || eptr >= md->end_subject || c != pcre_lcc[*eptr++])
3940 FAIL;
3941 }
3942 /* Control never gets here */
3943 }
3944 else
3945 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003946 const uschar *pp = eptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003947 for (i = min; i < max; i++)
3948 {
3949 if (eptr >= md->end_subject || c != pcre_lcc[*eptr]) break;
3950 eptr++;
3951 }
3952 while (eptr >= pp)
3953 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
3954 FAIL;
3955 }
Guido van Rossum50700601997-12-08 17:15:20 +00003956 /* Control never gets here */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003957 }
3958
3959 /* Caseful comparisons */
3960
3961 else
3962 {
3963 for (i = 1; i <= min; i++) if (c != *eptr++) FAIL;
3964 if (min == max) continue;
3965 if (minimize)
3966 {
3967 for (i = min;; i++)
3968 {
3969 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3970 if (i >= max || eptr >= md->end_subject || c != *eptr++) FAIL;
3971 }
3972 /* Control never gets here */
3973 }
3974 else
3975 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003976 const uschar *pp = eptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003977 for (i = min; i < max; i++)
3978 {
3979 if (eptr >= md->end_subject || c != *eptr) break;
3980 eptr++;
3981 }
3982 while (eptr >= pp)
3983 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
3984 FAIL;
3985 }
3986 }
3987 /* Control never gets here */
3988
Guido van Rossum50700601997-12-08 17:15:20 +00003989 /* Match a negated single character */
3990
3991 case OP_NOT:
Guido van Rossum557dea11997-12-22 22:46:52 +00003992 if (eptr >= md->end_subject) FAIL;
Guido van Rossum50700601997-12-08 17:15:20 +00003993 ecode++;
3994 if (md->caseless)
3995 {
3996 if (pcre_lcc[*ecode++] == pcre_lcc[*eptr++]) FAIL;
3997 }
3998 else
3999 {
4000 if (*ecode++ == *eptr++) FAIL;
4001 }
4002 break;
4003
4004 /* Match a negated single character repeatedly. This is almost a repeat of
4005 the code for a repeated single character, but I haven't found a nice way of
4006 commoning these up that doesn't require a test of the positive/negative
4007 option for each character match. Maybe that wouldn't add very much to the
4008 time taken, but character matching *is* what this is all about... */
4009
4010 case OP_NOTEXACT:
4011 min = max = (ecode[1] << 8) + ecode[2];
4012 ecode += 3;
4013 goto REPEATNOTCHAR;
4014
4015 case OP_NOTUPTO:
4016 case OP_NOTMINUPTO:
4017 min = 0;
4018 max = (ecode[1] << 8) + ecode[2];
4019 minimize = *ecode == OP_NOTMINUPTO;
4020 ecode += 3;
4021 goto REPEATNOTCHAR;
4022
4023 case OP_NOTSTAR:
4024 case OP_NOTMINSTAR:
4025 case OP_NOTPLUS:
4026 case OP_NOTMINPLUS:
4027 case OP_NOTQUERY:
4028 case OP_NOTMINQUERY:
4029 c = *ecode++ - OP_NOTSTAR;
4030 minimize = (c & 1) != 0;
4031 min = rep_min[c]; /* Pick up values from tables; */
4032 max = rep_max[c]; /* zero for max => infinity */
4033 if (max == 0) max = INT_MAX;
4034
4035 /* Common code for all repeated single-character matches. We can give
4036 up quickly if there are fewer than the minimum number of characters left in
4037 the subject. */
4038
4039 REPEATNOTCHAR:
4040 if (min > md->end_subject - eptr) FAIL;
4041 c = *ecode++;
4042
4043 /* The code is duplicated for the caseless and caseful cases, for speed,
4044 since matching characters is likely to be quite common. First, ensure the
4045 minimum number of matches are present. If min = max, continue at the same
4046 level without recursing. Otherwise, if minimizing, keep trying the rest of
4047 the expression and advancing one matching character if failing, up to the
4048 maximum. Alternatively, if maximizing, find the maximum number of
4049 characters and work backwards. */
4050
Guido van Rossum557dea11997-12-22 22:46:52 +00004051 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
4052 max, eptr));
Guido van Rossum50700601997-12-08 17:15:20 +00004053
4054 if (md->caseless)
4055 {
4056 c = pcre_lcc[c];
4057 for (i = 1; i <= min; i++) if (c == pcre_lcc[*eptr++]) FAIL;
4058 if (min == max) continue;
4059 if (minimize)
4060 {
4061 for (i = min;; i++)
4062 {
4063 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4064 if (i >= max || eptr >= md->end_subject || c == pcre_lcc[*eptr++])
4065 FAIL;
4066 }
4067 /* Control never gets here */
4068 }
4069 else
4070 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004071 const uschar *pp = eptr;
Guido van Rossum50700601997-12-08 17:15:20 +00004072 for (i = min; i < max; i++)
4073 {
4074 if (eptr >= md->end_subject || c == pcre_lcc[*eptr]) break;
4075 eptr++;
4076 }
4077 while (eptr >= pp)
4078 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4079 FAIL;
4080 }
4081 /* Control never gets here */
4082 }
4083
4084 /* Caseful comparisons */
4085
4086 else
4087 {
4088 for (i = 1; i <= min; i++) if (c == *eptr++) FAIL;
4089 if (min == max) continue;
4090 if (minimize)
4091 {
4092 for (i = min;; i++)
4093 {
4094 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4095 if (i >= max || eptr >= md->end_subject || c == *eptr++) FAIL;
4096 }
4097 /* Control never gets here */
4098 }
4099 else
4100 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004101 const uschar *pp = eptr;
Guido van Rossum50700601997-12-08 17:15:20 +00004102 for (i = min; i < max; i++)
4103 {
4104 if (eptr >= md->end_subject || c == *eptr) break;
4105 eptr++;
4106 }
4107 while (eptr >= pp)
4108 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4109 FAIL;
4110 }
4111 }
4112 /* Control never gets here */
4113
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004114 /* Match a single character type repeatedly; several different opcodes
4115 share code. This is very similar to the code for single characters, but we
4116 repeat it in the interests of efficiency. */
4117
4118 case OP_TYPEEXACT:
4119 min = max = (ecode[1] << 8) + ecode[2];
4120 minimize = TRUE;
4121 ecode += 3;
4122 goto REPEATTYPE;
4123
4124 case OP_TYPEUPTO:
4125 case OP_TYPEMINUPTO:
4126 min = 0;
4127 max = (ecode[1] << 8) + ecode[2];
4128 minimize = *ecode == OP_TYPEMINUPTO;
4129 ecode += 3;
4130 goto REPEATTYPE;
4131
4132 case OP_TYPESTAR:
4133 case OP_TYPEMINSTAR:
4134 case OP_TYPEPLUS:
4135 case OP_TYPEMINPLUS:
4136 case OP_TYPEQUERY:
4137 case OP_TYPEMINQUERY:
4138 c = *ecode++ - OP_TYPESTAR;
4139 minimize = (c & 1) != 0;
4140 min = rep_min[c]; /* Pick up values from tables; */
4141 max = rep_max[c]; /* zero for max => infinity */
4142 if (max == 0) max = INT_MAX;
4143
4144 /* Common code for all repeated single character type matches */
4145
4146 REPEATTYPE:
4147 ctype = *ecode++; /* Code for the character type */
4148
4149 /* First, ensure the minimum number of matches are present. Use inline
4150 code for maximizing the speed, and do the type test once at the start
4151 (i.e. keep it out of the loop). Also test that there are at least the
4152 minimum number of characters before we start. */
4153
4154 if (min > md->end_subject - eptr) FAIL;
4155 if (min > 0) switch(ctype)
4156 {
4157 case OP_ANY:
4158 if (!md->dotall)
4159 { for (i = 1; i <= min; i++) if (*eptr++ == '\n') FAIL; }
4160 else eptr += min;
4161 break;
4162
4163 case OP_NOT_DIGIT:
4164 for (i = 1; i <= min; i++)
4165 if ((pcre_ctypes[*eptr++] & ctype_digit) != 0) FAIL;
4166 break;
4167
4168 case OP_DIGIT:
4169 for (i = 1; i <= min; i++)
4170 if ((pcre_ctypes[*eptr++] & ctype_digit) == 0) FAIL;
4171 break;
4172
4173 case OP_NOT_WHITESPACE:
4174 for (i = 1; i <= min; i++)
4175 if ((pcre_ctypes[*eptr++] & ctype_space) != 0) FAIL;
4176 break;
4177
4178 case OP_WHITESPACE:
4179 for (i = 1; i <= min; i++)
4180 if ((pcre_ctypes[*eptr++] & ctype_space) == 0) FAIL;
4181 break;
4182
4183 case OP_NOT_WORDCHAR:
4184 for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) != 0)
4185 FAIL;
4186 break;
4187
4188 case OP_WORDCHAR:
4189 for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) == 0)
4190 FAIL;
4191 break;
Guido van Rossum50700601997-12-08 17:15:20 +00004192
4193 case OP_NOT_WORDCHAR_L:
Guido van Rossum58132c61997-12-17 00:24:13 +00004194 for (i = 1; i <= min; i++, eptr++) if (*eptr=='_' || isalnum(*eptr))
Guido van Rossum557dea11997-12-22 22:46:52 +00004195 FAIL;
Guido van Rossum50700601997-12-08 17:15:20 +00004196 break;
4197
4198 case OP_WORDCHAR_L:
Guido van Rossum58132c61997-12-17 00:24:13 +00004199 for (i = 1; i <= min; i++, eptr++) if (*eptr!='_' && !isalnum(*eptr))
Guido van Rossum557dea11997-12-22 22:46:52 +00004200 FAIL;
Guido van Rossum50700601997-12-08 17:15:20 +00004201 break;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004202 }
4203
4204 /* If min = max, continue at the same level without recursing */
4205
4206 if (min == max) continue;
4207
4208 /* If minimizing, we have to test the rest of the pattern before each
4209 subsequent match, so inlining isn't much help; just use the function. */
4210
4211 if (minimize)
4212 {
4213 for (i = min;; i++)
4214 {
4215 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4216 if (i >= max || eptr >= md->end_subject ||
4217 !match_type(ctype, *eptr++, md->dotall))
4218 FAIL;
4219 }
4220 /* Control never gets here */
4221 }
4222
4223 /* If maximizing it is worth using inline code for speed, doing the type
4224 test once at the start (i.e. keep it out of the loop). */
4225
4226 else
4227 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004228 const uschar *pp = eptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004229 switch(ctype)
4230 {
4231 case OP_ANY:
4232 if (!md->dotall)
4233 {
4234 for (i = min; i < max; i++)
4235 {
4236 if (eptr >= md->end_subject || *eptr == '\n') break;
4237 eptr++;
4238 }
4239 }
4240 else
4241 {
4242 c = max - min;
4243 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
4244 eptr += c;
4245 }
4246 break;
4247
4248 case OP_NOT_DIGIT:
4249 for (i = min; i < max; i++)
4250 {
4251 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) != 0)
4252 break;
4253 eptr++;
4254 }
4255 break;
4256
4257 case OP_DIGIT:
4258 for (i = min; i < max; i++)
4259 {
4260 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) == 0)
4261 break;
4262 eptr++;
4263 }
4264 break;
4265
4266 case OP_NOT_WHITESPACE:
4267 for (i = min; i < max; i++)
4268 {
4269 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) != 0)
4270 break;
4271 eptr++;
4272 }
4273 break;
4274
4275 case OP_WHITESPACE:
4276 for (i = min; i < max; i++)
4277 {
4278 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) == 0)
4279 break;
4280 eptr++;
4281 }
4282 break;
4283
4284 case OP_NOT_WORDCHAR:
4285 for (i = min; i < max; i++)
4286 {
4287 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) != 0)
4288 break;
4289 eptr++;
4290 }
4291 break;
4292
4293 case OP_WORDCHAR:
4294 for (i = min; i < max; i++)
4295 {
Guido van Rossum50700601997-12-08 17:15:20 +00004296 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) == 0)
4297 break;
4298 eptr++;
4299 }
4300 break;
4301 case OP_NOT_WORDCHAR_L:
4302 for (i = min; i < max; i++)
4303 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004304 if (eptr >= md->end_subject || (*eptr=='_' || isalnum(*eptr) ) )
Guido van Rossum50700601997-12-08 17:15:20 +00004305 break;
4306 eptr++;
4307 }
4308 break;
4309
4310 case OP_WORDCHAR_L:
4311 for (i = min; i < max; i++)
4312 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004313 if (eptr >= md->end_subject || (*eptr!='_' && !isalnum(*eptr) ) )
Guido van Rossum50700601997-12-08 17:15:20 +00004314 break;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004315 eptr++;
4316 }
4317 break;
4318 }
4319
4320 while (eptr >= pp)
4321 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4322 FAIL;
4323 }
4324 /* Control never gets here */
4325
4326 /* There's been some horrible disaster. */
4327
4328 default:
Guido van Rossum557dea11997-12-22 22:46:52 +00004329 DPRINTF(("Unknown opcode %d\n", *ecode));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004330 md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
4331 FAIL;
4332 }
4333
4334 /* Do not stick any code in here without much thought; it is assumed
4335 that "continue" in the code above comes out to here to repeat the main
4336 loop. */
4337
4338 } /* End of main loop */
4339/* Control never reaches here */
4340
4341fail:
4342 if (md->point > save_stack_position)
4343 {
4344 /* If there are still points remaining on the stack, pop the next one off */
Guido van Rossumc3861071997-10-08 02:07:40 +00004345 int off_num;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004346
4347 md->point--;
4348 offset_top = md->offset_top[md->point];
4349 eptr = md->eptr[md->point];
4350 ecode = md->ecode[md->point];
4351 off_num = md->off_num[md->point];
4352 md->offset_vector[off_num] = md->r1[md->point];
4353 md->offset_vector[off_num+1] = md->r2[md->point];
4354 goto match_loop;
4355 }
4356 /* Failure, and nothing left on the stack, so end this function call */
4357
4358 /* Restore the top of the stack to where it was before this function
4359 call. This lets us use one stack for everything; recursive calls
4360 can push and pop information, and may increase the stack. When
4361 the call returns, the parent function can resume pushing and
4362 popping wherever it was. */
4363
4364 md->point = save_stack_position;
4365 return FALSE;
4366
4367succeed:
4368 return TRUE;
4369}
4370
4371
Guido van Rossum50700601997-12-08 17:15:20 +00004372
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004373/*************************************************
Guido van Rossum557dea11997-12-22 22:46:52 +00004374* Segregate setjmp() *
4375*************************************************/
4376
4377/* The -Wall option of gcc gives warnings for all local variables when setjmp()
4378is used, even if the coding conforms to the rules of ANSI C. To avoid this, we
4379hide it in a separate function. This is called only when PCRE_EXTRA is set,
4380since it's needed only for the extension \X option, and with any luck, a good
4381compiler will spot the tail recursion and compile it efficiently.
4382
4383Arguments:
4384 eptr pointer in subject
4385 ecode position in code
4386 offset_top current top pointer
4387 md pointer to "static" info for the match
4388
4389Returns: TRUE if matched
4390*/
4391
4392static BOOL
4393match_with_setjmp(const uschar *eptr, const uschar *ecode, int offset_top,
4394 match_data *match_block)
4395{
4396return setjmp(match_block->fail_env) == 0 &&
4397 match(eptr, ecode, offset_top, match_block);
4398}
4399
4400
4401
4402/*************************************************
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004403* Execute a Regular Expression *
4404*************************************************/
4405
4406/* This function applies a compiled re to a subject string and picks out
4407portions of the string if it matches. Two elements in the vector are set for
4408each substring: the offsets to the start and end of the substring.
4409
4410Arguments:
Guido van Rossum50700601997-12-08 17:15:20 +00004411 external_re points to the compiled expression
4412 external_extra points to "hints" from pcre_study() or is NULL
4413 subject points to the subject string
4414 length length of subject string (may contain binary zeros)
4415 options option bits
4416 offsets points to a vector of ints to be filled in with offsets
4417 offsetcount the number of elements in the vector
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004418
Guido van Rossum50700601997-12-08 17:15:20 +00004419Returns: > 0 => success; value is the number of elements filled in
4420 = 0 => success, but offsets is not big enough
4421 -1 => failed to match
4422 < -1 => some kind of unexpected problem
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004423*/
4424
4425int
Guido van Rossum50700601997-12-08 17:15:20 +00004426pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
4427 const char *subject, int length, int options, int *offsets, int offsetcount)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004428{
Guido van Rossum58132c61997-12-17 00:24:13 +00004429 /* The "volatile" directives are to make gcc -Wall stop complaining
4430 that these variables can be clobbered by the longjmp. Hopefully
4431 they won't cost too much performance. */
Guido van Rossum557dea11997-12-22 22:46:52 +00004432int resetcount, ocount;
4433int first_char = -1;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004434match_data match_block;
Guido van Rossum557dea11997-12-22 22:46:52 +00004435const uschar *start_bits = NULL;
4436const uschar *start_match = (const uschar *)subject;
Guido van Rossum58132c61997-12-17 00:24:13 +00004437const uschar *end_subject;
4438const real_pcre *re = (const real_pcre *)external_re;
4439const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
Guido van Rossum557dea11997-12-22 22:46:52 +00004440BOOL using_temporary_offsets = FALSE;
4441BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4442BOOL startline = (re->options & PCRE_STARTLINE) != 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004443
4444if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4445
4446if (re == NULL || subject == NULL ||
4447 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4448if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
4449
Guido van Rossum58132c61997-12-17 00:24:13 +00004450match_block.start_subject = (const uschar *)subject;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004451match_block.end_subject = match_block.start_subject + length;
4452end_subject = match_block.end_subject;
4453
Guido van Rossum50700601997-12-08 17:15:20 +00004454match_block.caseless = ((re->options | options) & PCRE_CASELESS) != 0;
4455match_block.runtime_caseless = match_block.caseless &&
4456 (re->options & PCRE_CASELESS) == 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004457
Guido van Rossum50700601997-12-08 17:15:20 +00004458match_block.multiline = ((re->options | options) & PCRE_MULTILINE) != 0;
4459match_block.dotall = ((re->options | options) & PCRE_DOTALL) != 0;
4460match_block.endonly = ((re->options | options) & PCRE_DOLLAR_ENDONLY) != 0;
4461
4462match_block.notbol = (options & PCRE_NOTBOL) != 0;
4463match_block.noteol = (options & PCRE_NOTEOL) != 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004464
4465match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
4466
4467/* Set the stack state to empty */
4468 match_block.off_num = match_block.offset_top = NULL;
4469 match_block.r1 = match_block.r2 = NULL;
4470 match_block.eptr = match_block.ecode = NULL;
4471 match_block.point = match_block.length = 0;
4472
Guido van Rossum50700601997-12-08 17:15:20 +00004473/* If the expression has got more back references than the offsets supplied can
4474hold, we get a temporary bit of working store to use during the matching.
Guido van Rossum557dea11997-12-22 22:46:52 +00004475Otherwise, we can use the vector supplied, rounding down its size to a multiple
4476of 2. */
Guido van Rossum50700601997-12-08 17:15:20 +00004477
Guido van Rossum557dea11997-12-22 22:46:52 +00004478ocount = offsetcount & (-2);
4479if (re->top_backref > 0 && re->top_backref >= ocount/2)
Guido van Rossum50700601997-12-08 17:15:20 +00004480 {
4481 ocount = re->top_backref * 2 + 2;
4482 match_block.offset_vector = (pcre_malloc)(ocount * sizeof(int));
4483 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
Guido van Rossum557dea11997-12-22 22:46:52 +00004484 using_temporary_offsets = TRUE;
4485 DPRINTF(("Got memory to hold back references\n"));
Guido van Rossum50700601997-12-08 17:15:20 +00004486 }
4487else match_block.offset_vector = offsets;
4488
4489match_block.offset_end = ocount;
4490match_block.offset_overflow = FALSE;
4491
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004492/* Compute the minimum number of offsets that we need to reset each time. Doing
4493this makes a huge difference to execution time when there aren't many brackets
4494in the pattern. */
4495
4496resetcount = 2 + re->top_bracket * 2;
Guido van Rossum50700601997-12-08 17:15:20 +00004497if (resetcount > offsetcount) resetcount = ocount;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004498
4499/* If MULTILINE is set at exec time but was not set at compile time, and the
4500anchored flag is set, we must re-check because a setting provoked by ^ in the
4501pattern is not right in multi-line mode. Calling is_anchored() again here does
4502the right check, because multiline is now set. If it now yields FALSE, the
4503expression must have had ^ starting some of its branches. Check to see if
4504that is true for *all* branches, and if so, set the startline flag. */
4505
Guido van Rossum557dea11997-12-22 22:46:52 +00004506if (match_block.multiline && anchored && (re->options & PCRE_MULTILINE) == 0 &&
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004507 !is_anchored(re->code, match_block.multiline))
4508 {
4509 anchored = FALSE;
4510 if (is_startline(re->code)) startline = TRUE;
4511 }
4512
4513/* Set up the first character to match, if available. The first_char value is
4514never set for an anchored regular expression, but the anchoring may be forced
4515at run time, so we have to test for anchoring. The first char may be unset for
4516an unanchored pattern, of course. If there's no first char and the pattern was
4517studied, the may be a bitmap of possible first characters. However, we can
4518use this only if the caseless state of the studying was correct. */
4519
4520if (!anchored)
4521 {
4522 if ((re->options & PCRE_FIRSTSET) != 0)
4523 {
4524 first_char = re->first_char;
4525 if (match_block.caseless) first_char = pcre_lcc[first_char];
4526 }
4527 else
4528 if (!startline && extra != NULL &&
4529 (extra->options & PCRE_STUDY_MAPPED) != 0 &&
4530 ((extra->options & PCRE_STUDY_CASELESS) != 0) == match_block.caseless)
4531 start_bits = extra->start_bits;
4532 }
4533
4534/* Loop for unanchored matches; for anchored regexps the loop runs just once. */
4535
4536do
4537 {
Guido van Rossum557dea11997-12-22 22:46:52 +00004538 int rc;
Guido van Rossum50700601997-12-08 17:15:20 +00004539 register int *iptr = match_block.offset_vector;
4540 register int *iend = iptr + resetcount;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004541
4542 /* Reset the maximum number of extractions we might see. */
4543
4544 while (iptr < iend) *iptr++ = -1;
4545
4546 /* Advance to a unique first char if possible */
4547
4548 if (first_char >= 0)
4549 {
4550 if (match_block.caseless)
4551 while (start_match < end_subject && pcre_lcc[*start_match] != first_char)
4552 start_match++;
4553 else
4554 while (start_match < end_subject && *start_match != first_char)
4555 start_match++;
4556 }
4557
4558 /* Or to just after \n for a multiline match if possible */
4559
4560 else if (startline)
4561 {
4562 if (start_match > match_block.start_subject)
4563 {
4564 while (start_match < end_subject && start_match[-1] != '\n')
4565 start_match++;
4566 }
4567 }
4568
4569 /* Or to a non-unique first char */
4570
4571 else if (start_bits != NULL)
4572 {
4573 while (start_match < end_subject)
4574 {
4575 register int c = *start_match;
Guido van Rossum50700601997-12-08 17:15:20 +00004576 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004577 }
4578 }
4579
Guido van Rossum557dea11997-12-22 22:46:52 +00004580#ifdef DEBUG /* Sigh. Some compilers never learn. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004581 printf(">>>> Match against: ");
4582 pchars(start_match, end_subject - start_match, TRUE, &match_block);
4583 printf("\n");
Guido van Rossum57ba4f31997-12-02 20:40:28 +00004584#endif
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004585
4586 /* When a match occurs, substrings will be set for all internal extractions;
4587 we just need to set up the whole thing as substring 0 before returning. If
Guido van Rossum50700601997-12-08 17:15:20 +00004588 there were too many extractions, set the return code to zero. In the case
4589 where we had to get some local store to hold offsets for backreferences, copy
4590 those back references that we can. In this case there need not be overflow
4591 if certain parts of the pattern were not used.
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004592
Guido van Rossum50700601997-12-08 17:15:20 +00004593 Before starting the match, we have to set up a longjmp() target to enable
Guido van Rossum557dea11997-12-22 22:46:52 +00004594 the "cut" operation to fail a match completely without backtracking. This
4595 is done in a separate function to avoid compiler warnings. We need not do
4596 it unless PCRE_EXTRA is set, since only in that case is the "cut" operation
4597 enabled. */
Guido van Rossum50700601997-12-08 17:15:20 +00004598
4599 /* To handle errors such as running out of memory for the failure
4600 stack, we need to save this location via setjmp(), so
4601 error-handling code can call longjmp() to jump out of deeply-nested code. */
4602 if (setjmp(match_block.error_env)==0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004603 {
Guido van Rossum50700601997-12-08 17:15:20 +00004604
Guido van Rossum557dea11997-12-22 22:46:52 +00004605 if ((re->options & PCRE_EXTRA) != 0)
Guido van Rossum50700601997-12-08 17:15:20 +00004606 {
Guido van Rossum557dea11997-12-22 22:46:52 +00004607 if (!match_with_setjmp(start_match, re->code, 2, &match_block))
4608 continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004609 }
Guido van Rossum557dea11997-12-22 22:46:52 +00004610 else if (!match(start_match, re->code, 2, &match_block)) continue;
4611
4612 /* Copy the offset information from temporary store if necessary */
4613
4614 if (using_temporary_offsets)
4615 {
4616 if (offsetcount >= 4)
4617 {
4618 memcpy(offsets + 2, match_block.offset_vector + 2,
4619 (offsetcount - 2) * sizeof(int));
4620 DPRINTF(("Copied offsets from temporary memory\n"));
4621 }
4622 if (match_block.end_offset_top > offsetcount)
4623 match_block.offset_overflow = TRUE;
4624
4625 DPRINTF(("Freeing temporary memory\n"));
4626 (pcre_free)(match_block.offset_vector);
4627 }
4628
4629 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
4630
4631 if (match_block.offset_end < 2) rc = 0; else
4632 {
4633 offsets[0] = start_match - match_block.start_subject;
4634 offsets[1] = match_block.end_match_ptr - match_block.start_subject;
4635 }
4636
4637 DPRINTF((">>>> returning %d\n", rc));
4638 free_stack(&match_block);
4639 return rc;
Guido van Rossum50700601997-12-08 17:15:20 +00004640 } /* End of (if setjmp(match_block.error_env)...) */
4641 /* Return an error code; pcremodule.c will preserve the exception */
4642 if (PyErr_Occurred()) return PCRE_ERROR_NOMEMORY;
4643
4644 free_stack(&match_block);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004645 }
4646while (!anchored &&
4647 match_block.errorcode == PCRE_ERROR_NOMATCH &&
4648 start_match++ < end_subject);
4649
Guido van Rossum557dea11997-12-22 22:46:52 +00004650if (using_temporary_offsets)
4651 {
4652 DPRINTF(("Freeing temporary memory\n"));
4653 (pcre_free)(match_block.offset_vector);
4654 }
4655
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004656#ifdef DEBUG
4657printf(">>>> returning %d\n", match_block.errorcode);
4658#endif
Guido van Rossum50700601997-12-08 17:15:20 +00004659
Guido van Rossum557dea11997-12-22 22:46:52 +00004660 return match_block.errorcode;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004661}
4662
4663/* End of pcre.c */