blob: 96baa89c03627facf62811d8713b99556829f668 [file] [log] [blame]
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001
2/*************************************************
3* Perl-Compatible Regular Expressions *
4*************************************************/
5
6/* DO NOT EDIT THIS FILE! */
7
8/* This file is automatically written by the merge-files.py script
9included with the PCRE distribution for Python; it's produced from
10several C files, and code is removed in the process. If you want to
11modify the code or track down bugs, it will be much easier to work
12with the code in its original, multiple-file form. Don't edit this
13file by hand, or submit patches to it.
14
15The Python-specific PCRE distribution can be retrieved from
16 http://starship.skyport.net/crew/amk/regex/
17
Guido van Rossum58132c61997-12-17 00:24:13 +000018The unmodified original PCRE distribution is available at
19ftp://ftp.cus.cam.ac.uk/pub/software/programs/pcre/, and is originally
20written by: Philip Hazel <ph10@cam.ac.uk>
Guido van Rossum51b3aa31997-10-06 14:43:11 +000021
22Extensively modified by the Python String-SIG: <string-sig@python.org>
23Send bug reports to: <string-sig@python.org>
24(They'll figure out if it's a bug in PCRE or in the Python-specific
25changes.)
26
27 Copyright (c) 1997 University of Cambridge
28
29-----------------------------------------------------------------------------
30Permission is granted to anyone to use this software for any purpose on any
31computer system, and to redistribute it freely, subject to the following
32restrictions:
33
341. This software is distributed in the hope that it will be useful,
35 but WITHOUT ANY WARRANTY; without even the implied warranty of
36 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
37
382. The origin of this software must not be misrepresented, either by
39 explicit claim or by omission.
40
413. Altered versions must be plainly marked as such, and must not be
42 misrepresented as being the original software.
43-----------------------------------------------------------------------------
44*/
45
46
47#define FOR_PYTHON
Guido van Rossum58132c61997-12-17 00:24:13 +000048#include "pcre-int.h"
Guido van Rossum51b3aa31997-10-06 14:43:11 +000049#include "Python.h"
Guido van Rossum50700601997-12-08 17:15:20 +000050#include "mymalloc.h"
51#include <ctype.h>
Guido van Rossum51b3aa31997-10-06 14:43:11 +000052#include "graminit.h"
53
54/*************************************************
55* Perl-Compatible Regular Expressions *
56*************************************************/
57
58/* This file is automatically written by the makechartables auxiliary
59program. If you edit it by hand, you might like to edit the Makefile to
60prevent its ever being regenerated. */
61
62/* This table is a lower casing table. */
63
64unsigned char pcre_lcc[] = {
65 0, 1, 2, 3, 4, 5, 6, 7,
66 8, 9, 10, 11, 12, 13, 14, 15,
67 16, 17, 18, 19, 20, 21, 22, 23,
68 24, 25, 26, 27, 28, 29, 30, 31,
69 32, 33, 34, 35, 36, 37, 38, 39,
70 40, 41, 42, 43, 44, 45, 46, 47,
71 48, 49, 50, 51, 52, 53, 54, 55,
72 56, 57, 58, 59, 60, 61, 62, 63,
73 64, 97, 98, 99,100,101,102,103,
74 104,105,106,107,108,109,110,111,
75 112,113,114,115,116,117,118,119,
76 120,121,122, 91, 92, 93, 94, 95,
77 96, 97, 98, 99,100,101,102,103,
78 104,105,106,107,108,109,110,111,
79 112,113,114,115,116,117,118,119,
80 120,121,122,123,124,125,126,127,
81 128,129,130,131,132,133,134,135,
82 136,137,138,139,140,141,142,143,
83 144,145,146,147,148,149,150,151,
84 152,153,154,155,156,157,158,159,
85 160,161,162,163,164,165,166,167,
86 168,169,170,171,172,173,174,175,
87 176,177,178,179,180,181,182,183,
88 184,185,186,187,188,189,190,191,
89 192,193,194,195,196,197,198,199,
90 200,201,202,203,204,205,206,207,
91 208,209,210,211,212,213,214,215,
92 216,217,218,219,220,221,222,223,
93 224,225,226,227,228,229,230,231,
94 232,233,234,235,236,237,238,239,
95 240,241,242,243,244,245,246,247,
96 248,249,250,251,252,253,254,255 };
97
Guido van Rossum50700601997-12-08 17:15:20 +000098/* This table is a case flipping table. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +000099
Guido van Rossum50700601997-12-08 17:15:20 +0000100unsigned char pcre_fcc[] = {
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000101 0, 1, 2, 3, 4, 5, 6, 7,
102 8, 9, 10, 11, 12, 13, 14, 15,
103 16, 17, 18, 19, 20, 21, 22, 23,
104 24, 25, 26, 27, 28, 29, 30, 31,
105 32, 33, 34, 35, 36, 37, 38, 39,
106 40, 41, 42, 43, 44, 45, 46, 47,
107 48, 49, 50, 51, 52, 53, 54, 55,
108 56, 57, 58, 59, 60, 61, 62, 63,
Guido van Rossum50700601997-12-08 17:15:20 +0000109 64, 97, 98, 99,100,101,102,103,
110 104,105,106,107,108,109,110,111,
111 112,113,114,115,116,117,118,119,
112 120,121,122, 91, 92, 93, 94, 95,
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000113 96, 65, 66, 67, 68, 69, 70, 71,
114 72, 73, 74, 75, 76, 77, 78, 79,
115 80, 81, 82, 83, 84, 85, 86, 87,
116 88, 89, 90,123,124,125,126,127,
117 128,129,130,131,132,133,134,135,
118 136,137,138,139,140,141,142,143,
119 144,145,146,147,148,149,150,151,
120 152,153,154,155,156,157,158,159,
121 160,161,162,163,164,165,166,167,
122 168,169,170,171,172,173,174,175,
123 176,177,178,179,180,181,182,183,
124 184,185,186,187,188,189,190,191,
125 192,193,194,195,196,197,198,199,
126 200,201,202,203,204,205,206,207,
127 208,209,210,211,212,213,214,215,
128 216,217,218,219,220,221,222,223,
129 224,225,226,227,228,229,230,231,
130 232,233,234,235,236,237,238,239,
131 240,241,242,243,244,245,246,247,
132 248,249,250,251,252,253,254,255 };
133
Guido van Rossum50700601997-12-08 17:15:20 +0000134/* This table contains bit maps for digits, letters, 'word' chars, and
135white space. Each map is 32 bytes long and the bits run from the least
136significant end of each byte. */
137
138unsigned char pcre_cbits[] = {
139 0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
140 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
141 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
142 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
143
144 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
145 0xfe,0xff,0xff,0x07,0xfe,0xff,0xff,0x07,
146 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
147 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
148
149 0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
150 0xfe,0xff,0xff,0x87,0xfe,0xff,0xff,0x07,
151 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
152 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
153
154 0x00,0x3e,0x00,0x00,0x01,0x00,0x00,0x00,
155 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
156 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
157 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 };
158
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000159/* This table identifies various classes of character by individual bits:
Guido van Rossum50700601997-12-08 17:15:20 +0000160 0x01 white space character
161 0x02 letter
162 0x04 decimal digit
163 0x08 hexadecimal digit
164 0x10 alphanumeric or '_'
165 0x80 regular expression metacharacter or binary zero
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000166*/
167
168unsigned char pcre_ctypes[] = {
169 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
170 0x00,0x01,0x01,0x01,0x01,0x01,0x00,0x00, /* 8- 15 */
171 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
172 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
173 0x01,0x00,0x00,0x00,0x80,0x00,0x00,0x00, /* - ' */
174 0x80,0x80,0x80,0x80,0x00,0x00,0x80,0x00, /* ( - / */
Guido van Rossum50700601997-12-08 17:15:20 +0000175 0x3c,0x3c,0x3c,0x3c,0x3c,0x3c,0x3c,0x3c, /* 0 - 7 */
176 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x80, /* 8 - ? */
177 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* @ - G */
178 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* H - O */
179 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* P - W */
180 0x12,0x12,0x12,0x80,0x00,0x00,0x80,0x10, /* X - _ */
181 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* ` - g */
182 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* h - o */
183 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* p - w */
184 0x12,0x12,0x12,0x80,0x80,0x00,0x00,0x00, /* x -127 */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000185 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
186 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
187 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
188 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
189 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
190 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
191 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
192 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
193 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
194 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
195 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
196 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
197 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
198 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
199 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
200 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
201
Guido van Rossum50700601997-12-08 17:15:20 +0000202/* End of chartables.c */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000203/*************************************************
204* Perl-Compatible Regular Expressions *
205*************************************************/
206
207/*
208This is a library of functions to support regular expressions whose syntax
209and semantics are as close as possible to those of the Perl 5 language. See
210the file Tech.Notes for some information on the internals.
211
212Written by: Philip Hazel <ph10@cam.ac.uk>
213
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000214 Copyright (c) 1998 University of Cambridge
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000215
216-----------------------------------------------------------------------------
217Permission is granted to anyone to use this software for any purpose on any
218computer system, and to redistribute it freely, subject to the following
219restrictions:
220
2211. This software is distributed in the hope that it will be useful,
222 but WITHOUT ANY WARRANTY; without even the implied warranty of
223 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
224
2252. The origin of this software must not be misrepresented, either by
226 explicit claim or by omission.
227
2283. Altered versions must be plainly marked as such, and must not be
229 misrepresented as being the original software.
230-----------------------------------------------------------------------------
231*/
232
233
234/* Include the internals header, which itself includes Standard C headers plus
235the external pcre header. */
236
237
238
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000239
240/*************************************************
241* Create bitmap of starting chars *
242*************************************************/
243
244/* This function scans a compiled unanchored expression and attempts to build a
245bitmap of the set of initial characters. If it can't, it returns FALSE. As time
246goes by, we may be able to get more clever at doing this.
247
248Arguments:
249 code points to an expression
250 start_bits points to a 32-byte table, initialized to 0
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000251
252Returns: TRUE if table built, FALSE otherwise
253*/
254
255static BOOL
Guido van Rossum58132c61997-12-17 00:24:13 +0000256set_start_bits(const uschar *code, uschar *start_bits)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000257{
Guido van Rossum50700601997-12-08 17:15:20 +0000258register int c;
259
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000260do
261 {
Guido van Rossum58132c61997-12-17 00:24:13 +0000262 const uschar *tcode = code + 3;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000263 BOOL try_next = TRUE;
264
265 while (try_next)
266 {
267 try_next = FALSE;
268
269 if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
270 {
Guido van Rossum50700601997-12-08 17:15:20 +0000271 if (!set_start_bits(tcode, start_bits)) return FALSE;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000272 }
273
274 else switch(*tcode)
275 {
276 default:
277 return FALSE;
278
279 /* BRAZERO does the bracket, but carries on. */
280
281 case OP_BRAZERO:
282 case OP_BRAMINZERO:
Guido van Rossum50700601997-12-08 17:15:20 +0000283 if (!set_start_bits(++tcode, start_bits)) return FALSE;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000284 do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
285 tcode += 3;
286 try_next = TRUE;
287 break;
288
289 /* Single-char * or ? sets the bit and tries the next item */
290
291 case OP_STAR:
292 case OP_MINSTAR:
293 case OP_QUERY:
294 case OP_MINQUERY:
Guido van Rossum50700601997-12-08 17:15:20 +0000295 start_bits[tcode[1]/8] |= (1 << (tcode[1]&7));
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000296 tcode += 2;
297 try_next = TRUE;
298 break;
299
300 /* Single-char upto sets the bit and tries the next */
301
302 case OP_UPTO:
303 case OP_MINUPTO:
Guido van Rossum50700601997-12-08 17:15:20 +0000304 start_bits[tcode[3]/8] |= (1 << (tcode[3]&7));
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000305 tcode += 4;
306 try_next = TRUE;
307 break;
308
309 /* At least one single char sets the bit and stops */
310
311 case OP_EXACT: /* Fall through */
312 tcode++;
313
314 case OP_CHARS: /* Fall through */
315 tcode++;
316
317 case OP_PLUS:
318 case OP_MINPLUS:
Guido van Rossum50700601997-12-08 17:15:20 +0000319 start_bits[tcode[1]/8] |= (1 << (tcode[1]&7));
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000320 break;
321
322 /* Single character type sets the bits and stops */
323
324 case OP_NOT_DIGIT:
Guido van Rossum50700601997-12-08 17:15:20 +0000325 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_digit];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000326 break;
327
328 case OP_DIGIT:
Guido van Rossum50700601997-12-08 17:15:20 +0000329 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_digit];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000330 break;
331
332 case OP_NOT_WHITESPACE:
Guido van Rossum50700601997-12-08 17:15:20 +0000333 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_space];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000334 break;
335
336 case OP_WHITESPACE:
Guido van Rossum50700601997-12-08 17:15:20 +0000337 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_space];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000338 break;
339
340 case OP_NOT_WORDCHAR:
Guido van Rossum50700601997-12-08 17:15:20 +0000341 for (c = 0; c < 32; c++)
342 start_bits[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000343 break;
344
345 case OP_WORDCHAR:
Guido van Rossum50700601997-12-08 17:15:20 +0000346 for (c = 0; c < 32; c++)
347 start_bits[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000348 break;
349
350 /* One or more character type fudges the pointer and restarts, knowing
351 it will hit a single character type and stop there. */
352
353 case OP_TYPEPLUS:
354 case OP_TYPEMINPLUS:
355 tcode++;
356 try_next = TRUE;
357 break;
358
359 case OP_TYPEEXACT:
360 tcode += 3;
361 try_next = TRUE;
362 break;
363
364 /* Zero or more repeats of character types set the bits and then
365 try again. */
366
367 case OP_TYPEUPTO:
368 case OP_TYPEMINUPTO:
369 tcode += 2; /* Fall through */
370
371 case OP_TYPESTAR:
372 case OP_TYPEMINSTAR:
373 case OP_TYPEQUERY:
374 case OP_TYPEMINQUERY:
375 switch(tcode[1])
376 {
377 case OP_NOT_DIGIT:
Guido van Rossum50700601997-12-08 17:15:20 +0000378 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_digit];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000379 break;
380
381 case OP_DIGIT:
Guido van Rossum50700601997-12-08 17:15:20 +0000382 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_digit];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000383 break;
384
385 case OP_NOT_WHITESPACE:
Guido van Rossum50700601997-12-08 17:15:20 +0000386 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_space];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000387 break;
388
389 case OP_WHITESPACE:
Guido van Rossum50700601997-12-08 17:15:20 +0000390 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_space];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000391 break;
392
393 case OP_NOT_WORDCHAR:
Guido van Rossum50700601997-12-08 17:15:20 +0000394 for (c = 0; c < 32; c++)
395 start_bits[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000396 break;
397
398 case OP_WORDCHAR:
Guido van Rossum50700601997-12-08 17:15:20 +0000399 for (c = 0; c < 32; c++)
400 start_bits[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000401 break;
402 }
403
404 tcode += 2;
405 try_next = TRUE;
406 break;
407
408 /* Character class: set the bits and either carry on or not,
409 according to the repeat count. */
410
411 case OP_CLASS:
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000412 case OP_NEGCLASS:
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000413 {
Guido van Rossum50700601997-12-08 17:15:20 +0000414 tcode++;
415 for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
416 tcode += 32;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000417 switch (*tcode)
418 {
419 case OP_CRSTAR:
420 case OP_CRMINSTAR:
421 case OP_CRQUERY:
422 case OP_CRMINQUERY:
423 tcode++;
424 try_next = TRUE;
425 break;
426
427 case OP_CRRANGE:
428 case OP_CRMINRANGE:
429 if (((tcode[1] << 8) + tcode[2]) == 0)
430 {
431 tcode += 5;
432 try_next = TRUE;
433 }
434 break;
435 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000436 }
437 break; /* End of class handling */
438
439 } /* End of switch */
440 } /* End of try_next loop */
441
442 code += (code[1] << 8) + code[2]; /* Advance to next branch */
443 }
444while (*code == OP_ALT);
445return TRUE;
446}
447
448
449
450/*************************************************
451* Study a compiled expression *
452*************************************************/
453
454/* This function is handed a compiled expression that it must study to produce
455information that will speed up the matching. It returns a pcre_extra block
456which then gets handed back to pcre_exec().
457
458Arguments:
459 re points to the compiled expression
460 options contains option bits
461 errorptr points to where to place error messages;
462 set NULL unless error
463
464Returns: pointer to a pcre_extra block,
465 NULL on error or if no optimization possible
466*/
467
468pcre_extra *
Guido van Rossum58132c61997-12-17 00:24:13 +0000469pcre_study(const pcre *external_re, int options, const char **errorptr)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000470{
471BOOL caseless;
472uschar start_bits[32];
473real_pcre_extra *extra;
Guido van Rossum58132c61997-12-17 00:24:13 +0000474const real_pcre *re = (const real_pcre *)external_re;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000475
476*errorptr = NULL;
477
478if (re == NULL || re->magic_number != MAGIC_NUMBER)
479 {
480 *errorptr = "argument is not a compiled regular expression";
481 return NULL;
482 }
483
484if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
485 {
486 *errorptr = "unknown or incorrect option bit(s) set";
487 return NULL;
488 }
489
Guido van Rossum50700601997-12-08 17:15:20 +0000490/* Caseless can either be from the compiled regex or from options. */
491
492caseless = ((re->options | options) & PCRE_CASELESS) != 0;
493
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000494/* For an anchored pattern, or an unchored pattern that has a first char, or a
495multiline pattern that matches only at "line starts", no further processing at
496present. */
497
498if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
499 return NULL;
500
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000501/* See if we can find a fixed set of initial characters for the pattern. */
502
Guido van Rossum50700601997-12-08 17:15:20 +0000503memset(start_bits, 0, 32 * sizeof(uschar));
504if (!set_start_bits(re->code, start_bits)) return NULL;
505
506/* If this studying is caseless, scan the created bit map and duplicate the
507bits for any letters. */
508
509if (caseless)
510 {
511 register int c;
512 for (c = 0; c < 256; c++)
513 {
514 if ((start_bits[c/8] & (1 << (c&7))) != 0 &&
515 (pcre_ctypes[c] & ctype_letter) != 0)
516 {
517 int d = pcre_fcc[c];
518 start_bits[d/8] |= (1 << (d&7));
519 }
520 }
521 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000522
523/* Get an "extra" block and put the information therein. */
524
525extra = (real_pcre_extra *)(pcre_malloc)(sizeof(real_pcre_extra));
526
527if (extra == NULL)
528 {
529 *errorptr = "failed to get memory";
530 return NULL;
531 }
Guido van Rossum50700601997-12-08 17:15:20 +0000532
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000533extra->options = PCRE_STUDY_MAPPED | (caseless? PCRE_STUDY_CASELESS : 0);
Guido van Rossum50700601997-12-08 17:15:20 +0000534memcpy(extra->start_bits, start_bits, sizeof(start_bits));
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000535
536return (pcre_extra *)extra;
537}
538
Guido van Rossum50700601997-12-08 17:15:20 +0000539/* End of study.c */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000540/*************************************************
541* Perl-Compatible Regular Expressions *
542*************************************************/
543
544/*
545This is a library of functions to support regular expressions whose syntax
546and semantics are as close as possible to those of the Perl 5 language. See
547the file Tech.Notes for some information on the internals.
548
549Written by: Philip Hazel <ph10@cam.ac.uk>
550
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000551 Copyright (c) 1998 University of Cambridge
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000552
553-----------------------------------------------------------------------------
554Permission is granted to anyone to use this software for any purpose on any
555computer system, and to redistribute it freely, subject to the following
556restrictions:
557
5581. This software is distributed in the hope that it will be useful,
559 but WITHOUT ANY WARRANTY; without even the implied warranty of
560 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
561
5622. The origin of this software must not be misrepresented, either by
563 explicit claim or by omission.
564
5653. Altered versions must be plainly marked as such, and must not be
566 misrepresented as being the original software.
567-----------------------------------------------------------------------------
568*/
569
570
571/* Define DEBUG to get debugging output on stdout. */
572
573/* #define DEBUG */
574
Guido van Rossum557dea11997-12-22 22:46:52 +0000575/* Use a macro for debugging printing, 'cause that eliminates the the use
576of #ifdef inline, and there are *still* stupid compilers about that don't like
577indented pre-processor statements. I suppose it's only been 10 years... */
578
579#ifdef DEBUG
580#define DPRINTF(p) printf p
581#else
582#define DPRINTF(p) /*nothing*/
583#endif
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000584
585/* Include the internals header, which itself includes Standard C headers plus
586the external pcre header. */
587
588
Guido van Rossum50700601997-12-08 17:15:20 +0000589
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000590
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000591#ifndef Py_eval_input
592/* For Python 1.4, graminit.h has to be explicitly included */
593#define Py_eval_input eval_input
Guido van Rossum50700601997-12-08 17:15:20 +0000594
595#endif /* FOR_PYTHON */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000596
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000597/* Allow compilation as C++ source code, should anybody want to do that. */
598
599#ifdef __cplusplus
600#define class pcre_class
601#endif
602
603
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000604/* Min and max values for the common repeats; for the maxima, 0 => infinity */
605
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000606static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
607static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000608
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000609/* Text forms of OP_ values and things, for debugging (not all used) */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000610
611#ifdef DEBUG
Guido van Rossum58132c61997-12-17 00:24:13 +0000612static const char *OP_names[] = {
613 "End", "\\A", "\\B", "\\b", "\\D", "\\d",
Guido van Rossum50700601997-12-08 17:15:20 +0000614 "\\S", "\\s", "\\W", "\\w", "Cut", "\\Z",
615 "localized \\B", "localized \\b", "localized \\W", "localized \\w",
616 "^", "$", "Any", "chars",
617 "not",
618 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000619 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
620 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
621 "*", "*?", "+", "+?", "?", "??", "{", "{",
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000622 "class", "negclass", "classL", "Ref",
Guido van Rossum50700601997-12-08 17:15:20 +0000623 "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", "Once",
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000624 "Brazero", "Braminzero", "Bra"
625};
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000626#endif
627
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000628/* Table for handling escaped characters in the range '0'-'z'. Positive returns
629are simple data values; negative values are for special things like \d and so
630on. Zero means further processing is needed (for things like \x), or the escape
631is invalid. */
632
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000633static const short int escapes[] = {
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000634 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
635 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
636 '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
637 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
638 0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
639 0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
640 '`', 7, -ESC_b, 0, -ESC_d, 0, '\f', 0, /* ` - g */
641 0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
642 0, 0, '\r', -ESC_s, '\t', 0, '\v', -ESC_w, /* p - w */
643 0, 0, 0 /* x - z */
644};
645
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000646/* Definition to allow mutual recursion */
647
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000648static BOOL
649compile_regex(int, int *, uschar **, const uschar **, const char **,
650 PyObject *);
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000651
652/* Structure for passing "static" information around between the functions
653doing the matching, so that they are thread-safe. */
654
655typedef struct match_data {
656 int errorcode; /* As it says */
657 int *offset_vector; /* Offset vector */
658 int offset_end; /* One past the end */
659 BOOL offset_overflow; /* Set if too many extractions */
660 BOOL caseless; /* Case-independent flag */
Guido van Rossum50700601997-12-08 17:15:20 +0000661 BOOL runtime_caseless; /* Caseless forced at run time */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000662 BOOL multiline; /* Multiline flag */
Guido van Rossum50700601997-12-08 17:15:20 +0000663 BOOL notbol; /* NOTBOL flag */
664 BOOL noteol; /* NOTEOL flag */
665 BOOL dotall; /* Dot matches any char */
666 BOOL endonly; /* Dollar not before final \n */
Guido van Rossum58132c61997-12-17 00:24:13 +0000667 const uschar *start_subject; /* Start of the subject string */
668 const uschar *end_subject; /* End of the subject string */
Guido van Rossum50700601997-12-08 17:15:20 +0000669 jmp_buf fail_env; /* Environment for longjump() break out */
Guido van Rossum58132c61997-12-17 00:24:13 +0000670 const uschar *end_match_ptr; /* Subject position at end match */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000671 int end_offset_top; /* Highwater mark at end of match */
Guido van Rossum50700601997-12-08 17:15:20 +0000672 jmp_buf error_env; /* For longjmp() if an error occurs deep inside a
673 matching operation */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000674 int length; /* Length of the allocated stacks */
675 int point; /* Point to add next item pushed onto stacks */
676 /* Pointers to the 6 stacks */
677 int *off_num, *offset_top, *r1, *r2;
Guido van Rossum58132c61997-12-17 00:24:13 +0000678 const uschar **eptr, **ecode;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000679} match_data;
680
681
682
683/*************************************************
Guido van Rossum50700601997-12-08 17:15:20 +0000684* Global variables *
685*************************************************/
686
687/* PCRE is thread-clean and doesn't use any global variables in the normal
688sense. However, it calls memory allocation and free functions via the two
689indirections below, which are can be changed by the caller, but are shared
690between all threads. */
691
692void *(*pcre_malloc)(size_t) = malloc;
693void (*pcre_free)(void *) = free;
694
695
696
697
698/*************************************************
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000699* Return version string *
700*************************************************/
701
Guido van Rossum58132c61997-12-17 00:24:13 +0000702const char *
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000703pcre_version(void)
704{
705return PCRE_VERSION;
706}
707
708
709
710
711/*************************************************
712* Return info about a compiled pattern *
713*************************************************/
714
715/* This function picks potentially useful data out of the private
716structure.
717
718Arguments:
719 external_re points to compiled code
720 optptr where to pass back the options
721 first_char where to pass back the first character,
722 or -1 if multiline and all branches start ^,
723 or -2 otherwise
724
725Returns: number of identifying extraction brackets
726 or negative values on error
727*/
728
729int
Guido van Rossum50700601997-12-08 17:15:20 +0000730pcre_info(const pcre *external_re, int *optptr, int *first_char)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000731{
Guido van Rossum58132c61997-12-17 00:24:13 +0000732const real_pcre *re = (real_pcre *)external_re;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000733if (re == NULL) return PCRE_ERROR_NULL;
734if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
735if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);
736if (first_char != NULL)
737 *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
738 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
739return re->top_bracket;
740}
741
742
743
744
745#ifdef DEBUG
746/*************************************************
747* Debugging function to print chars *
748*************************************************/
749
750/* Print a sequence of chars in printable format, stopping at the end of the
751subject if the requested.
752
753Arguments:
754 p points to characters
755 length number to print
756 is_subject TRUE if printing from within md->start_subject
757 md pointer to matching data block, if is_subject is TRUE
758
759Returns: nothing
760*/
761
Guido van Rossum557dea11997-12-22 22:46:52 +0000762static void
763pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000764{
765int c;
766if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
767while (length-- > 0)
768 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
769}
770#endif
771
772
773
774
775/*************************************************
776* Check subpattern for empty operand *
777*************************************************/
778
779/* This function checks a bracketed subpattern to see if any of the paths
780through it could match an empty string. This is used to diagnose an error if
781such a subpattern is followed by a quantifier with an unlimited upper bound.
782
783Argument:
784 code points to the opening bracket
785
786Returns: TRUE or FALSE
787*/
788
789static BOOL
790could_be_empty(uschar *code)
791{
792do {
793 uschar *cc = code + 3;
794
795 /* Scan along the opcodes for this branch; as soon as we find something
796 that matches a non-empty string, break out and advance to test the next
797 branch. If we get to the end of the branch, return TRUE for the whole
798 sub-expression. */
799
800 for (;;)
801 {
802 /* Test an embedded subpattern; if it could not be empty, break the
803 loop. Otherwise carry on in the branch. */
804
Guido van Rossum50700601997-12-08 17:15:20 +0000805 if ((int)(*cc) >= OP_BRA || (int)(*cc) == OP_ONCE)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000806 {
807 if (!could_be_empty(cc)) break;
808 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
809 cc += 3;
810 }
811
812 else switch (*cc)
813 {
814 /* Reached end of a branch: the subpattern may match the empty string */
815
816 case OP_ALT:
817 case OP_KET:
818 case OP_KETRMAX:
819 case OP_KETRMIN:
820 return TRUE;
821
Guido van Rossumd0f432b1998-02-20 21:45:14 +0000822 /* Skip over entire bracket groups with zero lower bound */
823
824 case OP_BRAZERO:
825 case OP_BRAMINZERO:
826 cc++;
827 /* Fall through */
828
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000829 /* Skip over assertive subpatterns */
830
831 case OP_ASSERT:
832 case OP_ASSERT_NOT:
833 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
834 cc += 3;
835 break;
836
837 /* Skip over things that don't match chars */
838
839 case OP_SOD:
840 case OP_EOD:
841 case OP_CIRC:
842 case OP_DOLL:
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000843 case OP_NOT_WORD_BOUNDARY:
844 case OP_WORD_BOUNDARY:
Guido van Rossum50700601997-12-08 17:15:20 +0000845 case OP_NOT_WORD_BOUNDARY_L:
846 case OP_WORD_BOUNDARY_L:
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000847 cc++;
848 break;
849
850 /* Skip over simple repeats with zero lower bound */
851
852 case OP_STAR:
853 case OP_MINSTAR:
854 case OP_QUERY:
855 case OP_MINQUERY:
Guido van Rossum50700601997-12-08 17:15:20 +0000856 case OP_NOTSTAR:
857 case OP_NOTMINSTAR:
858 case OP_NOTQUERY:
859 case OP_NOTMINQUERY:
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000860 case OP_TYPESTAR:
861 case OP_TYPEMINSTAR:
862 case OP_TYPEQUERY:
863 case OP_TYPEMINQUERY:
864 cc += 2;
865 break;
866
867 /* Skip over UPTOs (lower bound is zero) */
868
869 case OP_UPTO:
870 case OP_MINUPTO:
871 case OP_TYPEUPTO:
872 case OP_TYPEMINUPTO:
873 cc += 4;
874 break;
875
876 /* Check a class or a back reference for a zero minimum */
877
878 case OP_CLASS:
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000879 case OP_NEGCLASS:
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000880 case OP_REF:
Guido van Rossum50700601997-12-08 17:15:20 +0000881 case OP_CLASS_L:
882 switch(*cc)
883 {
884 case (OP_REF): cc += 2; break;
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000885 case (OP_CLASS): case (OP_NEGCLASS): cc += 1+32; break;
Guido van Rossum50700601997-12-08 17:15:20 +0000886 case (OP_CLASS_L): cc += 1+1+32; break;
887 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000888
889 switch (*cc)
890 {
891 case OP_CRSTAR:
892 case OP_CRMINSTAR:
893 case OP_CRQUERY:
894 case OP_CRMINQUERY:
895 cc++;
896 break;
897
898 case OP_CRRANGE:
899 case OP_CRMINRANGE:
900 if ((cc[1] << 8) + cc[2] != 0) goto NEXT_BRANCH;
901 cc += 3;
902 break;
903
904 default:
905 goto NEXT_BRANCH;
906 }
907 break;
908
909 /* Anything else matches at least one character */
910
911 default:
912 goto NEXT_BRANCH;
913 }
914 }
915
916 NEXT_BRANCH:
917 code += (code[1] << 8) + code[2];
918 }
919while (*code == OP_ALT);
920
921/* No branches match the empty string */
922
923return FALSE;
924}
925
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000926/* Determine the length of a group ID in an expression like
927 (?P<foo_123>...)
928Arguments:
929 ptr pattern position pointer (say that 3 times fast)
930 finalchar the character that will mark the end of the ID
931 errorptr points to the pointer to the error message
932*/
933
934static int
Guido van Rossum58132c61997-12-17 00:24:13 +0000935get_group_id(const uschar *ptr, char finalchar, const char **errorptr)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000936{
Guido van Rossum58132c61997-12-17 00:24:13 +0000937 const uschar *start = ptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000938
939 /* If the first character is not in \w, or is in \w but is a digit,
940 report an error */
941 if (!(pcre_ctypes[*ptr] & ctype_word) ||
942 (pcre_ctypes[*ptr++] & ctype_digit))
943 {
944 *errorptr = "(?P identifier must start with a letter or underscore";
945 return 0;
946 }
947
948 /* Increment ptr until we either hit a null byte, the desired
949 final character, or a non-word character */
950 for(; (*ptr != 0) && (*ptr != finalchar) &&
951 (pcre_ctypes[*ptr] & ctype_word); ptr++)
952 {
Guido van Rossumc3861071997-10-08 02:07:40 +0000953 /* Empty loop body */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000954 }
955 if (*ptr==finalchar)
956 return ptr-start;
957 if (*ptr==0)
958 {
959 *errorptr = "unterminated (?P identifier";
960 return 0;
961 }
962 *errorptr = "illegal character in (?P identifier";
963 return 0;
964}
965
966/*************************************************
967* Handle escapes *
968*************************************************/
969
970/* This function is called when a \ has been encountered. It either returns a
971positive value for a simple escape such as \n, or a negative value which
972encodes one of the more complicated things such as \d. On entry, ptr is
973pointing at the \. On exit, it is on the final character of the escape
974sequence.
975
976Arguments:
977 ptrptr points to the pattern position pointer
978 errorptr points to the pointer to the error message
Guido van Rossum50700601997-12-08 17:15:20 +0000979 bracount number of previous extracting brackets
980 options the options bits
981 isclass TRUE if inside a character class
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000982
983Returns: zero or positive => a data character
984 negative => a special escape sequence
985 on error, errorptr is set
986*/
987
988static int
Guido van Rossum58132c61997-12-17 00:24:13 +0000989check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
990 int options, BOOL isclass)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000991{
Guido van Rossum58132c61997-12-17 00:24:13 +0000992const uschar *ptr = *ptrptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000993int c = *(++ptr) & 255; /* Ensure > 0 on signed-char systems */
994int i;
995
Guido van Rossum50700601997-12-08 17:15:20 +0000996if (c == 0) *errorptr = ERR1;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000997
998/* Digits or letters may have special meaning; all others are literals. */
999
1000else if (c < '0' || c > 'z') {}
1001
1002/* Do an initial lookup in a table. A non-zero result is something that can be
1003returned immediately. Otherwise further processing may be required. */
1004
1005else if ((i = escapes[c - '0']) != 0) c = i;
1006
1007/* Escapes that need further processing, or are illegal. */
1008
Guido van Rossum50700601997-12-08 17:15:20 +00001009else
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001010 {
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001011
Guido van Rossum50700601997-12-08 17:15:20 +00001012 switch (c)
1013 {
1014 /* The handling of escape sequences consisting of a string of digits
1015 starting with one that is not zero is not straightforward. By experiment,
1016 the way Perl works seems to be as follows:
1017
1018 Outside a character class, the digits are read as a decimal number. If the
1019 number is less than 10, or if there are that many previous extracting
1020 left brackets, then it is a back reference. Otherwise, up to three octal
1021 digits are read to form an escaped byte. Thus \123 is likely to be octal
1022 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
1023 value is greater than 377, the least significant 8 bits are taken. Inside a
1024 character class, \ followed by a digit is always an octal number. */
1025
1026 case '1': case '2': case '3': case '4': case '5':
1027 case '6': case '7': case '8': case '9':
1028
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001029 {
1030 /* PYTHON: Try to compute an octal value for a character */
Guido van Rossum042ff9e1998-04-03 21:13:31 +00001031 for(c=0, i=0; ptr[i]!=0 && i<3; i++)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001032 {
1033 if (( pcre_ctypes[ ptr[i] ] & ctype_odigit) != 0)
1034 c = c * 8 + ptr[i]-'0';
1035 else
Guido van Rossum042ff9e1998-04-03 21:13:31 +00001036 break; /* Non-octal character--break out of the loop */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001037 }
Guido van Rossum042ff9e1998-04-03 21:13:31 +00001038 /* It's a character if there were exactly 3 octal digits, or if
1039 we're inside a character class and there was at least one
1040 octal digit. */
1041 if ( (i == 3) || (isclass && i!=0) )
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001042 {
1043 ptr += i-1;
1044 break;
1045 }
1046 c = ptr[0]; /* Restore the first character after the \ */
1047 c -= '0'; i = 1;
1048 while (i<2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0)
1049 {
1050 c = c * 10 + ptr[1] - '0';
1051 ptr++; i++;
1052 }
1053 if (c > 255 - ESC_REF) *errorptr = "back reference too big";
1054 c = -(ESC_REF + c);
1055 }
1056 break;
1057
Guido van Rossum50700601997-12-08 17:15:20 +00001058 /* \0 always starts an octal number, but we may drop through to here with a
1059 larger first octal digit */
1060
1061 case '0':
1062 c -= '0';
1063 while(i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0 &&
1064 ptr[1] != '8' && ptr[1] != '9')
1065 c = c * 8 + *(++ptr) - '0';
1066 break;
1067
1068 /* Special escapes not starting with a digit are straightforward */
1069
1070 case 'x':
1071 c = 0;
1072 while ( (pcre_ctypes[ptr[1]] & ctype_xdigit) != 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001073 {
Guido van Rossum50700601997-12-08 17:15:20 +00001074 ptr++;
1075 c = c * 16 + pcre_lcc[*ptr] -
1076 (((pcre_ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
1077 c &= 255;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001078 }
1079 break;
1080
1081
Guido van Rossum50700601997-12-08 17:15:20 +00001082 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1083 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
1084 for Perl compatibility, it is a literal. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001085
Guido van Rossum50700601997-12-08 17:15:20 +00001086 default:
1087 if ((options & PCRE_EXTRA) != 0) switch(c)
1088 {
1089 case 'X':
1090 c = -ESC_X; /* This could be a lookup if it ever got into Perl */
1091 break;
1092
1093 default:
1094 *errorptr = ERR3;
1095 break;
1096 }
1097 break;
1098 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001099 }
1100
1101*ptrptr = ptr;
1102return c;
1103}
1104
1105
1106
1107/*************************************************
Guido van Rossum50700601997-12-08 17:15:20 +00001108* Check for counted repeat *
1109*************************************************/
1110
1111/* This function is called when a '{' is encountered in a place where it might
1112start a quantifier. It looks ahead to see if it really is a quantifier or not.
1113It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
1114where the ddds are digits.
1115
1116Arguments:
1117 p pointer to the first char after '{'
1118
1119Returns: TRUE or FALSE
1120*/
1121
1122static BOOL
Guido van Rossum58132c61997-12-17 00:24:13 +00001123is_counted_repeat(const uschar *p)
Guido van Rossum50700601997-12-08 17:15:20 +00001124{
1125if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
1126while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
1127if (*p == '}') return TRUE;
1128
1129if (*p++ != ',') return FALSE;
1130if (*p == '}') return TRUE;
1131
1132if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
1133while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
1134return (*p == '}');
1135}
1136
1137
1138
1139/*************************************************
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001140* Read repeat counts *
1141*************************************************/
1142
Guido van Rossum50700601997-12-08 17:15:20 +00001143/* Read an item of the form {n,m} and return the values. This is called only
1144after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1145so the syntax is guaranteed to be correct, but we need to check the values.
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001146
1147Arguments:
1148 p pointer to first char after '{'
1149 minp pointer to int for min
1150 maxp pointer to int for max
1151 returned as -1 if no max
1152 errorptr points to pointer to error message
1153
1154Returns: pointer to '}' on success;
1155 current ptr on error, with errorptr set
1156*/
1157
Guido van Rossum58132c61997-12-17 00:24:13 +00001158static const uschar *
1159read_repeat_counts(const uschar *p, int *minp, int *maxp, const char **errorptr)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001160{
1161int min = 0;
1162int max = -1;
1163
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001164while ((pcre_ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
1165
1166if (*p == '}') max = min; else
1167 {
Guido van Rossum50700601997-12-08 17:15:20 +00001168 if (*(++p) != '}')
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001169 {
1170 max = 0;
1171 while((pcre_ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001172 if (max < min)
1173 {
Guido van Rossum50700601997-12-08 17:15:20 +00001174 *errorptr = ERR4;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001175 return p;
1176 }
1177 }
1178 }
1179
1180/* Do paranoid checks, then fill in the required variables, and pass back the
1181pointer to the terminating '}'. */
1182
Guido van Rossum50700601997-12-08 17:15:20 +00001183if (min > 65535 || max > 65535)
1184 *errorptr = ERR5;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001185else
1186 {
1187 *minp = min;
1188 *maxp = max;
1189 }
1190return p;
1191}
1192
1193
1194
1195/*************************************************
1196* Compile one branch *
1197*************************************************/
1198
1199/* Scan the pattern, compiling it into the code vector.
1200
1201Arguments:
Guido van Rossum50700601997-12-08 17:15:20 +00001202 options the option bits
1203 bracket points to number of brackets used
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001204 code points to the pointer to the current code point
1205 ptrptr points to the current pattern pointer
1206 errorptr points to pointer to error message
1207
1208Returns: TRUE on success
1209 FALSE, with *errorptr set on error
1210*/
1211
1212static BOOL
Guido van Rossum50700601997-12-08 17:15:20 +00001213compile_branch(int options, int *brackets, uschar **codeptr,
Guido van Rossum58132c61997-12-17 00:24:13 +00001214 const uschar **ptrptr, const char **errorptr, PyObject *dictionary)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001215{
1216int repeat_type, op_type;
1217int repeat_min, repeat_max;
1218int bravalue, length;
Guido van Rossumdda66961998-05-07 15:32:44 +00001219int greedy_default, greedy_non_default;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001220register int c;
1221register uschar *code = *codeptr;
Guido van Rossum58132c61997-12-17 00:24:13 +00001222const uschar *ptr = *ptrptr;
1223const uschar *oldptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001224uschar *previous = NULL;
Guido van Rossum50700601997-12-08 17:15:20 +00001225uschar class[32];
1226uschar *class_flag; /* Pointer to the single-byte flag for OP_CLASS_L */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001227
Guido van Rossumdda66961998-05-07 15:32:44 +00001228/* Set up the default and non-default settings for greediness */
1229
1230greedy_default = ((options & PCRE_UNGREEDY) != 0);
1231greedy_non_default = greedy_default ^ 1;
1232
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001233/* Switch on next character until the end of the branch */
1234
1235for (;; ptr++)
1236 {
Guido van Rossum50700601997-12-08 17:15:20 +00001237 BOOL negate_class;
1238 int class_charcount;
1239 int class_lastchar;
1240
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001241 c = *ptr;
Guido van Rossum50700601997-12-08 17:15:20 +00001242 if ((options & PCRE_EXTENDED) != 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001243 {
1244 if ((pcre_ctypes[c] & ctype_space) != 0) continue;
1245 if (c == '#')
1246 {
1247 while ((c = *(++ptr)) != 0 && c != '\n');
1248 continue;
1249 }
1250 }
1251
1252 switch(c)
1253 {
1254 /* The branch terminates at end of string, |, or ). */
1255
1256 case 0:
1257 case '|':
1258 case ')':
1259 *codeptr = code;
1260 *ptrptr = ptr;
1261 return TRUE;
1262
1263 /* Handle single-character metacharacters */
1264
1265 case '^':
1266 previous = NULL;
1267 *code++ = OP_CIRC;
1268 break;
1269
1270 case '$':
1271 previous = NULL;
1272 *code++ = OP_DOLL;
1273 break;
1274
1275 case '.':
1276 previous = code;
1277 *code++ = OP_ANY;
1278 break;
1279
Guido van Rossum50700601997-12-08 17:15:20 +00001280 /* Character classes. These always build a 32-byte bitmap of the permitted
1281 characters, except in the special case where there is only one character.
1282 For negated classes, we build the map as usual, then invert it at the end.
1283 */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001284
1285 case '[':
Guido van Rossum50700601997-12-08 17:15:20 +00001286 previous = code;
1287 if (options & PCRE_LOCALE)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001288 {
Guido van Rossum50700601997-12-08 17:15:20 +00001289 *code++ = OP_CLASS_L;
1290 /* Set the flag for localized classes (like \w) to 0 */
1291 class_flag = code;
1292 *class_flag = 0;
1293 }
1294 else
1295 {
1296 *code++ = OP_CLASS;
1297 class_flag = NULL;
1298 }
1299
Guido van Rossum042ff9e1998-04-03 21:13:31 +00001300 /* If the first character is '^', set the negation flag, and use a
1301 different opcode. This only matters if caseless matching is specified at
1302 runtime. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001303
Guido van Rossum50700601997-12-08 17:15:20 +00001304 if ((c = *(++ptr)) == '^')
1305 {
1306 negate_class = TRUE;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00001307 if (*(code-1)==OP_CLASS) *(code-1) = OP_NEGCLASS;
Guido van Rossum50700601997-12-08 17:15:20 +00001308 c = *(++ptr);
1309 }
1310 else negate_class = FALSE;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001311
Guido van Rossum50700601997-12-08 17:15:20 +00001312 /* Keep a count of chars so that we can optimize the case of just a single
1313 character. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001314
Guido van Rossum50700601997-12-08 17:15:20 +00001315 class_charcount = 0;
1316 class_lastchar = -1;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001317
Guido van Rossum50700601997-12-08 17:15:20 +00001318 /* Initialize the 32-char bit map to all zeros. We have to build the
1319 map in a temporary bit of store, in case the class contains only 1
1320 character, because in that case the compiled code doesn't use the
1321 bit map. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001322
Guido van Rossum50700601997-12-08 17:15:20 +00001323 memset(class, 0, 32 * sizeof(uschar));
1324
1325 /* Process characters until ] is reached. By writing this as a "do" it
1326 means that an initial ] is taken as a data character. */
1327
1328 do
1329 {
1330 if (c == 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001331 {
Guido van Rossum50700601997-12-08 17:15:20 +00001332 *errorptr = ERR6;
1333 goto FAILED;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001334 }
1335
Guido van Rossum50700601997-12-08 17:15:20 +00001336 /* Backslash may introduce a single character, or it may introduce one
1337 of the specials, which just set a flag. Escaped items are checked for
1338 validity in the pre-compiling pass. The sequence \b is a special case.
Guido van Rossum58132c61997-12-17 00:24:13 +00001339 Inside a class (and only there) it is treated as backspace. Elsewhere
Guido van Rossum50700601997-12-08 17:15:20 +00001340 it marks a word boundary. Other escapes have preset maps ready to
1341 or into the one we are building. We assume they have more than one
1342 character in them, so set class_count bigger than one. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001343
Guido van Rossum50700601997-12-08 17:15:20 +00001344 if (c == '\\')
1345 {
1346 c = check_escape(&ptr, errorptr, *brackets, options, TRUE);
1347 if (-c == ESC_b) c = '\b';
1348 else if (c < 0)
1349 {
1350 class_charcount = 10;
1351 switch (-c)
1352 {
1353 case ESC_d:
Guido van Rossum50700601997-12-08 17:15:20 +00001354 {
1355 for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_digit];
1356 }
1357 continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001358
Guido van Rossum50700601997-12-08 17:15:20 +00001359 case ESC_D:
Guido van Rossum50700601997-12-08 17:15:20 +00001360 {
1361 for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_digit];
1362 }
1363 continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001364
Guido van Rossum50700601997-12-08 17:15:20 +00001365 case ESC_w:
1366 if (options & PCRE_LOCALE)
1367 {
1368 *class_flag |= 1;
1369 }
1370 else
1371 {
1372 for (c = 0; c < 32; c++)
1373 class[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
1374 }
1375 continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001376
Guido van Rossum50700601997-12-08 17:15:20 +00001377 case ESC_W:
1378 if (options & PCRE_LOCALE)
1379 {
1380 *class_flag |= 2;
1381 }
1382 else
1383 {
1384 for (c = 0; c < 32; c++)
1385 class[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
1386 }
1387 continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001388
Guido van Rossum50700601997-12-08 17:15:20 +00001389 case ESC_s:
Guido van Rossum50700601997-12-08 17:15:20 +00001390 {
1391 for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_space];
1392 }
1393 continue;
1394
1395 case ESC_S:
Guido van Rossum50700601997-12-08 17:15:20 +00001396 {
1397 for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_space];
1398 }
1399 continue;
1400
1401 default:
1402 *errorptr = ERR7;
1403 goto FAILED;
1404 }
1405 }
1406 /* Fall through if single character */
1407 }
1408
1409 /* A single character may be followed by '-' to form a range. However,
1410 Perl does not permit ']' to be the end of the range. A '-' character
1411 here is treated as a literal. */
1412
1413 if (ptr[1] == '-' && ptr[2] != ']')
1414 {
1415 int d;
1416 ptr += 2;
1417 d = *ptr;
1418
1419 if (d == 0)
1420 {
1421 *errorptr = ERR6;
1422 goto FAILED;
1423 }
1424
1425 /* The second part of a range can be a single-character escape, but
1426 not any of the other escapes. */
1427
1428 if (d == '\\')
1429 {
1430 d = check_escape(&ptr, errorptr, *brackets, options, TRUE);
1431 if (d < 0)
1432 {
1433 if (d == -ESC_b) d = '\b'; else
1434 {
1435 *errorptr = ERR7;
1436 goto FAILED;
1437 }
1438 }
1439 }
1440
1441 if (d < c)
1442 {
1443 *errorptr = ERR8;
1444 goto FAILED;
1445 }
1446
1447 for (; c <= d; c++)
1448 {
1449 class[c/8] |= (1 << (c&7));
1450 if ((options & PCRE_CASELESS) != 0)
1451 {
1452 int uc = pcre_fcc[c]; /* flip case */
1453 class[uc/8] |= (1 << (uc&7));
1454 }
1455 class_charcount++; /* in case a one-char range */
1456 class_lastchar = c;
1457 }
1458 continue; /* Go get the next char in the class */
1459 }
1460
1461 /* Handle a lone single character - we can get here for a normal
1462 non-escape char, or after \ that introduces a single character. */
1463
1464 class [c/8] |= (1 << (c&7));
1465 if ((options & PCRE_CASELESS) != 0)
1466 {
1467 c = pcre_fcc[c]; /* flip case */
1468 class[c/8] |= (1 << (c&7));
1469 }
1470 class_charcount++;
1471 class_lastchar = c;
1472 }
1473
1474 /* Loop until ']' reached; the check for end of string happens inside the
1475 loop. This "while" is the end of the "do" above. */
1476
1477 while ((c = *(++ptr)) != ']');
1478
1479 /* If class_charcount is 1 and class_lastchar is not negative, we saw
1480 precisely one character. This doesn't need the whole 32-byte bit map.
1481 We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
1482 it's negative. */
1483
1484 if (class_charcount == 1 && class_lastchar >= 0)
1485 {
1486 if (negate_class)
1487 {
1488 code[-1] = OP_NOT;
1489 }
1490 else
1491 {
1492 code[-1] = OP_CHARS;
1493 *code++ = 1;
1494 }
1495 *code++ = class_lastchar;
1496 }
1497
1498 /* Otherwise, negate the 32-byte map if necessary, and copy it into
1499 the code vector. */
1500
1501 else
1502 {
1503 /* If this is a localized opcode, bump the code pointer up */
1504 if (class_flag) code++;
1505 if (negate_class)
1506 {
1507 if (class_flag) *class_flag = (*class_flag) ^ 63;
1508 for (c = 0; c < 32; c++) code[c] = ~class[c];
1509 }
1510 else
1511 memcpy(code, class, 32);
1512 code += 32;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001513 }
1514 break;
1515
1516 /* Various kinds of repeat */
1517
1518 case '{':
Guido van Rossum50700601997-12-08 17:15:20 +00001519 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001520 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
1521 if (*errorptr != NULL) goto FAILED;
1522 goto REPEAT;
1523
1524 case '*':
1525 repeat_min = 0;
1526 repeat_max = -1;
1527 goto REPEAT;
1528
1529 case '+':
1530 repeat_min = 1;
1531 repeat_max = -1;
1532 goto REPEAT;
1533
1534 case '?':
1535 repeat_min = 0;
1536 repeat_max = 1;
1537
1538 REPEAT:
1539 if (previous == NULL)
1540 {
Guido van Rossum50700601997-12-08 17:15:20 +00001541 *errorptr = ERR9;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001542 goto FAILED;
1543 }
1544
Guido van Rossumdda66961998-05-07 15:32:44 +00001545 /* If the next character is '?' this is a minimizing repeat, by default,
1546 but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001547 next character. */
1548
Guido van Rossumdda66961998-05-07 15:32:44 +00001549 if (ptr[1] == '?')
1550 { repeat_type = greedy_non_default; ptr++; }
1551 else repeat_type = greedy_default;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001552
Guido van Rossum50700601997-12-08 17:15:20 +00001553 /* If the maximum is zero then the minimum must also be zero; Perl allows
1554 this case, so we do too - by simply omitting the item altogether. */
1555
1556 if (repeat_max == 0) code = previous;
1557
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001558 /* If previous was a string of characters, chop off the last one and use it
1559 as the subject of the repeat. If there was only one character, we can
1560 abolish the previous item altogether. */
1561
Guido van Rossum50700601997-12-08 17:15:20 +00001562 else if (*previous == OP_CHARS)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001563 {
1564 int len = previous[1];
1565 if (len == 1)
1566 {
1567 c = previous[2];
1568 code = previous;
1569 }
1570 else
1571 {
1572 c = previous[len+1];
1573 previous[1]--;
1574 code--;
1575 }
1576 op_type = 0; /* Use single-char op codes */
1577 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
1578 }
1579
Guido van Rossum50700601997-12-08 17:15:20 +00001580 /* If previous was a single negated character ([^a] or similar), we use
1581 one of the special opcodes, replacing it. The code is shared with single-
1582 character repeats by adding a suitable offset into repeat_type. */
1583
1584 else if ((int)*previous == OP_NOT)
1585 {
1586 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
1587 c = previous[1];
1588 code = previous;
1589 goto OUTPUT_SINGLE_REPEAT;
1590 }
1591
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001592 /* If previous was a character type match (\d or similar), abolish it and
1593 create a suitable repeat item. The code is shared with single-character
1594 repeats by adding a suitable offset into repeat_type. */
1595
Guido van Rossum50700601997-12-08 17:15:20 +00001596 else if ((int)*previous < OP_CIRC || *previous == OP_ANY)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001597 {
1598 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
1599 c = *previous;
1600 code = previous;
1601
1602 OUTPUT_SINGLE_REPEAT:
1603 repeat_type += op_type; /* Combine both values for many cases */
1604
1605 /* A minimum of zero is handled either as the special case * or ?, or as
1606 an UPTO, with the maximum given. */
1607
1608 if (repeat_min == 0)
1609 {
1610 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
1611 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
1612 else
1613 {
1614 *code++ = OP_UPTO + repeat_type;
1615 *code++ = repeat_max >> 8;
1616 *code++ = (repeat_max & 255);
1617 }
1618 }
1619
1620 /* The case {1,} is handled as the special case + */
1621
1622 else if (repeat_min == 1 && repeat_max == -1)
1623 *code++ = OP_PLUS + repeat_type;
1624
1625 /* The case {n,n} is just an EXACT, while the general case {n,m} is
1626 handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
1627
1628 else
1629 {
1630 if (repeat_min != 1)
1631 {
1632 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
1633 *code++ = repeat_min >> 8;
1634 *code++ = (repeat_min & 255);
1635 }
1636
1637 /* If the mininum is 1 and the previous item was a character string,
1638 we either have to put back the item that got cancelled if the string
1639 length was 1, or add the character back onto the end of a longer
Guido van Rossumdda66961998-05-07 15:32:44 +00001640 string. For a character type nothing need be done; it will just get
1641 put back naturally. Note that the final character is always going to
1642 get added below. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001643
1644 else if (*previous == OP_CHARS)
1645 {
1646 if (code == previous) code += 2; else previous[1]++;
1647 }
1648
Guido van Rossumdda66961998-05-07 15:32:44 +00001649 /* For a single negated character we also have to put back the
1650 item that got cancelled. */
1651
1652 else if (*previous == OP_NOT) code++;
1653
Guido van Rossum557dea11997-12-22 22:46:52 +00001654 /* If the maximum is unlimited, insert an OP_STAR. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001655
Guido van Rossum557dea11997-12-22 22:46:52 +00001656 if (repeat_max < 0)
1657 {
1658 *code++ = c;
1659 *code++ = OP_STAR + repeat_type;
1660 }
1661
1662 /* Else insert an UPTO if the max is greater than the min. */
1663
1664 else if (repeat_max != repeat_min)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001665 {
1666 *code++ = c;
1667 repeat_max -= repeat_min;
1668 *code++ = OP_UPTO + repeat_type;
1669 *code++ = repeat_max >> 8;
1670 *code++ = (repeat_max & 255);
1671 }
1672 }
1673
1674 /* The character or character type itself comes last in all cases. */
1675
1676 *code++ = c;
1677 }
1678
1679 /* If previous was a character class or a back reference, we put the repeat
1680 stuff after it. */
1681
Guido van Rossum042ff9e1998-04-03 21:13:31 +00001682 else if (*previous == OP_CLASS || *previous == OP_NEGCLASS ||
1683 *previous==OP_CLASS_L || *previous == OP_REF)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001684 {
1685 if (repeat_min == 0 && repeat_max == -1)
1686 *code++ = OP_CRSTAR + repeat_type;
1687 else if (repeat_min == 1 && repeat_max == -1)
1688 *code++ = OP_CRPLUS + repeat_type;
1689 else if (repeat_min == 0 && repeat_max == 1)
1690 *code++ = OP_CRQUERY + repeat_type;
1691 else
1692 {
1693 *code++ = OP_CRRANGE + repeat_type;
1694 *code++ = repeat_min >> 8;
1695 *code++ = repeat_min & 255;
1696 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
1697 *code++ = repeat_max >> 8;
1698 *code++ = repeat_max & 255;
1699 }
1700 }
1701
1702 /* If previous was a bracket group, we may have to replicate it in certain
1703 cases. If the maximum repeat count is unlimited, check that the bracket
1704 group cannot match the empty string, and diagnose an error if it can. */
1705
1706 else if ((int)*previous >= OP_BRA)
1707 {
1708 int i;
Guido van Rossum557dea11997-12-22 22:46:52 +00001709 int len = code - previous;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001710
1711 if (repeat_max == -1 && could_be_empty(previous))
1712 {
Guido van Rossum50700601997-12-08 17:15:20 +00001713 *errorptr = ERR10;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001714 goto FAILED;
1715 }
1716
1717 /* If the minimum is greater than zero, and the maximum is unlimited or
1718 equal to the minimum, the first copy remains where it is, and is
1719 replicated up to the minimum number of times. This case includes the +
1720 repeat, but of course no replication is needed in that case. */
1721
1722 if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))
1723 {
1724 for (i = 1; i < repeat_min; i++)
1725 {
Guido van Rossum557dea11997-12-22 22:46:52 +00001726 memcpy(code, previous, len);
1727 code += len;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001728 }
1729 }
1730
1731 /* If the minimum is zero, stick BRAZERO in front of the first copy.
1732 Then, if there is a fixed upper limit, replicated up to that many times,
1733 sticking BRAZERO in front of all the optional ones. */
1734
1735 else
1736 {
1737 if (repeat_min == 0)
1738 {
Guido van Rossum557dea11997-12-22 22:46:52 +00001739 memmove(previous+1, previous, len);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001740 code++;
1741 *previous++ = OP_BRAZERO + repeat_type;
1742 }
1743
1744 for (i = 1; i < repeat_min; i++)
1745 {
Guido van Rossum557dea11997-12-22 22:46:52 +00001746 memcpy(code, previous, len);
1747 code += len;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001748 }
1749
1750 for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)
1751 {
1752 *code++ = OP_BRAZERO + repeat_type;
Guido van Rossum557dea11997-12-22 22:46:52 +00001753 memcpy(code, previous, len);
1754 code += len;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001755 }
1756 }
1757
1758 /* If the maximum is unlimited, set a repeater in the final copy. */
1759
1760 if (repeat_max == -1) code[-3] = OP_KETRMAX + repeat_type;
1761 }
1762
1763 /* Else there's some kind of shambles */
1764
1765 else
1766 {
Guido van Rossum50700601997-12-08 17:15:20 +00001767 *errorptr = ERR11;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001768 goto FAILED;
1769 }
1770
1771 /* In all case we no longer have a previous item. */
1772
1773 previous = NULL;
1774 break;
1775
1776
1777 /* Start of nested bracket sub-expression, or comment or lookahead.
1778 First deal with special things that can come after a bracket; all are
1779 introduced by ?, and the appearance of any of them means that this is not a
1780 referencing group. They were checked for validity in the first pass over
1781 the string, so we don't have to check for syntax errors here. */
1782
1783 case '(':
1784 previous = code; /* Only real brackets can be repeated */
1785 if (*(++ptr) == '?')
1786 {
1787 bravalue = OP_BRA;
1788
1789 switch (*(++ptr))
1790 {
1791 case '#':
1792 case 'i':
Guido van Rossumbd49ac41997-12-10 23:05:53 +00001793 case 'L':
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001794 case 'm':
1795 case 's':
1796 case 'x':
1797 ptr++;
1798 while (*ptr != ')') ptr++;
1799 previous = NULL;
1800 continue;
1801
1802 case ':': /* Non-extracting bracket */
1803 ptr++;
1804 break;
1805
1806 case '=': /* Assertions can't be repeated */
1807 bravalue = OP_ASSERT;
1808 ptr++;
1809 previous = NULL;
1810 break;
1811
1812 case '!':
1813 bravalue = OP_ASSERT_NOT;
1814 ptr++;
1815 previous = NULL;
1816 break;
1817
1818 case ('P'):
1819 ptr++;
1820 if (*ptr=='<')
1821 {
1822 /* (?P<groupname>...) */
1823 int idlen;
1824 PyObject *string, *intobj;
1825
1826 ptr++;
1827 idlen = get_group_id(ptr, '>', errorptr);
1828 if (*errorptr) {
1829 goto FAILED;
1830 }
Guido van Rossum57ba4f31997-12-02 20:40:28 +00001831 string = PyString_FromStringAndSize((char*)ptr, idlen);
Guido van Rossum50700601997-12-08 17:15:20 +00001832 intobj = PyInt_FromLong( brackets[0] + 1 );
Guido van Rossum58132c61997-12-17 00:24:13 +00001833 if (intobj == NULL || string == NULL)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001834 {
1835 Py_XDECREF(string);
1836 Py_XDECREF(intobj);
1837 *errorptr = "exception raised";
1838 goto FAILED;
1839 }
1840 PyDict_SetItem(dictionary, string, intobj);
Guido van Rossum58132c61997-12-17 00:24:13 +00001841 Py_DECREF(string); Py_DECREF(intobj); /* XXX DECREF commented out! */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001842 ptr += idlen+1; /* Point to rest of expression */
1843 goto do_grouping_bracket;
1844 }
1845 if (*ptr=='=')
1846 {
1847 /* (?P=groupname) */
1848 int idlen, refnum;
1849 PyObject *string, *intobj;
1850
1851 ptr++;
1852 idlen = get_group_id(ptr, ')', errorptr);
1853 if (*errorptr) {
1854 goto FAILED;
1855 }
Guido van Rossum50700601997-12-08 17:15:20 +00001856 string = PyString_FromStringAndSize((char *)ptr, idlen);
Guido van Rossumc3861071997-10-08 02:07:40 +00001857 if (string==NULL) {
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001858 *errorptr = "exception raised";
1859 goto FAILED;
1860 }
1861 intobj = PyDict_GetItem(dictionary, string);
1862 if (intobj==NULL) {
Guido van Rossumc3861071997-10-08 02:07:40 +00001863 Py_DECREF(string);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001864 *errorptr = "?P= group identifier isn't defined";
1865 goto FAILED;
1866 }
1867
1868 refnum = PyInt_AsLong(intobj);
Guido van Rossum1eadb411997-12-15 17:33:24 +00001869 Py_DECREF(string);
Guido van Rossum58132c61997-12-17 00:24:13 +00001870 /* The caller doesn't own the reference to the value
1871 returned from PyDict_GetItem, so intobj is not
1872 DECREF'ed. */
1873
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001874 *code++ = OP_REF;
1875 *code++ = refnum;
1876 /* The continue will cause the top-level for() loop to
1877 be resumed, so ptr will be immediately incremented.
1878 Therefore, the following line adds just idlen, not
1879 idlen+1 */
1880 ptr += idlen;
1881 continue;
1882 }
1883 /* The character after ?P is neither < nor =, so
1884 report an error. Add more Python-extensions here. */
1885 *errorptr="unknown after (?P";
1886 goto FAILED;
Guido van Rossum50700601997-12-08 17:15:20 +00001887
1888 case '>': /* "Match once" brackets */
1889 if ((options & PCRE_EXTRA) != 0) /* Not yet standard */
1890 {
1891 bravalue = OP_ONCE;
1892 ptr++;
1893 previous = NULL;
1894 break;
1895 }
1896 /* Else fall through */
1897
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001898 default:
Guido van Rossum50700601997-12-08 17:15:20 +00001899 *errorptr = ERR12;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001900 goto FAILED;
1901 }
1902 }
1903
1904 /* Else we have a referencing group */
1905
1906 else
1907 {
1908 do_grouping_bracket:
Guido van Rossum50700601997-12-08 17:15:20 +00001909 if (++(*brackets) > EXTRACT_MAX)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001910 {
Guido van Rossum50700601997-12-08 17:15:20 +00001911 *errorptr = ERR13;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001912 goto FAILED;
1913 }
Guido van Rossum50700601997-12-08 17:15:20 +00001914 bravalue = OP_BRA + *brackets;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001915 }
1916
1917 /* Process nested bracketed re; at end pointer is on the bracket. We copy
1918 code into a non-register variable in order to be able to pass its address
1919 because some compilers complain otherwise. */
1920
1921 *code = bravalue;
1922 {
1923 uschar *mcode = code;
Guido van Rossum50700601997-12-08 17:15:20 +00001924 if (!compile_regex(options, brackets, &mcode, &ptr, errorptr, dictionary))
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001925 goto FAILED;
1926 code = mcode;
1927 }
1928
1929 if (*ptr != ')')
1930 {
Guido van Rossum50700601997-12-08 17:15:20 +00001931 *errorptr = ERR14;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001932 goto FAILED;
1933 }
1934 break;
1935
1936 /* Check \ for being a real metacharacter; if not, fall through and handle
1937 it as a data character at the start of a string. Escape items are checked
1938 for validity in the pre-compiling pass. */
1939
1940 case '\\':
1941 oldptr = ptr;
Guido van Rossum50700601997-12-08 17:15:20 +00001942 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001943
1944 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1945 are arranged to be the negation of the corresponding OP_values. For the
1946 back references, the values are ESC_REF plus the reference number. Only
1947 back references and those types that consume a character may be repeated.
1948 We can test for values between ESC_b and ESC_Z for the latter; this may
1949 have to change if any new ones are ever created. */
1950
1951 if (c < 0)
1952 {
1953 if (-c >= ESC_REF)
1954 {
Guido van Rossum50700601997-12-08 17:15:20 +00001955 int refnum = -c - ESC_REF;
1956 if (*brackets < refnum)
1957 {
1958 *errorptr = ERR15;
1959 goto FAILED;
1960 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001961 previous = code;
1962 *code++ = OP_REF;
1963 *code++ = refnum;
1964 }
1965 else
1966 {
Guido van Rossum50700601997-12-08 17:15:20 +00001967 previous = (-c > ESC_b && -c < ESC_X)? code : NULL;
1968 if ( (options & PCRE_LOCALE) != 0)
1969 {
1970 switch (c)
1971 {
1972 case (-ESC_b): c = -OP_WORD_BOUNDARY_L; break;
1973 case (-ESC_B): c = -OP_NOT_WORD_BOUNDARY_L; break;
1974 case (-ESC_w): c = -OP_WORDCHAR_L; break;
1975 case (-ESC_W): c = -OP_NOT_WORDCHAR_L; break;
1976 }
1977 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001978 *code++ = -c;
1979 }
1980 continue;
1981 }
1982
Guido van Rossum58132c61997-12-17 00:24:13 +00001983 /* Data character: Reset and fall through */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001984
1985 ptr = oldptr;
1986 c = '\\';
1987
1988 /* Handle a run of data characters until a metacharacter is encountered.
1989 The first character is guaranteed not to be whitespace or # when the
1990 extended flag is set. */
1991
Guido van Rossum50700601997-12-08 17:15:20 +00001992 NORMAL_CHAR:
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001993 default:
1994 previous = code;
1995 *code = OP_CHARS;
1996 code += 2;
1997 length = 0;
1998
1999 do
2000 {
Guido van Rossum50700601997-12-08 17:15:20 +00002001 if ((options & PCRE_EXTENDED) != 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002002 {
2003 if ((pcre_ctypes[c] & ctype_space) != 0) continue;
2004 if (c == '#')
2005 {
2006 while ((c = *(++ptr)) != 0 && c != '\n');
2007 if (c == 0) break;
2008 continue;
2009 }
2010 }
2011
2012 /* Backslash may introduce a data char or a metacharacter. Escaped items
2013 are checked for validity in the pre-compiling pass. Stop the string
2014 before a metaitem. */
2015
2016 if (c == '\\')
2017 {
2018 oldptr = ptr;
Guido van Rossum50700601997-12-08 17:15:20 +00002019 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002020 if (c < 0) { ptr = oldptr; break; }
2021 }
2022
2023 /* Ordinary character or single-char escape */
2024
2025 *code++ = c;
2026 length++;
2027 }
2028
2029 /* This "while" is the end of the "do" above. */
2030
2031 while (length < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
2032
2033 /* Compute the length and set it in the data vector, and advance to
2034 the next state. */
2035
2036 previous[1] = length;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00002037 if (length < 255) ptr--;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002038 break;
2039 }
2040 } /* end of big loop */
2041
2042/* Control never reaches here by falling through, only by a goto for all the
2043error states. Pass back the position in the pattern so that it can be displayed
2044to the user for diagnosing the error. */
2045
2046FAILED:
2047*ptrptr = ptr;
2048return FALSE;
2049}
2050
2051
2052
2053
2054/*************************************************
2055* Compile sequence of alternatives *
2056*************************************************/
2057
2058/* On entry, ptr is pointing past the bracket character, but on return
2059it points to the closing bracket, or vertical bar, or end of string.
2060The code variable is pointing at the byte into which the BRA operator has been
2061stored.
2062
2063Argument:
Guido van Rossum50700601997-12-08 17:15:20 +00002064 options the option bits
2065 brackets -> int containing the number of extracting brackets used
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002066 codeptr -> the address of the current code pointer
2067 ptrptr -> the address of the current pattern pointer
2068 errorptr -> pointer to error message
2069
2070Returns: TRUE on success
2071*/
2072
2073static BOOL
Guido van Rossum50700601997-12-08 17:15:20 +00002074compile_regex(int options, int *brackets, uschar **codeptr,
Guido van Rossum58132c61997-12-17 00:24:13 +00002075 const uschar **ptrptr, const char **errorptr, PyObject *dictionary)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002076{
Guido van Rossum58132c61997-12-17 00:24:13 +00002077const uschar *ptr = *ptrptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002078uschar *code = *codeptr;
2079uschar *start_bracket = code;
2080
2081for (;;)
2082 {
2083 int length;
2084 uschar *last_branch = code;
2085
2086 code += 3;
Guido van Rossum50700601997-12-08 17:15:20 +00002087 if (!compile_branch(options, brackets, &code, &ptr, errorptr, dictionary))
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002088 {
2089 *ptrptr = ptr;
2090 return FALSE;
2091 }
2092
2093 /* Fill in the length of the last branch */
2094
2095 length = code - last_branch;
2096 last_branch[1] = length >> 8;
2097 last_branch[2] = length & 255;
2098
2099 /* Reached end of expression, either ')' or end of pattern. Insert a
2100 terminating ket and the length of the whole bracketed item, and return,
2101 leaving the pointer at the terminating char. */
2102
2103 if (*ptr != '|')
2104 {
2105 length = code - start_bracket;
2106 *code++ = OP_KET;
2107 *code++ = length >> 8;
2108 *code++ = length & 255;
2109 *codeptr = code;
2110 *ptrptr = ptr;
2111 return TRUE;
2112 }
2113
2114 /* Another branch follows; insert an "or" node and advance the pointer. */
2115
2116 *code = OP_ALT;
2117 ptr++;
2118 }
2119/* Control never reaches here */
2120}
2121
2122
2123
2124/*************************************************
2125* Check for anchored expression *
2126*************************************************/
2127
2128/* Try to find out if this is an anchored regular expression. Consider each
2129alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
2130all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
2131it's anchored. However, if this is a multiline pattern, then only OP_SOD
2132counts, since OP_CIRC can match in the middle.
2133
2134A branch is also implicitly anchored if it starts with .* because that will try
2135the rest of the pattern at all possible matching points, so there is no point
2136trying them again.
2137
2138Argument: points to start of expression (the bracket)
2139Returns: TRUE or FALSE
2140*/
2141
2142static BOOL
Guido van Rossum58132c61997-12-17 00:24:13 +00002143is_anchored(register const uschar *code, BOOL multiline)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002144{
2145do {
2146 int op = (int)code[3];
Guido van Rossum50700601997-12-08 17:15:20 +00002147 if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002148 { if (!is_anchored(code+3, multiline)) return FALSE; }
2149 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
2150 { if (code[4] != OP_ANY) return FALSE; }
2151 else if (op != OP_SOD && (multiline || op != OP_CIRC)) return FALSE;
2152 code += (code[1] << 8) + code[2];
2153 }
2154while (*code == OP_ALT);
2155return TRUE;
2156}
2157
2158
2159
2160/*************************************************
2161* Check for start with \n line expression *
2162*************************************************/
2163
2164/* This is called for multiline expressions to try to find out if every branch
2165starts with ^ so that "first char" processing can be done to speed things up.
2166
2167Argument: points to start of expression (the bracket)
2168Returns: TRUE or FALSE
2169*/
2170
2171static BOOL
Guido van Rossum58132c61997-12-17 00:24:13 +00002172is_startline(const uschar *code)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002173{
2174do {
2175 if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
2176 { if (!is_startline(code+3)) return FALSE; }
2177 else if (code[3] != OP_CIRC) return FALSE;
2178 code += (code[1] << 8) + code[2];
2179 }
2180while (*code == OP_ALT);
2181return TRUE;
2182}
2183
2184
2185
2186/*************************************************
2187* Check for fixed first char *
2188*************************************************/
2189
2190/* Try to find out if there is a fixed first character. This is called for
2191unanchored expressions, as it speeds up their processing quite considerably.
2192Consider each alternative branch. If they all start with the same char, or with
2193a bracket all of whose alternatives start with the same char (recurse ad lib),
2194then we return that char, otherwise -1.
2195
2196Argument: points to start of expression (the bracket)
2197Returns: -1 or the fixed first char
2198*/
2199
2200static int
2201find_firstchar(uschar *code)
2202{
2203register int c = -1;
2204do
2205 {
2206 register int charoffset = 4;
2207
2208 if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
2209 {
2210 register int d;
2211 if ((d = find_firstchar(code+3)) < 0) return -1;
2212 if (c < 0) c = d; else if (c != d) return -1;
2213 }
2214
2215 else switch(code[3])
2216 {
2217 default:
2218 return -1;
2219
2220 case OP_EXACT: /* Fall through */
2221 charoffset++;
2222
2223 case OP_CHARS: /* Fall through */
2224 charoffset++;
2225
2226 case OP_PLUS:
2227 case OP_MINPLUS:
2228 if (c < 0) c = code[charoffset]; else if (c != code[charoffset]) return -1;
2229 break;
2230 }
2231 code += (code[1] << 8) + code[2];
2232 }
2233while (*code == OP_ALT);
2234return c;
2235}
2236
2237
2238
2239/*************************************************
2240* Compile a Regular Expression *
2241*************************************************/
2242
2243/* This function takes a string and returns a pointer to a block of store
2244holding a compiled version of the expression.
2245
2246Arguments:
2247 pattern the regular expression
2248 options various option bits
2249 errorptr pointer to pointer to error text
2250 erroroffset ptr offset in pattern where error was detected
2251
2252Returns: pointer to compiled data block, or NULL on error,
2253 with errorptr and erroroffset set
2254*/
2255
2256pcre *
Guido van Rossum58132c61997-12-17 00:24:13 +00002257pcre_compile(const char *pattern, int options, const char **errorptr,
Guido van Rossum50700601997-12-08 17:15:20 +00002258 int *erroroffset, PyObject *dictionary)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002259{
2260real_pcre *re;
2261int spaces = 0;
2262int length = 3; /* For initial BRA plus length */
2263int runlength;
2264int c, size;
Guido van Rossum50700601997-12-08 17:15:20 +00002265int bracount = 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002266int brastack[200];
Guido van Rossum50700601997-12-08 17:15:20 +00002267int top_backref = 0;
Guido van Rossum58132c61997-12-17 00:24:13 +00002268unsigned int brastackptr = 0;
2269uschar *code;
2270const uschar *ptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002271
2272#ifdef DEBUG
2273uschar *code_base, *code_end;
2274#endif
2275
Guido van Rossum50700601997-12-08 17:15:20 +00002276/* We can't pass back an error message if errorptr is NULL; I guess the best we
2277can do is just return NULL. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002278
Guido van Rossum50700601997-12-08 17:15:20 +00002279if (errorptr == NULL) return NULL;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002280*errorptr = NULL;
Guido van Rossum50700601997-12-08 17:15:20 +00002281
2282/* However, we can give a message for this error */
2283
2284if (erroroffset == NULL)
2285 {
2286 *errorptr = ERR16;
2287 return NULL;
2288 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002289*erroroffset = 0;
2290
2291if ((options & ~PUBLIC_OPTIONS) != 0)
2292 {
Guido van Rossum50700601997-12-08 17:15:20 +00002293 *errorptr = ERR17;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002294 return NULL;
2295 }
2296
Guido van Rossum557dea11997-12-22 22:46:52 +00002297DPRINTF(("------------------------------------------------------------------\n"));
2298DPRINTF(("%s\n", pattern));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002299
2300/* The first thing to do is to make a pass over the pattern to compute the
2301amount of store required to hold the compiled code. This does not have to be
2302perfect as long as errors are overestimates. At the same time we can detect any
2303internal flag settings. Make an attempt to correct for any counted white space
2304if an "extended" flag setting appears late in the pattern. We can't be so
2305clever for #-comments. */
2306
Guido van Rossum58132c61997-12-17 00:24:13 +00002307ptr = (const uschar *)(pattern - 1);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002308while ((c = *(++ptr)) != 0)
2309 {
Guido van Rossum50700601997-12-08 17:15:20 +00002310 int min, max;
2311 int class_charcount;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002312
2313 if ((pcre_ctypes[c] & ctype_space) != 0)
2314 {
Guido van Rossum50700601997-12-08 17:15:20 +00002315 if ((options & PCRE_EXTENDED) != 0) continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002316 spaces++;
2317 }
2318
Guido van Rossum50700601997-12-08 17:15:20 +00002319 if (c == '#' && (options & PCRE_EXTENDED) != 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002320 {
2321 while ((c = *(++ptr)) != 0 && c != '\n');
2322 continue;
2323 }
2324
2325 switch(c)
2326 {
2327 /* A backslashed item may be an escaped "normal" character or a
2328 character type. For a "normal" character, put the pointers and
2329 character back so that tests for whitespace etc. in the input
2330 are done correctly. */
2331
2332 case '\\':
2333 {
Guido van Rossum58132c61997-12-17 00:24:13 +00002334 const uschar *save_ptr = ptr;
Guido van Rossum50700601997-12-08 17:15:20 +00002335 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002336 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2337 if (c >= 0)
2338 {
2339 ptr = save_ptr;
2340 c = '\\';
2341 goto NORMAL_CHAR;
2342 }
2343 }
2344 length++;
2345
2346 /* A back reference needs an additional char, plus either one or 5
Guido van Rossum50700601997-12-08 17:15:20 +00002347 bytes for a repeat. We also need to keep the value of the highest
2348 back reference. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002349
2350 if (c <= -ESC_REF)
2351 {
Guido van Rossum50700601997-12-08 17:15:20 +00002352 int refnum = -c - ESC_REF;
2353 if (refnum > top_backref) top_backref = refnum;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002354 length++; /* For single back reference */
Guido van Rossum50700601997-12-08 17:15:20 +00002355 if (ptr[1] == '{' && is_counted_repeat(ptr+2))
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002356 {
2357 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
2358 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2359 if ((min == 0 && (max == 1 || max == -1)) ||
2360 (min == 1 && max == -1))
2361 length++;
2362 else length += 5;
2363 if (ptr[1] == '?') ptr++;
2364 }
2365 }
2366 continue;
2367
2368 case '^':
2369 case '.':
2370 case '$':
2371 case '*': /* These repeats won't be after brackets; */
2372 case '+': /* those are handled separately */
2373 case '?':
2374 length++;
2375 continue;
2376
2377 /* This covers the cases of repeats after a single char, metachar, class,
2378 or back reference. */
2379
2380 case '{':
Guido van Rossum50700601997-12-08 17:15:20 +00002381 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002382 ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);
2383 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2384 if ((min == 0 && (max == 1 || max == -1)) ||
2385 (min == 1 && max == -1))
2386 length++;
2387 else
2388 {
2389 length--; /* Uncount the original char or metachar */
2390 if (min == 1) length++; else if (min > 0) length += 4;
2391 if (max > 0) length += 4; else length += 2;
2392 }
2393 if (ptr[1] == '?') ptr++;
2394 continue;
2395
2396 /* An alternation contains an offset to the next branch or ket. */
2397 case '|':
2398 length += 3;
2399 continue;
2400
Guido van Rossum50700601997-12-08 17:15:20 +00002401 /* A character class uses 33 characters. Don't worry about character types
2402 that aren't allowed in classes - they'll get picked up during the compile.
2403 A character class that contains only one character uses 2 or 3 bytes,
2404 depending on whether it is negated or not. Notice this where we can. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002405
2406 case '[':
Guido van Rossum50700601997-12-08 17:15:20 +00002407 class_charcount = 0;
2408 if (*(++ptr) == '^') ptr++;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002409 do
2410 {
Guido van Rossum50700601997-12-08 17:15:20 +00002411 if (*ptr == '\\')
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002412 {
Guido van Rossum557dea11997-12-22 22:46:52 +00002413 int ch = check_escape(&ptr, errorptr, bracount, options, TRUE);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002414 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
Guido van Rossum557dea11997-12-22 22:46:52 +00002415 if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002416 }
Guido van Rossum50700601997-12-08 17:15:20 +00002417 else class_charcount++;
2418 ptr++;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002419 }
2420 while (*ptr != 0 && *ptr != ']');
2421
Guido van Rossum50700601997-12-08 17:15:20 +00002422 /* Repeats for negated single chars are handled by the general code */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002423
Guido van Rossum50700601997-12-08 17:15:20 +00002424 if (class_charcount == 1) length += 3; else
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002425 {
Guido van Rossum50700601997-12-08 17:15:20 +00002426 length += 33;
2427 if (options & PCRE_LOCALE) length++; /* Add a byte for the localization flag */
2428
2429 /* A repeat needs either 1 or 5 bytes. */
2430
Guido van Rossum557dea11997-12-22 22:46:52 +00002431 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))
Guido van Rossum50700601997-12-08 17:15:20 +00002432 {
2433 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
2434 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2435 if ((min == 0 && (max == 1 || max == -1)) ||
2436 (min == 1 && max == -1))
2437 length++;
2438 else length += 5;
2439 if (ptr[1] == '?') ptr++;
2440 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002441 }
2442 continue;
2443
2444 /* Brackets may be genuine groups or special things */
2445
2446 case '(':
2447
2448 /* Handle special forms of bracket, which all start (? */
2449
2450 if (ptr[1] == '?') switch (c = ptr[2])
2451 {
2452 /* Skip over comments entirely */
2453 case '#':
2454 ptr += 3;
2455 while (*ptr != 0 && *ptr != ')') ptr++;
2456 if (*ptr == 0)
2457 {
Guido van Rossum50700601997-12-08 17:15:20 +00002458 *errorptr = ERR18;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002459 goto PCRE_ERROR_RETURN;
2460 }
2461 continue;
2462
2463 /* Non-referencing groups and lookaheads just move the pointer on, and
Guido van Rossum50700601997-12-08 17:15:20 +00002464 then behave like a non-special bracket, except that they don't increment
2465 the count of extracting brackets. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002466
2467 case ':':
2468 case '=':
2469 case '!':
2470 ptr += 2;
2471 break;
2472
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002473 case ('P'):
2474 {
2475 int idlen;
2476 switch (*ptr++) {
2477 case ('<'):
2478 idlen = get_group_id(ptr++, '>', errorptr);
2479 if (*errorptr) goto PCRE_ERROR_RETURN;
2480 ptr += idlen+1;
2481 break;
2482 case ('='):
2483 idlen = get_group_id(ptr++, ')', errorptr);
2484 if (*errorptr) goto PCRE_ERROR_RETURN;
2485 ptr += idlen+1;
2486 length++;
2487 break;
2488 }
2489 }
2490 break;
2491
Guido van Rossum50700601997-12-08 17:15:20 +00002492 /* Ditto for the "once only" bracket, allowed only if the extra bit
2493 is set. */
2494
2495 case '>':
2496 if ((options & PCRE_EXTRA) != 0)
2497 {
2498 ptr += 2;
2499 break;
2500 }
Guido van Rossumdda66961998-05-07 15:32:44 +00002501 /* Else fall through */
Guido van Rossum50700601997-12-08 17:15:20 +00002502
2503 /* Else loop setting valid options until ) is met. Anything else is an
2504 error. */
2505
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002506 default:
2507 ptr += 2;
2508 for (;; ptr++)
2509 {
2510 if ((c = *ptr) == 'i')
2511 {
2512 options |= PCRE_CASELESS;
2513 continue;
2514 }
Guido van Rossumbd49ac41997-12-10 23:05:53 +00002515 else if ((c = *ptr) == 'L')
Guido van Rossum50700601997-12-08 17:15:20 +00002516 {
2517 options |= PCRE_LOCALE;
2518 continue;
2519 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002520 else if ((c = *ptr) == 'm')
2521 {
2522 options |= PCRE_MULTILINE;
2523 continue;
2524 }
Guido van Rossum50700601997-12-08 17:15:20 +00002525 else if (c == 's')
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002526 {
2527 options |= PCRE_DOTALL;
2528 continue;
2529 }
2530 else if (c == 'x')
2531 {
2532 options |= PCRE_EXTENDED;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002533 length -= spaces; /* Already counted spaces */
2534 continue;
2535 }
2536 else if (c == ')') break;
2537
Guido van Rossum50700601997-12-08 17:15:20 +00002538 *errorptr = ERR12;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002539 goto PCRE_ERROR_RETURN;
2540 }
2541 continue; /* End of this bracket handling */
2542 }
2543
Guido van Rossum50700601997-12-08 17:15:20 +00002544 /* Extracting brackets must be counted so we can process escapes in a
2545 Perlish way. */
2546
2547 else bracount++;
2548
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002549 /* Non-special forms of bracket. Save length for computing whole length
2550 at end if there's a repeat that requires duplication of the group. */
2551
2552 if (brastackptr >= sizeof(brastack)/sizeof(int))
2553 {
Guido van Rossum50700601997-12-08 17:15:20 +00002554 *errorptr = ERR19;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002555 goto PCRE_ERROR_RETURN;
2556 }
2557
2558 brastack[brastackptr++] = length;
2559 length += 3;
2560 continue;
2561
2562 /* Handle ket. Look for subsequent max/min; for certain sets of values we
Guido van Rossum557dea11997-12-22 22:46:52 +00002563 have to replicate this bracket up to that many times. If brastackptr is
2564 0 this is an unmatched bracket which will generate an error, but take care
2565 not to try to access brastack[-1]. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002566
2567 case ')':
2568 length += 3;
2569 {
Guido van Rossum557dea11997-12-22 22:46:52 +00002570 int minval = 1;
2571 int maxval = 1;
2572 int duplength = (brastackptr > 0)? length - brastack[--brastackptr] : 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002573
2574 /* Leave ptr at the final char; for read_repeat_counts this happens
2575 automatically; for the others we need an increment. */
2576
Guido van Rossum50700601997-12-08 17:15:20 +00002577 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002578 {
Guido van Rossum557dea11997-12-22 22:46:52 +00002579 ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002580 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2581 }
Guido van Rossum557dea11997-12-22 22:46:52 +00002582 else if (c == '*') { minval = 0; maxval = -1; ptr++; }
2583 else if (c == '+') { maxval = -1; ptr++; }
2584 else if (c == '?') { minval = 0; ptr++; }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002585
Guido van Rossum557dea11997-12-22 22:46:52 +00002586 /* If there is a minimum > 1 we have to replicate up to minval-1 times;
2587 if there is a limited maximum we have to replicate up to maxval-1 times
2588 and allow for a BRAZERO item before each optional copy, as we also have
2589 to do before the first copy if the minimum is zero. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002590
Guido van Rossum557dea11997-12-22 22:46:52 +00002591 if (minval == 0) length++;
2592 else if (minval > 1) length += (minval - 1) * duplength;
2593 if (maxval > minval) length += (maxval - minval) * (duplength + 1);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002594 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002595 continue;
2596
2597 /* Non-special character. For a run of such characters the length required
2598 is the number of characters + 2, except that the maximum run length is 255.
2599 We won't get a skipped space or a non-data escape or the start of a #
2600 comment as the first character, so the length can't be zero. */
2601
2602 NORMAL_CHAR:
2603 default:
2604 length += 2;
2605 runlength = 0;
2606 do
2607 {
2608 if ((pcre_ctypes[c] & ctype_space) != 0)
2609 {
Guido van Rossum50700601997-12-08 17:15:20 +00002610 if ((options & PCRE_EXTENDED) != 0) continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002611 spaces++;
2612 }
2613
Guido van Rossum50700601997-12-08 17:15:20 +00002614 if (c == '#' && (options & PCRE_EXTENDED) != 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002615 {
2616 while ((c = *(++ptr)) != 0 && c != '\n');
2617 continue;
2618 }
2619
2620 /* Backslash may introduce a data char or a metacharacter; stop the
2621 string before the latter. */
2622
2623 if (c == '\\')
2624 {
Guido van Rossum58132c61997-12-17 00:24:13 +00002625 const uschar *saveptr = ptr;
Guido van Rossum50700601997-12-08 17:15:20 +00002626 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002627 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2628 if (c < 0) { ptr = saveptr; break; }
2629 }
2630
2631 /* Ordinary character or single-char escape */
2632
2633 runlength++;
2634 }
2635
2636 /* This "while" is the end of the "do" above. */
2637
2638 while (runlength < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
2639
2640 ptr--;
2641 length += runlength;
2642 continue;
2643 }
2644 }
2645
2646length += 4; /* For final KET and END */
2647
2648if (length > 65539)
2649 {
Guido van Rossum50700601997-12-08 17:15:20 +00002650 *errorptr = ERR20;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002651 return NULL;
2652 }
2653
2654/* Compute the size of data block needed and get it, either from malloc or
Guido van Rossum557dea11997-12-22 22:46:52 +00002655externally provided function. We specify "code[0]" in the offsetof() expression
2656rather than just "code", because it has been reported that one broken compiler
2657fails on "code" because it is also an independent variable. It should make no
2658difference to the value of the offsetof(). */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002659
Guido van Rossum557dea11997-12-22 22:46:52 +00002660size = length + offsetof(real_pcre, code[0]);
Guido van Rossum50700601997-12-08 17:15:20 +00002661re = (real_pcre *)(pcre_malloc)(size+50);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002662
2663if (re == NULL)
2664 {
Guido van Rossum50700601997-12-08 17:15:20 +00002665 *errorptr = ERR21;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002666 return NULL;
2667 }
2668
Guido van Rossum557dea11997-12-22 22:46:52 +00002669/* Put in the magic number and the options. */
2670
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002671re->magic_number = MAGIC_NUMBER;
2672re->options = options;
2673
2674/* Set up a starting, non-extracting bracket, then compile the expression. On
2675error, *errorptr will be set non-NULL, so we don't need to look at the result
2676of the function here. */
2677
Guido van Rossum58132c61997-12-17 00:24:13 +00002678ptr = (const uschar *)pattern;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002679code = re->code;
2680*code = OP_BRA;
Guido van Rossum50700601997-12-08 17:15:20 +00002681bracount = 0;
2682(void)compile_regex(options, &bracount, &code, &ptr, errorptr, dictionary);
2683re->top_bracket = bracount;
2684re->top_backref = top_backref;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002685
2686/* If not reached end of pattern on success, there's an excess bracket. */
2687
Guido van Rossum50700601997-12-08 17:15:20 +00002688if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
2689
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002690/* Fill in the terminating state and check for disastrous overflow, but
2691if debugging, leave the test till after things are printed out. */
2692
2693*code++ = OP_END;
2694
Guido van Rossum50700601997-12-08 17:15:20 +00002695
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002696#ifndef DEBUG
Guido van Rossum50700601997-12-08 17:15:20 +00002697if (code - re->code > length) *errorptr = ERR23;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002698#endif
2699
2700/* Failed to compile */
2701
2702if (*errorptr != NULL)
2703 {
2704 (pcre_free)(re);
2705 PCRE_ERROR_RETURN:
Guido van Rossum58132c61997-12-17 00:24:13 +00002706 *erroroffset = ptr - (const uschar *)pattern;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002707 return NULL;
2708 }
2709
2710/* If the anchored option was not passed, set flag if we can determine that it
2711is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if
2712we can determine what the first character has to be, because that speeds up
2713unanchored matches no end. In the case of multiline matches, an alternative is
2714to set the PCRE_STARTLINE flag if all branches start with ^. */
2715
2716if ((options & PCRE_ANCHORED) == 0)
2717 {
2718 if (is_anchored(re->code, (options & PCRE_MULTILINE) != 0))
2719 re->options |= PCRE_ANCHORED;
2720 else
2721 {
Guido van Rossum557dea11997-12-22 22:46:52 +00002722 int ch = find_firstchar(re->code);
2723 if (ch >= 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002724 {
Guido van Rossum557dea11997-12-22 22:46:52 +00002725 re->first_char = ch;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002726 re->options |= PCRE_FIRSTSET;
2727 }
2728 else if (is_startline(re->code))
2729 re->options |= PCRE_STARTLINE;
2730 }
2731 }
2732
2733/* Print out the compiled data for debugging */
2734
2735#ifdef DEBUG
2736
Guido van Rossum50700601997-12-08 17:15:20 +00002737printf("Length = %d top_bracket = %d top_backref=%d\n",
2738 length, re->top_bracket, re->top_backref);
2739
2740if (re->options != 0)
2741 {
Guido van Rossumdda66961998-05-07 15:32:44 +00002742 printf("%s%s%s%s%s%s%s%s\n",
Guido van Rossum50700601997-12-08 17:15:20 +00002743 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
2744 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
2745 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
2746 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
2747 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
2748 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
Guido van Rossumdda66961998-05-07 15:32:44 +00002749 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
2750 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
Guido van Rossum50700601997-12-08 17:15:20 +00002751 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002752
2753if ((re->options & PCRE_FIRSTSET) != 0)
2754 {
2755 if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
2756 else printf("First char = \\x%02x\n", re->first_char);
2757 }
2758
2759code_end = code;
2760code_base = code = re->code;
2761
2762while (code < code_end)
2763 {
2764 int charlength;
2765
2766 printf("%3d ", code - code_base);
2767
2768 if (*code >= OP_BRA)
2769 {
2770 printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
2771 code += 2;
2772 }
2773
2774 else switch(*code)
2775 {
2776 case OP_CHARS:
2777 charlength = *(++code);
2778 printf("%3d ", charlength);
2779 while (charlength-- > 0)
2780 if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
2781 break;
2782
2783 case OP_KETRMAX:
2784 case OP_KETRMIN:
2785 case OP_ALT:
2786 case OP_KET:
2787 case OP_ASSERT:
2788 case OP_ASSERT_NOT:
Guido van Rossum50700601997-12-08 17:15:20 +00002789 case OP_ONCE:
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002790 printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2791 code += 2;
2792 break;
2793
2794 case OP_STAR:
2795 case OP_MINSTAR:
2796 case OP_PLUS:
2797 case OP_MINPLUS:
2798 case OP_QUERY:
2799 case OP_MINQUERY:
2800 case OP_TYPESTAR:
2801 case OP_TYPEMINSTAR:
2802 case OP_TYPEPLUS:
2803 case OP_TYPEMINPLUS:
2804 case OP_TYPEQUERY:
2805 case OP_TYPEMINQUERY:
2806 if (*code >= OP_TYPESTAR)
2807 printf(" %s", OP_names[code[1]]);
2808 else if (isprint(c = code[1])) printf(" %c", c);
2809 else printf(" \\x%02x", c);
2810 printf("%s", OP_names[*code++]);
2811 break;
2812
2813 case OP_EXACT:
2814 case OP_UPTO:
2815 case OP_MINUPTO:
2816 if (isprint(c = code[3])) printf(" %c{", c);
2817 else printf(" \\x%02x{", c);
Guido van Rossum557dea11997-12-22 22:46:52 +00002818 if (*code != OP_EXACT) printf("0,");
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002819 printf("%d}", (code[1] << 8) + code[2]);
2820 if (*code == OP_MINUPTO) printf("?");
2821 code += 3;
2822 break;
2823
2824 case OP_TYPEEXACT:
2825 case OP_TYPEUPTO:
2826 case OP_TYPEMINUPTO:
2827 printf(" %s{", OP_names[code[3]]);
2828 if (*code != OP_TYPEEXACT) printf(",");
2829 printf("%d}", (code[1] << 8) + code[2]);
2830 if (*code == OP_TYPEMINUPTO) printf("?");
2831 code += 3;
2832 break;
2833
Guido van Rossum50700601997-12-08 17:15:20 +00002834 case OP_NOT:
2835 if (isprint(c = *(++code))) printf(" [^%c]", c);
2836 else printf(" [^\\x%02x]", c);
2837 break;
2838
2839 case OP_NOTSTAR:
2840 case OP_NOTMINSTAR:
2841 case OP_NOTPLUS:
2842 case OP_NOTMINPLUS:
2843 case OP_NOTQUERY:
2844 case OP_NOTMINQUERY:
2845 if (isprint(c = code[1])) printf(" [^%c]", c);
2846 else printf(" [^\\x%02x]", c);
2847 printf("%s", OP_names[*code++]);
2848 break;
2849
2850 case OP_NOTEXACT:
2851 case OP_NOTUPTO:
2852 case OP_NOTMINUPTO:
2853 if (isprint(c = code[3])) printf(" [^%c]{", c);
2854 else printf(" [^\\x%02x]{", c);
2855 if (*code != OP_NOTEXACT) printf(",");
2856 printf("%d}", (code[1] << 8) + code[2]);
2857 if (*code == OP_NOTMINUPTO) printf("?");
2858 code += 3;
2859 break;
2860
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002861 case OP_REF:
2862 printf(" \\%d", *(++code));
Guido van Rossum557dea11997-12-22 22:46:52 +00002863 code ++;
2864 goto CLASS_REF_REPEAT;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002865
2866 case OP_CLASS:
Guido van Rossum042ff9e1998-04-03 21:13:31 +00002867 case OP_NEGCLASS:
Guido van Rossum50700601997-12-08 17:15:20 +00002868 case OP_CLASS_L:
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002869 {
2870 int i, min, max;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002871
Guido van Rossum50700601997-12-08 17:15:20 +00002872 if (*code==OP_CLASS_L)
2873 {
2874 code++;
2875 printf("Locflag = %i ", *code++);
Guido van Rossum042ff9e1998-04-03 21:13:31 +00002876 printf(" [");
Guido van Rossum50700601997-12-08 17:15:20 +00002877 }
2878 else
Guido van Rossum042ff9e1998-04-03 21:13:31 +00002879 {
2880 if (*code++ == OP_CLASS) printf(" [");
2881 else printf(" ^[");
2882 }
2883
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002884
Guido van Rossum50700601997-12-08 17:15:20 +00002885 for (i = 0; i < 256; i++)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002886 {
Guido van Rossum50700601997-12-08 17:15:20 +00002887 if ((code[i/8] & (1 << (i&7))) != 0)
2888 {
2889 int j;
2890 for (j = i+1; j < 256; j++)
2891 if ((code[j/8] & (1 << (j&7))) == 0) break;
2892 if (i == '-' || i == ']') printf("\\");
2893 if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
2894 if (--j > i)
2895 {
2896 printf("-");
2897 if (j == '-' || j == ']') printf("\\");
2898 if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
2899 }
2900 i = j;
2901 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002902 }
2903 printf("]");
Guido van Rossum50700601997-12-08 17:15:20 +00002904 code += 32;
2905 /* code ++;*/
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002906
Guido van Rossum557dea11997-12-22 22:46:52 +00002907 CLASS_REF_REPEAT:
2908
Guido van Rossum50700601997-12-08 17:15:20 +00002909 switch(*code)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002910 {
2911 case OP_CRSTAR:
2912 case OP_CRMINSTAR:
2913 case OP_CRPLUS:
2914 case OP_CRMINPLUS:
2915 case OP_CRQUERY:
2916 case OP_CRMINQUERY:
2917 printf("%s", OP_names[*code]);
2918 break;
2919
2920 case OP_CRRANGE:
2921 case OP_CRMINRANGE:
2922 min = (code[1] << 8) + code[2];
2923 max = (code[3] << 8) + code[4];
2924 if (max == 0) printf("{%d,}", min);
2925 else printf("{%d,%d}", min, max);
2926 if (*code == OP_CRMINRANGE) printf("?");
2927 code += 4;
2928 break;
2929
2930 default:
2931 code--;
2932 }
2933 }
2934 break;
2935
2936 /* Anything else is just a one-node item */
2937
2938 default:
2939 printf(" %s", OP_names[*code]);
2940 break;
2941 }
2942
2943 code++;
2944 printf("\n");
2945 }
2946printf("------------------------------------------------------------------\n");
2947
2948/* This check is done here in the debugging case so that the code that
2949was compiled can be seen. */
2950
2951if (code - re->code > length)
2952 {
Guido van Rossum50700601997-12-08 17:15:20 +00002953 printf("length=%i, code length=%i\n", length, code-re->code);
2954 *errorptr = ERR23;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002955 (pcre_free)(re);
2956 *erroroffset = ptr - (uschar *)pattern;
2957 return NULL;
2958 }
2959#endif
2960
2961return (pcre *)re;
2962}
2963
2964
2965
2966/*************************************************
2967* Match a character type *
2968*************************************************/
2969
2970/* Not used in all the places it might be as it's sometimes faster
2971to put the code inline.
2972
2973Arguments:
2974 type the character type
2975 c the character
Guido van Rossum50700601997-12-08 17:15:20 +00002976 dotall the dotall flag
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002977
2978Returns: TRUE if character is of the type
2979*/
2980
2981static BOOL
2982match_type(int type, int c, BOOL dotall)
2983{
2984
2985#ifdef DEBUG
2986if (isprint(c)) printf("matching subject %c against ", c);
2987 else printf("matching subject \\x%02x against ", c);
2988printf("%s\n", OP_names[type]);
2989#endif
2990
2991switch(type)
2992 {
2993 case OP_ANY: return dotall || c != '\n';
2994 case OP_NOT_DIGIT: return (pcre_ctypes[c] & ctype_digit) == 0;
2995 case OP_DIGIT: return (pcre_ctypes[c] & ctype_digit) != 0;
2996 case OP_NOT_WHITESPACE: return (pcre_ctypes[c] & ctype_space) == 0;
2997 case OP_WHITESPACE: return (pcre_ctypes[c] & ctype_space) != 0;
2998 case OP_NOT_WORDCHAR: return (pcre_ctypes[c] & ctype_word) == 0;
2999 case OP_WORDCHAR: return (pcre_ctypes[c] & ctype_word) != 0;
Guido van Rossum58132c61997-12-17 00:24:13 +00003000 case OP_NOT_WORDCHAR_L: return (c!='_' && !isalnum(c));
3001 case OP_WORDCHAR_L: return (c=='_' || isalnum(c));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003002 }
3003return FALSE;
3004}
3005
3006
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003007
3008/*************************************************
3009* Match a back-reference *
3010*************************************************/
3011
3012/* If a back reference hasn't been set, the match fails.
3013
3014Arguments:
3015 number reference number
3016 eptr points into the subject
3017 length length to be matched
3018 md points to match data block
3019
3020Returns: TRUE if matched
3021*/
3022
3023static BOOL
Guido van Rossum58132c61997-12-17 00:24:13 +00003024match_ref(int number, register const uschar *eptr, int length, match_data *md)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003025{
Guido van Rossum58132c61997-12-17 00:24:13 +00003026const uschar *p = md->start_subject + md->offset_vector[number];
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003027
3028#ifdef DEBUG
3029if (eptr >= md->end_subject)
3030 printf("matching subject <null>");
3031else
3032 {
3033 printf("matching subject ");
3034 pchars(eptr, length, TRUE, md);
3035 }
3036printf(" against backref ");
3037pchars(p, length, FALSE, md);
3038printf("\n");
3039#endif
3040
3041/* Always fail if not enough characters left */
3042
3043if (length > md->end_subject - p) return FALSE;
3044
Guido van Rossum58132c61997-12-17 00:24:13 +00003045/* Separate the caseless case for speed */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003046
3047if (md->caseless)
3048 { while (length-- > 0) if (pcre_lcc[*p++] != pcre_lcc[*eptr++]) return FALSE; }
3049else
3050 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
3051
3052return TRUE;
3053}
3054
3055static int free_stack(match_data *md)
3056{
3057/* Free any stack space that was allocated by the call to match(). */
3058if (md->off_num) free(md->off_num);
3059if (md->offset_top) free(md->offset_top);
3060if (md->r1) free(md->r1);
3061if (md->r2) free(md->r2);
Guido van Rossum39b0f891998-04-10 21:52:06 +00003062if (md->eptr) free((char *)md->eptr);
3063if (md->ecode) free((char *)md->ecode);
Guido van Rossumc3861071997-10-08 02:07:40 +00003064return 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003065}
3066
3067static int grow_stack(match_data *md)
3068{
Guido van Rossum50700601997-12-08 17:15:20 +00003069 if (md->length != 0)
3070 {
3071 md->length = md->length + md->length/2;
3072 }
3073 else
3074 {
3075 int string_len = md->end_subject - md->start_subject + 1;
3076 if (string_len < 80) {md->length = string_len; }
3077 else {md->length = 80;}
3078 }
3079 PyMem_RESIZE(md->offset_top, int, md->length);
Guido van Rossum58132c61997-12-17 00:24:13 +00003080 PyMem_RESIZE(md->eptr, const uschar *, md->length);
3081 PyMem_RESIZE(md->ecode, const uschar *, md->length);
Guido van Rossum50700601997-12-08 17:15:20 +00003082 PyMem_RESIZE(md->off_num, int, md->length);
3083 PyMem_RESIZE(md->r1, int, md->length);
3084 PyMem_RESIZE(md->r2, int, md->length);
3085 if (md->offset_top == NULL || md->eptr == NULL || md->ecode == NULL ||
3086 md->off_num == NULL || md->r1 == NULL || md->r2 == NULL)
3087 {
Guido van Rossumdda66961998-05-07 15:32:44 +00003088 PyErr_NoMemory();
Guido van Rossum50700601997-12-08 17:15:20 +00003089 longjmp(md->error_env, 1);
3090 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003091 return 0;
3092}
3093
Guido van Rossum50700601997-12-08 17:15:20 +00003094
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003095/*************************************************
3096* Match from current position *
3097*************************************************/
3098
3099/* On entry ecode points to the first opcode, and eptr to the first character.
3100
3101Arguments:
3102 eptr pointer in subject
3103 ecode position in code
3104 offset_top current top pointer
3105 md pointer to "static" info for the match
3106
3107Returns: TRUE if matched
3108*/
3109
3110static BOOL
Guido van Rossum58132c61997-12-17 00:24:13 +00003111match(register const uschar *eptr, register const uschar *ecode, int offset_top,
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003112 match_data *md)
3113{
3114 int save_stack_position = md->point;
3115match_loop:
3116
3117#define SUCCEED goto succeed
3118#define FAIL goto fail
3119
3120for (;;)
3121 {
3122 int min, max, ctype;
3123 register int i;
3124 register int c;
Guido van Rossum58132c61997-12-17 00:24:13 +00003125 BOOL minimize = FALSE;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003126
3127 /* Opening bracket. Check the alternative branches in turn, failing if none
3128 match. We have to set the start offset if required and there is space
3129 in the offset vector so that it is available for subsequent back references
3130 if the bracket matches. However, if the bracket fails, we must put back the
3131 previous value of both offsets in case they were set by a previous copy of
3132 the same bracket. Don't worry about setting the flag for the error case here;
3133 that is handled in the code for KET. */
3134
3135 if ((int)*ecode >= OP_BRA)
3136 {
3137 int number = (*ecode - OP_BRA) << 1;
Guido van Rossum58132c61997-12-17 00:24:13 +00003138 int save_offset1 = 0, save_offset2 = 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003139
Guido van Rossum557dea11997-12-22 22:46:52 +00003140 DPRINTF(("start bracket %d\n", number/2));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003141
3142 if (number > 0 && number < md->offset_end)
3143 {
3144 save_offset1 = md->offset_vector[number];
3145 save_offset2 = md->offset_vector[number+1];
3146 md->offset_vector[number] = eptr - md->start_subject;
3147
Guido van Rossum557dea11997-12-22 22:46:52 +00003148 DPRINTF(("saving %d %d\n", save_offset1, save_offset2));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003149 }
3150
3151 /* Recurse for all the alternatives. */
3152
3153 do
3154 {
3155 if (match(eptr, ecode+3, offset_top, md)) SUCCEED;
3156 ecode += (ecode[1] << 8) + ecode[2];
3157 }
3158 while (*ecode == OP_ALT);
3159
Guido van Rossum557dea11997-12-22 22:46:52 +00003160 DPRINTF(("bracket %d failed\n", number/2));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003161
3162 if (number > 0 && number < md->offset_end)
3163 {
3164 md->offset_vector[number] = save_offset1;
3165 md->offset_vector[number+1] = save_offset2;
3166 }
3167
3168 FAIL;
3169 }
3170
3171 /* Other types of node can be handled by a switch */
3172
3173 switch(*ecode)
3174 {
3175 case OP_END:
3176 md->end_match_ptr = eptr; /* Record where we ended */
3177 md->end_offset_top = offset_top; /* and how many extracts were taken */
3178 SUCCEED;
3179
Guido van Rossum50700601997-12-08 17:15:20 +00003180 /* The equivalent of Prolog's "cut" - if the rest doesn't match, the
3181 whole thing doesn't match, so we have to get out via a longjmp(). */
3182
3183 case OP_CUT:
3184 if (match(eptr, ecode+1, offset_top, md)) SUCCEED;
3185 longjmp(md->fail_env, 1);
3186
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003187 /* Assertion brackets. Check the alternative branches in turn - the
3188 matching won't pass the KET for an assertion. If any one branch matches,
3189 the assertion is true. */
3190
3191 case OP_ASSERT:
3192 do
3193 {
3194 if (match(eptr, ecode+3, offset_top, md)) break;
3195 ecode += (ecode[1] << 8) + ecode[2];
3196 }
3197 while (*ecode == OP_ALT);
3198 if (*ecode == OP_KET) FAIL;
3199
3200 /* Continue from after the assertion, updating the offsets high water
3201 mark, since extracts may have been taken during the assertion. */
3202
3203 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3204 ecode += 3;
3205 offset_top = md->end_offset_top;
3206 continue;
3207
3208 /* Negative assertion: all branches must fail to match */
3209
3210 case OP_ASSERT_NOT:
3211 do
3212 {
3213 if (match(eptr, ecode+3, offset_top, md)) FAIL;
3214 ecode += (ecode[1] << 8) + ecode[2];
3215 }
3216 while (*ecode == OP_ALT);
3217 ecode += 3;
3218 continue;
3219
Guido van Rossum50700601997-12-08 17:15:20 +00003220 /* "Once" brackets are like assertion brackets except that after a match,
3221 the point in the subject string is not moved back. Thus there can never be
3222 a move back into the brackets. Check the alternative branches in turn - the
3223 matching won't pass the KET for this kind of subpattern. If any one branch
3224 matches, we carry on, leaving the subject pointer. */
3225
3226 case OP_ONCE:
3227 do
3228 {
3229 if (match(eptr, ecode+3, offset_top, md)) break;
3230 ecode += (ecode[1] << 8) + ecode[2];
3231 }
3232 while (*ecode == OP_ALT);
Guido van Rossum557dea11997-12-22 22:46:52 +00003233 if (*ecode == OP_KET) FAIL;
Guido van Rossum50700601997-12-08 17:15:20 +00003234
3235 /* Continue as from after the assertion, updating the offsets high water
3236 mark, since extracts may have been taken. */
3237
3238 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3239 ecode += 3;
3240 offset_top = md->end_offset_top;
3241 eptr = md->end_match_ptr;
3242 continue;
3243
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003244 /* An alternation is the end of a branch; scan along to find the end of the
3245 bracketed group and go to there. */
3246
3247 case OP_ALT:
3248 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3249 break;
3250
3251 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
3252 that it may occur zero times. It may repeat infinitely, or not at all -
3253 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
3254 repeat limits are compiled as a number of copies, with the optional ones
3255 preceded by BRAZERO or BRAMINZERO. */
3256
3257 case OP_BRAZERO:
3258 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003259 const uschar *next = ecode+1;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003260 if (match(eptr, next, offset_top, md)) SUCCEED;
3261 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3262 ecode = next + 3;
3263 }
3264 break;
3265
3266 case OP_BRAMINZERO:
3267 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003268 const uschar *next = ecode+1;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003269 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3270 if (match(eptr, next+3, offset_top, md)) SUCCEED;
3271 ecode++;
3272 }
3273 break;;
3274
3275 /* End of a group, repeated or non-repeating. If we are at the end of
3276 an assertion "group", stop matching and SUCCEED, but record the
3277 current high water mark for use by positive assertions. */
3278
3279 case OP_KET:
3280 case OP_KETRMIN:
3281 case OP_KETRMAX:
3282 {
Guido van Rossum50700601997-12-08 17:15:20 +00003283 int number;
Guido van Rossum58132c61997-12-17 00:24:13 +00003284 const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003285
Guido van Rossum50700601997-12-08 17:15:20 +00003286 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT || *prev == OP_ONCE)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003287 {
Guido van Rossum50700601997-12-08 17:15:20 +00003288 md->end_match_ptr = eptr; /* For ONCE */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003289 md->end_offset_top = offset_top;
3290 SUCCEED;
3291 }
3292
3293 /* In all other cases we have to check the group number back at the
3294 start and if necessary complete handling an extraction by setting the
3295 final offset and bumping the high water mark. */
3296
3297 number = (*prev - OP_BRA) << 1;
3298
Guido van Rossum557dea11997-12-22 22:46:52 +00003299 DPRINTF(("end bracket %d\n", number/2));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003300
3301 if (number > 0)
3302 {
3303 if (number >= md->offset_end) md->offset_overflow = TRUE; else
3304 {
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003305 md->offset_vector[number+1] = eptr - md->start_subject;
3306 if (offset_top <= number) offset_top = number + 2;
3307 }
3308 }
3309
3310 /* For a non-repeating ket, just advance to the next node and continue at
3311 this level. */
3312
3313 if (*ecode == OP_KET)
3314 {
3315 ecode += 3;
3316 break;
3317 }
3318
3319 /* The repeating kets try the rest of the pattern or restart from the
3320 preceding bracket, in the appropriate order. */
3321
3322 if (*ecode == OP_KETRMIN)
3323 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003324 const uschar *ptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003325 if (match(eptr, ecode+3, offset_top, md)) goto succeed;
3326 /* Handle alternation inside the BRA...KET; push the additional
Guido van Rossum58132c61997-12-17 00:24:13 +00003327 alternatives onto the stack */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003328 ptr=prev;
3329 do {
3330 ptr += (ptr[1]<<8)+ ptr[2];
3331 if (*ptr==OP_ALT)
3332 {
Guido van Rossum50700601997-12-08 17:15:20 +00003333 if (md->length == md->point)
3334 {
3335 grow_stack(md);
3336 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003337 md->offset_top[md->point] = offset_top;
3338 md->eptr[md->point] = eptr;
3339 md->ecode[md->point] = ptr+3;
3340 md->r1[md->point] = 0;
3341 md->r2[md->point] = 0;
3342 md->off_num[md->point] = 0;
3343 md->point++;
3344 }
3345 } while (*ptr==OP_ALT);
3346 ecode=prev+3; goto match_loop;
3347 }
3348 else /* OP_KETRMAX */
3349 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003350 const uschar *ptr;
3351 /*int points_pushed=0;*/
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003352
3353 /* Push one failure point, that will resume matching at the code after
3354 the KETRMAX opcode. */
Guido van Rossum50700601997-12-08 17:15:20 +00003355 if (md->length == md->point)
3356 {
3357 grow_stack(md);
3358 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003359 md->offset_top[md->point] = offset_top;
3360 md->eptr[md->point] = eptr;
3361 md->ecode[md->point] = ecode+3;
3362 md->r1[md->point] = md->offset_vector[number];
3363 md->r2[md->point] = md->offset_vector[number+1];
3364 md->off_num[md->point] = number;
3365 md->point++;
3366
3367 md->offset_vector[number] = eptr - md->start_subject;
3368 /* Handle alternation inside the BRA...KET; push each of the
Guido van Rossum58132c61997-12-17 00:24:13 +00003369 additional alternatives onto the stack */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003370 ptr=prev;
3371 do {
3372 ptr += (ptr[1]<<8)+ ptr[2];
3373 if (*ptr==OP_ALT)
3374 {
Guido van Rossum50700601997-12-08 17:15:20 +00003375 if (md->length == md->point)
3376 if (md->length == md->point)
3377 {
3378 grow_stack(md);
3379 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003380 md->offset_top[md->point] = offset_top;
3381 md->eptr[md->point] = eptr;
3382 md->ecode[md->point] = ptr+3;
3383 md->r1[md->point] = 0;
3384 md->r2[md->point] = 0;
3385 md->off_num[md->point] = 0;
3386 md->point++;
Guido van Rossum58132c61997-12-17 00:24:13 +00003387 /*points_pushed++;*/
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003388 }
3389 } while (*ptr==OP_ALT);
3390 /* Jump to the first (or only) alternative and resume trying to match */
3391 ecode=prev+3; goto match_loop;
3392 }
3393 }
Guido van Rossum58132c61997-12-17 00:24:13 +00003394
Guido van Rossum50700601997-12-08 17:15:20 +00003395 /* Start of subject unless notbol, or after internal newline if multiline */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003396
3397 case OP_CIRC:
Guido van Rossum50700601997-12-08 17:15:20 +00003398 if (md->notbol && eptr == md->start_subject) FAIL;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003399 if (md->multiline)
3400 {
3401 if (eptr != md->start_subject && eptr[-1] != '\n') FAIL;
3402 ecode++;
3403 break;
3404 }
3405 /* ... else fall through */
3406
3407 /* Start of subject assertion */
3408
3409 case OP_SOD:
3410 if (eptr != md->start_subject) FAIL;
3411 ecode++;
3412 break;
3413
Guido van Rossum50700601997-12-08 17:15:20 +00003414 /* Assert before internal newline if multiline, or before
3415 a terminating newline unless endonly is set, else end of subject unless
3416 noteol is set. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003417
3418 case OP_DOLL:
Guido van Rossum50700601997-12-08 17:15:20 +00003419 if (md->noteol && eptr >= md->end_subject) FAIL;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003420 if (md->multiline)
3421 {
3422 if (eptr < md->end_subject && *eptr != '\n') FAIL;
3423 ecode++;
3424 break;
3425 }
Guido van Rossum50700601997-12-08 17:15:20 +00003426 else if (!md->endonly)
3427 {
3428 if (eptr < md->end_subject - 1 ||
3429 (eptr == md->end_subject - 1 && *eptr != '\n')) FAIL;
3430 ecode++;
3431 break;
3432 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003433 /* ... else fall through */
3434
3435 /* End of subject assertion */
3436
3437 case OP_EOD:
3438 if (eptr < md->end_subject) FAIL;
3439 ecode++;
3440 break;
3441
3442 /* Word boundary assertions */
3443
3444 case OP_NOT_WORD_BOUNDARY:
3445 case OP_WORD_BOUNDARY:
3446 {
3447 BOOL prev_is_word = (eptr != md->start_subject) &&
3448 ((pcre_ctypes[eptr[-1]] & ctype_word) != 0);
3449 BOOL cur_is_word = (eptr < md->end_subject) &&
3450 ((pcre_ctypes[*eptr] & ctype_word) != 0);
3451 if ((*ecode++ == OP_WORD_BOUNDARY)?
3452 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3453 FAIL;
3454 }
3455 break;
3456
Guido van Rossum50700601997-12-08 17:15:20 +00003457 case OP_NOT_WORD_BOUNDARY_L:
3458 case OP_WORD_BOUNDARY_L:
3459 {
3460 BOOL prev_is_word = (eptr != md->start_subject) &&
Guido van Rossum58132c61997-12-17 00:24:13 +00003461 (isalnum(eptr[-1]) || eptr[-1]=='_');
Guido van Rossum50700601997-12-08 17:15:20 +00003462 BOOL cur_is_word = (eptr < md->end_subject) &&
Guido van Rossum58132c61997-12-17 00:24:13 +00003463 (isalnum(*eptr) || *eptr=='_');
Guido van Rossum50700601997-12-08 17:15:20 +00003464 if ((*ecode++ == OP_WORD_BOUNDARY_L)?
3465 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3466 FAIL;
3467 }
3468 break;
3469
3470
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003471 /* Match a single character type; inline for speed */
3472
3473 case OP_ANY:
3474 if (!md->dotall && eptr < md->end_subject && *eptr == '\n') FAIL;
3475 if (eptr++ >= md->end_subject) FAIL;
3476 ecode++;
3477 break;
3478
3479 case OP_NOT_DIGIT:
3480 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) != 0)
3481 FAIL;
3482 ecode++;
3483 break;
3484
3485 case OP_DIGIT:
3486 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) == 0)
3487 FAIL;
3488 ecode++;
3489 break;
3490
3491 case OP_NOT_WHITESPACE:
3492 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) != 0)
3493 FAIL;
3494 ecode++;
3495 break;
3496
3497 case OP_WHITESPACE:
3498 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) == 0)
3499 FAIL;
3500 ecode++;
3501 break;
3502
3503 case OP_NOT_WORDCHAR:
3504 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) != 0)
3505 FAIL;
3506 ecode++;
3507 break;
3508
3509 case OP_WORDCHAR:
3510 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) == 0)
3511 FAIL;
3512 ecode++;
3513 break;
3514
Guido van Rossum50700601997-12-08 17:15:20 +00003515 case OP_NOT_WORDCHAR_L:
Guido van Rossum58132c61997-12-17 00:24:13 +00003516 if (eptr >= md->end_subject || (*eptr=='_' || isalnum(*eptr) ))
Guido van Rossum557dea11997-12-22 22:46:52 +00003517 FAIL;
Guido van Rossum50700601997-12-08 17:15:20 +00003518 eptr++;
3519 ecode++;
3520 break;
3521
3522 case OP_WORDCHAR_L:
Guido van Rossum58132c61997-12-17 00:24:13 +00003523 if (eptr >= md->end_subject || (*eptr!='_' && !isalnum(*eptr) ))
Guido van Rossum557dea11997-12-22 22:46:52 +00003524 FAIL;
Guido van Rossum50700601997-12-08 17:15:20 +00003525 eptr++;
3526 ecode++;
3527 break;
3528
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003529 /* Match a back reference, possibly repeatedly. Look past the end of the
3530 item to see if there is repeat information following. The code is similar
3531 to that for character classes, but repeated for efficiency. Then obey
3532 similar code to character type repeats - written out again for speed.
3533 However, if the referenced string is the empty string, always treat
3534 it as matched, any number of times (otherwise there could be infinite
3535 loops). */
3536
3537 case OP_REF:
3538 {
3539 int length;
3540 int number = ecode[1] << 1; /* Doubled reference number */
3541 ecode += 2; /* Advance past the item */
3542
3543 if (number >= offset_top || md->offset_vector[number] < 0)
3544 {
3545 md->errorcode = PCRE_ERROR_BADREF;
3546 FAIL;
3547 }
3548
3549 length = md->offset_vector[number+1] - md->offset_vector[number];
3550
3551 switch (*ecode)
3552 {
3553 case OP_CRSTAR:
3554 case OP_CRMINSTAR:
3555 case OP_CRPLUS:
3556 case OP_CRMINPLUS:
3557 case OP_CRQUERY:
3558 case OP_CRMINQUERY:
3559 c = *ecode++ - OP_CRSTAR;
3560 minimize = (c & 1) != 0;
3561 min = rep_min[c]; /* Pick up values from tables; */
3562 max = rep_max[c]; /* zero for max => infinity */
3563 if (max == 0) max = INT_MAX;
3564 break;
3565
3566 case OP_CRRANGE:
3567 case OP_CRMINRANGE:
3568 minimize = (*ecode == OP_CRMINRANGE);
3569 min = (ecode[1] << 8) + ecode[2];
3570 max = (ecode[3] << 8) + ecode[4];
3571 if (max == 0) max = INT_MAX;
3572 ecode += 5;
3573 break;
3574
3575 default: /* No repeat follows */
3576 if (!match_ref(number, eptr, length, md)) FAIL;
3577 eptr += length;
3578 continue; /* With the main loop */
3579 }
3580
3581 /* If the length of the reference is zero, just continue with the
3582 main loop. */
3583
3584 if (length == 0) continue;
3585
3586 /* First, ensure the minimum number of matches are present. We get back
3587 the length of the reference string explicitly rather than passing the
3588 address of eptr, so that eptr can be a register variable. */
3589
3590 for (i = 1; i <= min; i++)
3591 {
3592 if (!match_ref(number, eptr, length, md)) FAIL;
3593 eptr += length;
3594 }
3595
3596 /* If min = max, continue at the same level without recursion.
3597 They are not both allowed to be zero. */
3598
3599 if (min == max) continue;
3600
3601 /* If minimizing, keep trying and advancing the pointer */
3602
3603 if (minimize)
3604 {
3605 for (i = min;; i++)
3606 {
3607 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3608 if (i >= max || !match_ref(number, eptr, length, md))
3609 FAIL;
3610 eptr += length;
3611 }
3612 /* Control never gets here */
3613 }
3614
3615 /* If maximizing, find the longest string and work backwards */
3616
3617 else
3618 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003619 const uschar *pp = eptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003620 for (i = min; i < max; i++)
3621 {
3622 if (!match_ref(number, eptr, length, md)) break;
3623 eptr += length;
3624 }
3625 while (eptr >= pp)
3626 {
3627 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3628 eptr -= length;
3629 }
3630 FAIL;
3631 }
3632 }
3633 /* Control never gets here */
3634
3635 /* Match a character class, possibly repeatedly. Look past the end of the
3636 item to see if there is repeat information following. Then obey similar
Guido van Rossum50700601997-12-08 17:15:20 +00003637 code to character type repeats - written out again for speed. If caseless
3638 matching was set at runtime but not at compile time, we have to check both
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003639 versions of a character, and we have to behave differently for positive and
3640 negative classes. This is the only time where OP_CLASS and OP_NEGCLASS are
3641 treated differently. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003642
3643 case OP_CLASS:
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003644 case OP_NEGCLASS:
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003645 {
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003646 BOOL nasty_case = *ecode == OP_NEGCLASS && md->runtime_caseless;
Guido van Rossum58132c61997-12-17 00:24:13 +00003647 const uschar *data = ecode + 1; /* Save for matching */
3648 ecode += 33; /* Advance past the item */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003649
3650 switch (*ecode)
3651 {
3652 case OP_CRSTAR:
3653 case OP_CRMINSTAR:
3654 case OP_CRPLUS:
3655 case OP_CRMINPLUS:
3656 case OP_CRQUERY:
3657 case OP_CRMINQUERY:
3658 c = *ecode++ - OP_CRSTAR;
3659 minimize = (c & 1) != 0;
3660 min = rep_min[c]; /* Pick up values from tables; */
3661 max = rep_max[c]; /* zero for max => infinity */
3662 if (max == 0) max = INT_MAX;
3663 break;
3664
3665 case OP_CRRANGE:
3666 case OP_CRMINRANGE:
3667 minimize = (*ecode == OP_CRMINRANGE);
3668 min = (ecode[1] << 8) + ecode[2];
3669 max = (ecode[3] << 8) + ecode[4];
3670 if (max == 0) max = INT_MAX;
3671 ecode += 5;
3672 break;
3673
3674 default: /* No repeat follows */
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003675 min = max = 1;
3676 break;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003677 }
3678
3679 /* First, ensure the minimum number of matches are present. */
3680
3681 for (i = 1; i <= min; i++)
Guido van Rossum50700601997-12-08 17:15:20 +00003682 {
3683 if (eptr >= md->end_subject) FAIL;
3684 c = *eptr++;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003685
3686 /* Either not runtime caseless, or it was a positive class. For
3687 runtime caseless, continue if either case is in the map. */
3688
3689 if (!nasty_case)
Guido van Rossum50700601997-12-08 17:15:20 +00003690 {
Guido van Rossum50700601997-12-08 17:15:20 +00003691 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003692 if (md->runtime_caseless)
3693 {
3694 c = pcre_fcc[c];
3695 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3696 }
Guido van Rossum50700601997-12-08 17:15:20 +00003697 }
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003698
3699 /* Runtime caseless and it was a negative class. Continue only if
3700 both cases are in the map. */
3701
3702 else
3703 {
3704 if ((data[c/8] & (1 << (c&7))) == 0) FAIL;
3705 c = pcre_fcc[c];
3706 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3707 }
3708
3709 FAIL;
Guido van Rossum50700601997-12-08 17:15:20 +00003710 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003711
3712 /* If max == min we can continue with the main loop without the
3713 need to recurse. */
3714
3715 if (min == max) continue;
3716
3717 /* If minimizing, keep testing the rest of the expression and advancing
3718 the pointer while it matches the class. */
3719
3720 if (minimize)
3721 {
3722 for (i = min;; i++)
3723 {
3724 if (match(eptr, ecode, offset_top, md)) SUCCEED;
Guido van Rossum50700601997-12-08 17:15:20 +00003725 if (i >= max || eptr >= md->end_subject) FAIL;
3726 c = *eptr++;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003727
3728 /* Either not runtime caseless, or it was a positive class. For
3729 runtime caseless, continue if either case is in the map. */
3730
3731 if (!nasty_case)
Guido van Rossum50700601997-12-08 17:15:20 +00003732 {
Guido van Rossum50700601997-12-08 17:15:20 +00003733 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003734 if (md->runtime_caseless)
3735 {
3736 c = pcre_fcc[c];
3737 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3738 }
Guido van Rossum50700601997-12-08 17:15:20 +00003739 }
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003740
3741 /* Runtime caseless and it was a negative class. Continue only if
3742 both cases are in the map. */
3743
3744 else
3745 {
3746 if ((data[c/8] & (1 << (c&7))) == 0) return FALSE;
3747 c = pcre_fcc[c];
3748 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3749 }
3750
Guido van Rossum50700601997-12-08 17:15:20 +00003751 FAIL;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003752 }
3753 /* Control never gets here */
3754 }
3755
3756 /* If maximizing, find the longest possible run, then work backwards. */
3757
3758 else
3759 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003760 const uschar *pp = eptr;
Guido van Rossum50700601997-12-08 17:15:20 +00003761 for (i = min; i < max; eptr++, i++)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003762 {
Guido van Rossum50700601997-12-08 17:15:20 +00003763 if (eptr >= md->end_subject) break;
3764 c = *eptr;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003765
3766 /* Either not runtime caseless, or it was a positive class. For
3767 runtime caseless, continue if either case is in the map. */
3768
3769 if (!nasty_case)
Guido van Rossum50700601997-12-08 17:15:20 +00003770 {
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003771 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3772 if (md->runtime_caseless)
3773 {
3774 c = pcre_fcc[c];
3775 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3776 }
3777 }
3778
3779 /* Runtime caseless and it was a negative class. Continue only if
3780 both cases are in the map. */
3781
3782 else
3783 {
3784 if ((data[c/8] & (1 << (c&7))) == 0) break;
Guido van Rossum50700601997-12-08 17:15:20 +00003785 c = pcre_fcc[c];
3786 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3787 }
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003788
Guido van Rossum50700601997-12-08 17:15:20 +00003789 break;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003790 }
Guido van Rossum50700601997-12-08 17:15:20 +00003791
3792 while (eptr >= pp)
3793 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
3794 FAIL;
3795 }
3796 }
3797 /* Control never gets here */
3798
3799 /* OP_CLASS_L opcode: handles localized character classes */
3800
3801 case OP_CLASS_L:
3802 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003803 const uschar *data = ecode + 1; /* Save for matching */
3804 const uschar locale_flag = *data;
Guido van Rossum50700601997-12-08 17:15:20 +00003805 ecode++; data++; /* The localization support adds an extra byte */
3806
3807 ecode += 33; /* Advance past the item */
3808
3809 switch (*ecode)
3810 {
3811 case OP_CRSTAR:
3812 case OP_CRMINSTAR:
3813 case OP_CRPLUS:
3814 case OP_CRMINPLUS:
3815 case OP_CRQUERY:
3816 case OP_CRMINQUERY:
3817 c = *ecode++ - OP_CRSTAR;
3818 minimize = (c & 1) != 0;
3819 min = rep_min[c]; /* Pick up values from tables; */
3820 max = rep_max[c]; /* zero for max => infinity */
3821 if (max == 0) max = INT_MAX;
3822 break;
3823
3824 case OP_CRRANGE:
3825 case OP_CRMINRANGE:
3826 minimize = (*ecode == OP_CRMINRANGE);
3827 min = (ecode[1] << 8) + ecode[2];
3828 max = (ecode[3] << 8) + ecode[4];
3829 if (max == 0) max = INT_MAX;
3830 ecode += 5;
3831 break;
3832
3833 default: /* No repeat follows */
3834 if (eptr >= md->end_subject) FAIL;
3835 c = *eptr++;
3836 if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */
Guido van Rossum58132c61997-12-17 00:24:13 +00003837 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3838 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003839#if 0
3840 if ( (locale_flag & 4) && isdigit(c) ) continue; /* Locale \d */
3841 if ( (locale_flag & 8) && !isdigit(c) ) continue; /* Locale \D */
3842 if ( (locale_flag & 16) && isspace(c) ) continue; /* Locale \s */
3843 if ( (locale_flag & 32) && !isspace(c) ) continue; /* Locale \S */
3844#endif
3845
3846 if (md->runtime_caseless)
3847 {
3848 c = pcre_fcc[c];
3849 if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */
3850
Guido van Rossum58132c61997-12-17 00:24:13 +00003851 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3852 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003853 }
3854 FAIL;
3855 }
3856
3857 /* First, ensure the minimum number of matches are present. */
3858
3859 for (i = 1; i <= min; i++)
3860 {
3861 if (eptr >= md->end_subject) FAIL;
3862 c = *eptr++;
3863 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum58132c61997-12-17 00:24:13 +00003864 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3865 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003866
3867 if (md->runtime_caseless)
3868 {
3869 c = pcre_fcc[c];
3870 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum58132c61997-12-17 00:24:13 +00003871 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3872 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003873 }
3874 FAIL;
3875 }
3876
3877 /* If max == min we can continue with the main loop without the
3878 need to recurse. */
3879
3880 if (min == max) continue;
3881
3882 /* If minimizing, keep testing the rest of the expression and advancing
3883 the pointer while it matches the class. */
3884
3885 if (minimize)
3886 {
3887 for (i = min;; i++)
3888 {
3889 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3890 if (i >= max || eptr >= md->end_subject) FAIL;
3891 c = *eptr++;
3892 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum58132c61997-12-17 00:24:13 +00003893 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3894 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003895
3896 if (md->runtime_caseless)
3897 {
3898 c = pcre_fcc[c];
3899 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum58132c61997-12-17 00:24:13 +00003900 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3901 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003902 }
3903 FAIL;
3904 }
3905 /* Control never gets here */
3906 }
3907
3908 /* If maximizing, find the longest possible run, then work backwards. */
3909
3910 else
3911 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003912 const uschar *pp = eptr;
Guido van Rossum50700601997-12-08 17:15:20 +00003913 for (i = min; i < max; eptr++, i++)
3914 {
3915 if (eptr >= md->end_subject) break;
3916 c = *eptr;
3917 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum58132c61997-12-17 00:24:13 +00003918 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3919 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003920 if (md->runtime_caseless)
3921 {
3922 c = pcre_fcc[c];
3923 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum58132c61997-12-17 00:24:13 +00003924 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3925 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003926 }
3927 break;
3928 }
3929
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003930 while (eptr >= pp)
3931 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
3932 FAIL;
3933 }
3934 }
3935 /* Control never gets here */
3936
3937 /* Match a run of characters */
3938
3939 case OP_CHARS:
3940 {
3941 register int length = ecode[1];
3942 ecode += 2;
3943
Guido van Rossum557dea11997-12-22 22:46:52 +00003944#ifdef DEBUG /* Sigh. Some compilers never learn. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003945 if (eptr >= md->end_subject)
3946 printf("matching subject <null> against pattern ");
3947 else
3948 {
3949 printf("matching subject ");
3950 pchars(eptr, length, TRUE, md);
3951 printf(" against pattern ");
3952 }
3953 pchars(ecode, length, FALSE, md);
3954 printf("\n");
Guido van Rossum57ba4f31997-12-02 20:40:28 +00003955#endif
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003956
3957 if (length > md->end_subject - eptr) FAIL;
3958 if (md->caseless)
3959 {
3960 while (length-- > 0) if (pcre_lcc[*ecode++] != pcre_lcc[*eptr++]) FAIL;
3961 }
3962 else
3963 {
3964 while (length-- > 0) if (*ecode++ != *eptr++) FAIL;
3965 }
3966 }
3967 break;
3968
3969 /* Match a single character repeatedly; different opcodes share code. */
3970
3971 case OP_EXACT:
3972 min = max = (ecode[1] << 8) + ecode[2];
3973 ecode += 3;
3974 goto REPEATCHAR;
3975
3976 case OP_UPTO:
3977 case OP_MINUPTO:
3978 min = 0;
3979 max = (ecode[1] << 8) + ecode[2];
3980 minimize = *ecode == OP_MINUPTO;
3981 ecode += 3;
3982 goto REPEATCHAR;
3983
3984 case OP_STAR:
3985 case OP_MINSTAR:
3986 case OP_PLUS:
3987 case OP_MINPLUS:
3988 case OP_QUERY:
3989 case OP_MINQUERY:
3990 c = *ecode++ - OP_STAR;
3991 minimize = (c & 1) != 0;
3992 min = rep_min[c]; /* Pick up values from tables; */
3993 max = rep_max[c]; /* zero for max => infinity */
3994 if (max == 0) max = INT_MAX;
3995
3996 /* Common code for all repeated single-character matches. We can give
3997 up quickly if there are fewer than the minimum number of characters left in
3998 the subject. */
3999
4000 REPEATCHAR:
4001 if (min > md->end_subject - eptr) FAIL;
4002 c = *ecode++;
4003
4004 /* The code is duplicated for the caseless and caseful cases, for speed,
4005 since matching characters is likely to be quite common. First, ensure the
4006 minimum number of matches are present. If min = max, continue at the same
4007 level without recursing. Otherwise, if minimizing, keep trying the rest of
4008 the expression and advancing one matching character if failing, up to the
4009 maximum. Alternatively, if maximizing, find the maximum number of
4010 characters and work backwards. */
4011
Guido van Rossum557dea11997-12-22 22:46:52 +00004012 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
4013 max, eptr));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004014
4015 if (md->caseless)
4016 {
4017 c = pcre_lcc[c];
4018 for (i = 1; i <= min; i++) if (c != pcre_lcc[*eptr++]) FAIL;
4019 if (min == max) continue;
4020 if (minimize)
4021 {
4022 for (i = min;; i++)
4023 {
4024 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4025 if (i >= max || eptr >= md->end_subject || c != pcre_lcc[*eptr++])
4026 FAIL;
4027 }
4028 /* Control never gets here */
4029 }
4030 else
4031 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004032 const uschar *pp = eptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004033 for (i = min; i < max; i++)
4034 {
4035 if (eptr >= md->end_subject || c != pcre_lcc[*eptr]) break;
4036 eptr++;
4037 }
4038 while (eptr >= pp)
4039 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4040 FAIL;
4041 }
Guido van Rossum50700601997-12-08 17:15:20 +00004042 /* Control never gets here */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004043 }
4044
4045 /* Caseful comparisons */
4046
4047 else
4048 {
4049 for (i = 1; i <= min; i++) if (c != *eptr++) FAIL;
4050 if (min == max) continue;
4051 if (minimize)
4052 {
4053 for (i = min;; i++)
4054 {
4055 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4056 if (i >= max || eptr >= md->end_subject || c != *eptr++) FAIL;
4057 }
4058 /* Control never gets here */
4059 }
4060 else
4061 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004062 const uschar *pp = eptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004063 for (i = min; i < max; i++)
4064 {
4065 if (eptr >= md->end_subject || c != *eptr) break;
4066 eptr++;
4067 }
4068 while (eptr >= pp)
4069 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4070 FAIL;
4071 }
4072 }
4073 /* Control never gets here */
4074
Guido van Rossum50700601997-12-08 17:15:20 +00004075 /* Match a negated single character */
4076
4077 case OP_NOT:
Guido van Rossum557dea11997-12-22 22:46:52 +00004078 if (eptr >= md->end_subject) FAIL;
Guido van Rossum50700601997-12-08 17:15:20 +00004079 ecode++;
4080 if (md->caseless)
4081 {
4082 if (pcre_lcc[*ecode++] == pcre_lcc[*eptr++]) FAIL;
4083 }
4084 else
4085 {
4086 if (*ecode++ == *eptr++) FAIL;
4087 }
4088 break;
4089
4090 /* Match a negated single character repeatedly. This is almost a repeat of
4091 the code for a repeated single character, but I haven't found a nice way of
4092 commoning these up that doesn't require a test of the positive/negative
4093 option for each character match. Maybe that wouldn't add very much to the
4094 time taken, but character matching *is* what this is all about... */
4095
4096 case OP_NOTEXACT:
4097 min = max = (ecode[1] << 8) + ecode[2];
4098 ecode += 3;
4099 goto REPEATNOTCHAR;
4100
4101 case OP_NOTUPTO:
4102 case OP_NOTMINUPTO:
4103 min = 0;
4104 max = (ecode[1] << 8) + ecode[2];
4105 minimize = *ecode == OP_NOTMINUPTO;
4106 ecode += 3;
4107 goto REPEATNOTCHAR;
4108
4109 case OP_NOTSTAR:
4110 case OP_NOTMINSTAR:
4111 case OP_NOTPLUS:
4112 case OP_NOTMINPLUS:
4113 case OP_NOTQUERY:
4114 case OP_NOTMINQUERY:
4115 c = *ecode++ - OP_NOTSTAR;
4116 minimize = (c & 1) != 0;
4117 min = rep_min[c]; /* Pick up values from tables; */
4118 max = rep_max[c]; /* zero for max => infinity */
4119 if (max == 0) max = INT_MAX;
4120
4121 /* Common code for all repeated single-character matches. We can give
4122 up quickly if there are fewer than the minimum number of characters left in
4123 the subject. */
4124
4125 REPEATNOTCHAR:
4126 if (min > md->end_subject - eptr) FAIL;
4127 c = *ecode++;
4128
4129 /* The code is duplicated for the caseless and caseful cases, for speed,
4130 since matching characters is likely to be quite common. First, ensure the
4131 minimum number of matches are present. If min = max, continue at the same
4132 level without recursing. Otherwise, if minimizing, keep trying the rest of
4133 the expression and advancing one matching character if failing, up to the
4134 maximum. Alternatively, if maximizing, find the maximum number of
4135 characters and work backwards. */
4136
Guido van Rossum557dea11997-12-22 22:46:52 +00004137 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
4138 max, eptr));
Guido van Rossum50700601997-12-08 17:15:20 +00004139
4140 if (md->caseless)
4141 {
4142 c = pcre_lcc[c];
4143 for (i = 1; i <= min; i++) if (c == pcre_lcc[*eptr++]) FAIL;
4144 if (min == max) continue;
4145 if (minimize)
4146 {
4147 for (i = min;; i++)
4148 {
4149 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4150 if (i >= max || eptr >= md->end_subject || c == pcre_lcc[*eptr++])
4151 FAIL;
4152 }
4153 /* Control never gets here */
4154 }
4155 else
4156 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004157 const uschar *pp = eptr;
Guido van Rossum50700601997-12-08 17:15:20 +00004158 for (i = min; i < max; i++)
4159 {
4160 if (eptr >= md->end_subject || c == pcre_lcc[*eptr]) break;
4161 eptr++;
4162 }
4163 while (eptr >= pp)
4164 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4165 FAIL;
4166 }
4167 /* Control never gets here */
4168 }
4169
4170 /* Caseful comparisons */
4171
4172 else
4173 {
4174 for (i = 1; i <= min; i++) if (c == *eptr++) FAIL;
4175 if (min == max) continue;
4176 if (minimize)
4177 {
4178 for (i = min;; i++)
4179 {
4180 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4181 if (i >= max || eptr >= md->end_subject || c == *eptr++) FAIL;
4182 }
4183 /* Control never gets here */
4184 }
4185 else
4186 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004187 const uschar *pp = eptr;
Guido van Rossum50700601997-12-08 17:15:20 +00004188 for (i = min; i < max; i++)
4189 {
4190 if (eptr >= md->end_subject || c == *eptr) break;
4191 eptr++;
4192 }
4193 while (eptr >= pp)
4194 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4195 FAIL;
4196 }
4197 }
4198 /* Control never gets here */
4199
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004200 /* Match a single character type repeatedly; several different opcodes
4201 share code. This is very similar to the code for single characters, but we
4202 repeat it in the interests of efficiency. */
4203
4204 case OP_TYPEEXACT:
4205 min = max = (ecode[1] << 8) + ecode[2];
4206 minimize = TRUE;
4207 ecode += 3;
4208 goto REPEATTYPE;
4209
4210 case OP_TYPEUPTO:
4211 case OP_TYPEMINUPTO:
4212 min = 0;
4213 max = (ecode[1] << 8) + ecode[2];
4214 minimize = *ecode == OP_TYPEMINUPTO;
4215 ecode += 3;
4216 goto REPEATTYPE;
4217
4218 case OP_TYPESTAR:
4219 case OP_TYPEMINSTAR:
4220 case OP_TYPEPLUS:
4221 case OP_TYPEMINPLUS:
4222 case OP_TYPEQUERY:
4223 case OP_TYPEMINQUERY:
4224 c = *ecode++ - OP_TYPESTAR;
4225 minimize = (c & 1) != 0;
4226 min = rep_min[c]; /* Pick up values from tables; */
4227 max = rep_max[c]; /* zero for max => infinity */
4228 if (max == 0) max = INT_MAX;
4229
4230 /* Common code for all repeated single character type matches */
4231
4232 REPEATTYPE:
4233 ctype = *ecode++; /* Code for the character type */
4234
4235 /* First, ensure the minimum number of matches are present. Use inline
4236 code for maximizing the speed, and do the type test once at the start
4237 (i.e. keep it out of the loop). Also test that there are at least the
4238 minimum number of characters before we start. */
4239
4240 if (min > md->end_subject - eptr) FAIL;
4241 if (min > 0) switch(ctype)
4242 {
4243 case OP_ANY:
4244 if (!md->dotall)
4245 { for (i = 1; i <= min; i++) if (*eptr++ == '\n') FAIL; }
4246 else eptr += min;
4247 break;
4248
4249 case OP_NOT_DIGIT:
4250 for (i = 1; i <= min; i++)
4251 if ((pcre_ctypes[*eptr++] & ctype_digit) != 0) FAIL;
4252 break;
4253
4254 case OP_DIGIT:
4255 for (i = 1; i <= min; i++)
4256 if ((pcre_ctypes[*eptr++] & ctype_digit) == 0) FAIL;
4257 break;
4258
4259 case OP_NOT_WHITESPACE:
4260 for (i = 1; i <= min; i++)
4261 if ((pcre_ctypes[*eptr++] & ctype_space) != 0) FAIL;
4262 break;
4263
4264 case OP_WHITESPACE:
4265 for (i = 1; i <= min; i++)
4266 if ((pcre_ctypes[*eptr++] & ctype_space) == 0) FAIL;
4267 break;
4268
4269 case OP_NOT_WORDCHAR:
4270 for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) != 0)
4271 FAIL;
4272 break;
4273
4274 case OP_WORDCHAR:
4275 for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) == 0)
4276 FAIL;
4277 break;
Guido van Rossum50700601997-12-08 17:15:20 +00004278
4279 case OP_NOT_WORDCHAR_L:
Guido van Rossum58132c61997-12-17 00:24:13 +00004280 for (i = 1; i <= min; i++, eptr++) if (*eptr=='_' || isalnum(*eptr))
Guido van Rossum557dea11997-12-22 22:46:52 +00004281 FAIL;
Guido van Rossum50700601997-12-08 17:15:20 +00004282 break;
4283
4284 case OP_WORDCHAR_L:
Guido van Rossum58132c61997-12-17 00:24:13 +00004285 for (i = 1; i <= min; i++, eptr++) if (*eptr!='_' && !isalnum(*eptr))
Guido van Rossum557dea11997-12-22 22:46:52 +00004286 FAIL;
Guido van Rossum50700601997-12-08 17:15:20 +00004287 break;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004288 }
4289
4290 /* If min = max, continue at the same level without recursing */
4291
4292 if (min == max) continue;
4293
4294 /* If minimizing, we have to test the rest of the pattern before each
4295 subsequent match, so inlining isn't much help; just use the function. */
4296
4297 if (minimize)
4298 {
4299 for (i = min;; i++)
4300 {
4301 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4302 if (i >= max || eptr >= md->end_subject ||
4303 !match_type(ctype, *eptr++, md->dotall))
4304 FAIL;
4305 }
4306 /* Control never gets here */
4307 }
4308
4309 /* If maximizing it is worth using inline code for speed, doing the type
4310 test once at the start (i.e. keep it out of the loop). */
4311
4312 else
4313 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004314 const uschar *pp = eptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004315 switch(ctype)
4316 {
4317 case OP_ANY:
4318 if (!md->dotall)
4319 {
4320 for (i = min; i < max; i++)
4321 {
4322 if (eptr >= md->end_subject || *eptr == '\n') break;
4323 eptr++;
4324 }
4325 }
4326 else
4327 {
4328 c = max - min;
4329 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
4330 eptr += c;
4331 }
4332 break;
4333
4334 case OP_NOT_DIGIT:
4335 for (i = min; i < max; i++)
4336 {
4337 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) != 0)
4338 break;
4339 eptr++;
4340 }
4341 break;
4342
4343 case OP_DIGIT:
4344 for (i = min; i < max; i++)
4345 {
4346 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) == 0)
4347 break;
4348 eptr++;
4349 }
4350 break;
4351
4352 case OP_NOT_WHITESPACE:
4353 for (i = min; i < max; i++)
4354 {
4355 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) != 0)
4356 break;
4357 eptr++;
4358 }
4359 break;
4360
4361 case OP_WHITESPACE:
4362 for (i = min; i < max; i++)
4363 {
4364 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) == 0)
4365 break;
4366 eptr++;
4367 }
4368 break;
4369
4370 case OP_NOT_WORDCHAR:
4371 for (i = min; i < max; i++)
4372 {
4373 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) != 0)
4374 break;
4375 eptr++;
4376 }
4377 break;
4378
4379 case OP_WORDCHAR:
4380 for (i = min; i < max; i++)
4381 {
Guido van Rossum50700601997-12-08 17:15:20 +00004382 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) == 0)
4383 break;
4384 eptr++;
4385 }
4386 break;
4387 case OP_NOT_WORDCHAR_L:
4388 for (i = min; i < max; i++)
4389 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004390 if (eptr >= md->end_subject || (*eptr=='_' || isalnum(*eptr) ) )
Guido van Rossum50700601997-12-08 17:15:20 +00004391 break;
4392 eptr++;
4393 }
4394 break;
4395
4396 case OP_WORDCHAR_L:
4397 for (i = min; i < max; i++)
4398 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004399 if (eptr >= md->end_subject || (*eptr!='_' && !isalnum(*eptr) ) )
Guido van Rossum50700601997-12-08 17:15:20 +00004400 break;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004401 eptr++;
4402 }
4403 break;
4404 }
4405
4406 while (eptr >= pp)
4407 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4408 FAIL;
4409 }
4410 /* Control never gets here */
4411
4412 /* There's been some horrible disaster. */
4413
4414 default:
Guido van Rossum557dea11997-12-22 22:46:52 +00004415 DPRINTF(("Unknown opcode %d\n", *ecode));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004416 md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
4417 FAIL;
4418 }
4419
4420 /* Do not stick any code in here without much thought; it is assumed
4421 that "continue" in the code above comes out to here to repeat the main
4422 loop. */
4423
4424 } /* End of main loop */
4425/* Control never reaches here */
4426
4427fail:
4428 if (md->point > save_stack_position)
4429 {
4430 /* If there are still points remaining on the stack, pop the next one off */
Guido van Rossumc3861071997-10-08 02:07:40 +00004431 int off_num;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004432
4433 md->point--;
4434 offset_top = md->offset_top[md->point];
4435 eptr = md->eptr[md->point];
4436 ecode = md->ecode[md->point];
4437 off_num = md->off_num[md->point];
4438 md->offset_vector[off_num] = md->r1[md->point];
4439 md->offset_vector[off_num+1] = md->r2[md->point];
4440 goto match_loop;
4441 }
4442 /* Failure, and nothing left on the stack, so end this function call */
4443
4444 /* Restore the top of the stack to where it was before this function
4445 call. This lets us use one stack for everything; recursive calls
4446 can push and pop information, and may increase the stack. When
4447 the call returns, the parent function can resume pushing and
4448 popping wherever it was. */
4449
4450 md->point = save_stack_position;
4451 return FALSE;
4452
4453succeed:
4454 return TRUE;
4455}
4456
4457
Guido van Rossum50700601997-12-08 17:15:20 +00004458
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004459/*************************************************
Guido van Rossum557dea11997-12-22 22:46:52 +00004460* Segregate setjmp() *
4461*************************************************/
4462
4463/* The -Wall option of gcc gives warnings for all local variables when setjmp()
4464is used, even if the coding conforms to the rules of ANSI C. To avoid this, we
4465hide it in a separate function. This is called only when PCRE_EXTRA is set,
4466since it's needed only for the extension \X option, and with any luck, a good
4467compiler will spot the tail recursion and compile it efficiently.
4468
4469Arguments:
4470 eptr pointer in subject
4471 ecode position in code
4472 offset_top current top pointer
4473 md pointer to "static" info for the match
4474
4475Returns: TRUE if matched
4476*/
4477
4478static BOOL
4479match_with_setjmp(const uschar *eptr, const uschar *ecode, int offset_top,
4480 match_data *match_block)
4481{
4482return setjmp(match_block->fail_env) == 0 &&
4483 match(eptr, ecode, offset_top, match_block);
4484}
4485
4486
4487
4488/*************************************************
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004489* Execute a Regular Expression *
4490*************************************************/
4491
4492/* This function applies a compiled re to a subject string and picks out
4493portions of the string if it matches. Two elements in the vector are set for
4494each substring: the offsets to the start and end of the substring.
4495
4496Arguments:
Guido van Rossum50700601997-12-08 17:15:20 +00004497 external_re points to the compiled expression
4498 external_extra points to "hints" from pcre_study() or is NULL
4499 subject points to the subject string
4500 length length of subject string (may contain binary zeros)
4501 options option bits
4502 offsets points to a vector of ints to be filled in with offsets
4503 offsetcount the number of elements in the vector
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004504
Guido van Rossum50700601997-12-08 17:15:20 +00004505Returns: > 0 => success; value is the number of elements filled in
4506 = 0 => success, but offsets is not big enough
4507 -1 => failed to match
4508 < -1 => some kind of unexpected problem
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004509*/
4510
4511int
Guido van Rossum50700601997-12-08 17:15:20 +00004512pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
Guido van Rossum816671c1998-03-10 04:55:29 +00004513 const char *subject, int length, int start_pos, int options,
4514 int *offsets, int offsetcount)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004515{
Guido van Rossum58132c61997-12-17 00:24:13 +00004516 /* The "volatile" directives are to make gcc -Wall stop complaining
4517 that these variables can be clobbered by the longjmp. Hopefully
4518 they won't cost too much performance. */
Guido van Rossum042ff9e1998-04-03 21:13:31 +00004519volatile int resetcount, ocount;
4520volatile int first_char = -1;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004521match_data match_block;
Guido van Rossum557dea11997-12-22 22:46:52 +00004522const uschar *start_bits = NULL;
Guido van Rossum816671c1998-03-10 04:55:29 +00004523const uschar *start_match = (const uschar *)subject + start_pos;
Guido van Rossum58132c61997-12-17 00:24:13 +00004524const uschar *end_subject;
4525const real_pcre *re = (const real_pcre *)external_re;
4526const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00004527volatile BOOL using_temporary_offsets = FALSE;
4528volatile BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4529volatile BOOL startline = (re->options & PCRE_STARTLINE) != 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004530
4531if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4532
4533if (re == NULL || subject == NULL ||
4534 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4535if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
4536
Guido van Rossum58132c61997-12-17 00:24:13 +00004537match_block.start_subject = (const uschar *)subject;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004538match_block.end_subject = match_block.start_subject + length;
4539end_subject = match_block.end_subject;
4540
Guido van Rossum50700601997-12-08 17:15:20 +00004541match_block.caseless = ((re->options | options) & PCRE_CASELESS) != 0;
4542match_block.runtime_caseless = match_block.caseless &&
4543 (re->options & PCRE_CASELESS) == 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004544
Guido van Rossum50700601997-12-08 17:15:20 +00004545match_block.multiline = ((re->options | options) & PCRE_MULTILINE) != 0;
4546match_block.dotall = ((re->options | options) & PCRE_DOTALL) != 0;
4547match_block.endonly = ((re->options | options) & PCRE_DOLLAR_ENDONLY) != 0;
4548
4549match_block.notbol = (options & PCRE_NOTBOL) != 0;
4550match_block.noteol = (options & PCRE_NOTEOL) != 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004551
4552match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
4553
4554/* Set the stack state to empty */
4555 match_block.off_num = match_block.offset_top = NULL;
4556 match_block.r1 = match_block.r2 = NULL;
4557 match_block.eptr = match_block.ecode = NULL;
4558 match_block.point = match_block.length = 0;
4559
Guido van Rossum50700601997-12-08 17:15:20 +00004560/* If the expression has got more back references than the offsets supplied can
4561hold, we get a temporary bit of working store to use during the matching.
Guido van Rossum557dea11997-12-22 22:46:52 +00004562Otherwise, we can use the vector supplied, rounding down its size to a multiple
4563of 2. */
Guido van Rossum50700601997-12-08 17:15:20 +00004564
Guido van Rossum557dea11997-12-22 22:46:52 +00004565ocount = offsetcount & (-2);
4566if (re->top_backref > 0 && re->top_backref >= ocount/2)
Guido van Rossum50700601997-12-08 17:15:20 +00004567 {
4568 ocount = re->top_backref * 2 + 2;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00004569 match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
Guido van Rossum50700601997-12-08 17:15:20 +00004570 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
Guido van Rossum557dea11997-12-22 22:46:52 +00004571 using_temporary_offsets = TRUE;
4572 DPRINTF(("Got memory to hold back references\n"));
Guido van Rossum50700601997-12-08 17:15:20 +00004573 }
4574else match_block.offset_vector = offsets;
4575
4576match_block.offset_end = ocount;
4577match_block.offset_overflow = FALSE;
4578
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004579/* Compute the minimum number of offsets that we need to reset each time. Doing
4580this makes a huge difference to execution time when there aren't many brackets
4581in the pattern. */
4582
4583resetcount = 2 + re->top_bracket * 2;
Guido van Rossum50700601997-12-08 17:15:20 +00004584if (resetcount > offsetcount) resetcount = ocount;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004585
4586/* If MULTILINE is set at exec time but was not set at compile time, and the
4587anchored flag is set, we must re-check because a setting provoked by ^ in the
4588pattern is not right in multi-line mode. Calling is_anchored() again here does
4589the right check, because multiline is now set. If it now yields FALSE, the
4590expression must have had ^ starting some of its branches. Check to see if
4591that is true for *all* branches, and if so, set the startline flag. */
4592
Guido van Rossum557dea11997-12-22 22:46:52 +00004593if (match_block.multiline && anchored && (re->options & PCRE_MULTILINE) == 0 &&
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004594 !is_anchored(re->code, match_block.multiline))
4595 {
4596 anchored = FALSE;
4597 if (is_startline(re->code)) startline = TRUE;
4598 }
4599
4600/* Set up the first character to match, if available. The first_char value is
4601never set for an anchored regular expression, but the anchoring may be forced
4602at run time, so we have to test for anchoring. The first char may be unset for
4603an unanchored pattern, of course. If there's no first char and the pattern was
4604studied, the may be a bitmap of possible first characters. However, we can
4605use this only if the caseless state of the studying was correct. */
4606
4607if (!anchored)
4608 {
4609 if ((re->options & PCRE_FIRSTSET) != 0)
4610 {
4611 first_char = re->first_char;
4612 if (match_block.caseless) first_char = pcre_lcc[first_char];
4613 }
4614 else
4615 if (!startline && extra != NULL &&
4616 (extra->options & PCRE_STUDY_MAPPED) != 0 &&
4617 ((extra->options & PCRE_STUDY_CASELESS) != 0) == match_block.caseless)
4618 start_bits = extra->start_bits;
4619 }
4620
4621/* Loop for unanchored matches; for anchored regexps the loop runs just once. */
4622
4623do
4624 {
Guido van Rossum557dea11997-12-22 22:46:52 +00004625 int rc;
Guido van Rossum50700601997-12-08 17:15:20 +00004626 register int *iptr = match_block.offset_vector;
4627 register int *iend = iptr + resetcount;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004628
4629 /* Reset the maximum number of extractions we might see. */
4630
4631 while (iptr < iend) *iptr++ = -1;
4632
4633 /* Advance to a unique first char if possible */
4634
4635 if (first_char >= 0)
4636 {
4637 if (match_block.caseless)
4638 while (start_match < end_subject && pcre_lcc[*start_match] != first_char)
4639 start_match++;
4640 else
4641 while (start_match < end_subject && *start_match != first_char)
4642 start_match++;
4643 }
4644
4645 /* Or to just after \n for a multiline match if possible */
4646
4647 else if (startline)
4648 {
4649 if (start_match > match_block.start_subject)
4650 {
4651 while (start_match < end_subject && start_match[-1] != '\n')
4652 start_match++;
4653 }
4654 }
4655
4656 /* Or to a non-unique first char */
4657
4658 else if (start_bits != NULL)
4659 {
4660 while (start_match < end_subject)
4661 {
4662 register int c = *start_match;
Guido van Rossum50700601997-12-08 17:15:20 +00004663 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004664 }
4665 }
4666
Guido van Rossum557dea11997-12-22 22:46:52 +00004667#ifdef DEBUG /* Sigh. Some compilers never learn. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004668 printf(">>>> Match against: ");
4669 pchars(start_match, end_subject - start_match, TRUE, &match_block);
4670 printf("\n");
Guido van Rossum57ba4f31997-12-02 20:40:28 +00004671#endif
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004672
4673 /* When a match occurs, substrings will be set for all internal extractions;
4674 we just need to set up the whole thing as substring 0 before returning. If
Guido van Rossum50700601997-12-08 17:15:20 +00004675 there were too many extractions, set the return code to zero. In the case
4676 where we had to get some local store to hold offsets for backreferences, copy
4677 those back references that we can. In this case there need not be overflow
4678 if certain parts of the pattern were not used.
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004679
Guido van Rossum50700601997-12-08 17:15:20 +00004680 Before starting the match, we have to set up a longjmp() target to enable
Guido van Rossum557dea11997-12-22 22:46:52 +00004681 the "cut" operation to fail a match completely without backtracking. This
4682 is done in a separate function to avoid compiler warnings. We need not do
4683 it unless PCRE_EXTRA is set, since only in that case is the "cut" operation
4684 enabled. */
Guido van Rossum50700601997-12-08 17:15:20 +00004685
4686 /* To handle errors such as running out of memory for the failure
4687 stack, we need to save this location via setjmp(), so
4688 error-handling code can call longjmp() to jump out of deeply-nested code. */
4689 if (setjmp(match_block.error_env)==0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004690 {
Guido van Rossum50700601997-12-08 17:15:20 +00004691
Guido van Rossum557dea11997-12-22 22:46:52 +00004692 if ((re->options & PCRE_EXTRA) != 0)
Guido van Rossum50700601997-12-08 17:15:20 +00004693 {
Guido van Rossum557dea11997-12-22 22:46:52 +00004694 if (!match_with_setjmp(start_match, re->code, 2, &match_block))
4695 continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004696 }
Guido van Rossum557dea11997-12-22 22:46:52 +00004697 else if (!match(start_match, re->code, 2, &match_block)) continue;
4698
4699 /* Copy the offset information from temporary store if necessary */
4700
4701 if (using_temporary_offsets)
4702 {
4703 if (offsetcount >= 4)
4704 {
4705 memcpy(offsets + 2, match_block.offset_vector + 2,
4706 (offsetcount - 2) * sizeof(int));
4707 DPRINTF(("Copied offsets from temporary memory\n"));
4708 }
4709 if (match_block.end_offset_top > offsetcount)
4710 match_block.offset_overflow = TRUE;
4711
4712 DPRINTF(("Freeing temporary memory\n"));
4713 (pcre_free)(match_block.offset_vector);
4714 }
4715
4716 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
4717
4718 if (match_block.offset_end < 2) rc = 0; else
4719 {
4720 offsets[0] = start_match - match_block.start_subject;
4721 offsets[1] = match_block.end_match_ptr - match_block.start_subject;
4722 }
4723
4724 DPRINTF((">>>> returning %d\n", rc));
4725 free_stack(&match_block);
4726 return rc;
Guido van Rossum50700601997-12-08 17:15:20 +00004727 } /* End of (if setjmp(match_block.error_env)...) */
Guido van Rossum042ff9e1998-04-03 21:13:31 +00004728 free_stack(&match_block);
4729
Guido van Rossum50700601997-12-08 17:15:20 +00004730 /* Return an error code; pcremodule.c will preserve the exception */
4731 if (PyErr_Occurred()) return PCRE_ERROR_NOMEMORY;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004732 }
4733while (!anchored &&
4734 match_block.errorcode == PCRE_ERROR_NOMATCH &&
4735 start_match++ < end_subject);
4736
Guido van Rossum557dea11997-12-22 22:46:52 +00004737if (using_temporary_offsets)
4738 {
4739 DPRINTF(("Freeing temporary memory\n"));
4740 (pcre_free)(match_block.offset_vector);
4741 }
4742
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004743#ifdef DEBUG
4744printf(">>>> returning %d\n", match_block.errorcode);
4745#endif
Guido van Rossum50700601997-12-08 17:15:20 +00004746
Guido van Rossum557dea11997-12-22 22:46:52 +00004747 return match_block.errorcode;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004748}
4749
4750/* End of pcre.c */