blob: 4e474c59375a8b055b4bb0d16b9f95b4d925f0cf [file] [log] [blame]
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001
2/*************************************************
3* Perl-Compatible Regular Expressions *
4*************************************************/
5
6/* DO NOT EDIT THIS FILE! */
7
8/* This file is automatically written by the merge-files.py script
9included with the PCRE distribution for Python; it's produced from
10several C files, and code is removed in the process. If you want to
11modify the code or track down bugs, it will be much easier to work
12with the code in its original, multiple-file form. Don't edit this
13file by hand, or submit patches to it.
14
15The Python-specific PCRE distribution can be retrieved from
16 http://starship.skyport.net/crew/amk/regex/
17
Guido van Rossum58132c61997-12-17 00:24:13 +000018The unmodified original PCRE distribution is available at
19ftp://ftp.cus.cam.ac.uk/pub/software/programs/pcre/, and is originally
20written by: Philip Hazel <ph10@cam.ac.uk>
Guido van Rossum51b3aa31997-10-06 14:43:11 +000021
22Extensively modified by the Python String-SIG: <string-sig@python.org>
23Send bug reports to: <string-sig@python.org>
24(They'll figure out if it's a bug in PCRE or in the Python-specific
25changes.)
26
27 Copyright (c) 1997 University of Cambridge
28
29-----------------------------------------------------------------------------
30Permission is granted to anyone to use this software for any purpose on any
31computer system, and to redistribute it freely, subject to the following
32restrictions:
33
341. This software is distributed in the hope that it will be useful,
35 but WITHOUT ANY WARRANTY; without even the implied warranty of
36 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
37
382. The origin of this software must not be misrepresented, either by
39 explicit claim or by omission.
40
413. Altered versions must be plainly marked as such, and must not be
42 misrepresented as being the original software.
43-----------------------------------------------------------------------------
44*/
45
46
47#define FOR_PYTHON
Guido van Rossum58132c61997-12-17 00:24:13 +000048#include "pcre-int.h"
Guido van Rossum51b3aa31997-10-06 14:43:11 +000049#include "Python.h"
Guido van Rossum50700601997-12-08 17:15:20 +000050#include "mymalloc.h"
51#include <ctype.h>
Guido van Rossum51b3aa31997-10-06 14:43:11 +000052#include "graminit.h"
53
54/*************************************************
55* Perl-Compatible Regular Expressions *
56*************************************************/
57
58/* This file is automatically written by the makechartables auxiliary
59program. If you edit it by hand, you might like to edit the Makefile to
60prevent its ever being regenerated. */
61
62/* This table is a lower casing table. */
63
64unsigned char pcre_lcc[] = {
65 0, 1, 2, 3, 4, 5, 6, 7,
66 8, 9, 10, 11, 12, 13, 14, 15,
67 16, 17, 18, 19, 20, 21, 22, 23,
68 24, 25, 26, 27, 28, 29, 30, 31,
69 32, 33, 34, 35, 36, 37, 38, 39,
70 40, 41, 42, 43, 44, 45, 46, 47,
71 48, 49, 50, 51, 52, 53, 54, 55,
72 56, 57, 58, 59, 60, 61, 62, 63,
73 64, 97, 98, 99,100,101,102,103,
74 104,105,106,107,108,109,110,111,
75 112,113,114,115,116,117,118,119,
76 120,121,122, 91, 92, 93, 94, 95,
77 96, 97, 98, 99,100,101,102,103,
78 104,105,106,107,108,109,110,111,
79 112,113,114,115,116,117,118,119,
80 120,121,122,123,124,125,126,127,
81 128,129,130,131,132,133,134,135,
82 136,137,138,139,140,141,142,143,
83 144,145,146,147,148,149,150,151,
84 152,153,154,155,156,157,158,159,
85 160,161,162,163,164,165,166,167,
86 168,169,170,171,172,173,174,175,
87 176,177,178,179,180,181,182,183,
88 184,185,186,187,188,189,190,191,
89 192,193,194,195,196,197,198,199,
90 200,201,202,203,204,205,206,207,
91 208,209,210,211,212,213,214,215,
92 216,217,218,219,220,221,222,223,
93 224,225,226,227,228,229,230,231,
94 232,233,234,235,236,237,238,239,
95 240,241,242,243,244,245,246,247,
96 248,249,250,251,252,253,254,255 };
97
Guido van Rossum50700601997-12-08 17:15:20 +000098/* This table is a case flipping table. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +000099
Guido van Rossum50700601997-12-08 17:15:20 +0000100unsigned char pcre_fcc[] = {
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000101 0, 1, 2, 3, 4, 5, 6, 7,
102 8, 9, 10, 11, 12, 13, 14, 15,
103 16, 17, 18, 19, 20, 21, 22, 23,
104 24, 25, 26, 27, 28, 29, 30, 31,
105 32, 33, 34, 35, 36, 37, 38, 39,
106 40, 41, 42, 43, 44, 45, 46, 47,
107 48, 49, 50, 51, 52, 53, 54, 55,
108 56, 57, 58, 59, 60, 61, 62, 63,
Guido van Rossum50700601997-12-08 17:15:20 +0000109 64, 97, 98, 99,100,101,102,103,
110 104,105,106,107,108,109,110,111,
111 112,113,114,115,116,117,118,119,
112 120,121,122, 91, 92, 93, 94, 95,
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000113 96, 65, 66, 67, 68, 69, 70, 71,
114 72, 73, 74, 75, 76, 77, 78, 79,
115 80, 81, 82, 83, 84, 85, 86, 87,
116 88, 89, 90,123,124,125,126,127,
117 128,129,130,131,132,133,134,135,
118 136,137,138,139,140,141,142,143,
119 144,145,146,147,148,149,150,151,
120 152,153,154,155,156,157,158,159,
121 160,161,162,163,164,165,166,167,
122 168,169,170,171,172,173,174,175,
123 176,177,178,179,180,181,182,183,
124 184,185,186,187,188,189,190,191,
125 192,193,194,195,196,197,198,199,
126 200,201,202,203,204,205,206,207,
127 208,209,210,211,212,213,214,215,
128 216,217,218,219,220,221,222,223,
129 224,225,226,227,228,229,230,231,
130 232,233,234,235,236,237,238,239,
131 240,241,242,243,244,245,246,247,
132 248,249,250,251,252,253,254,255 };
133
Guido van Rossum50700601997-12-08 17:15:20 +0000134/* This table contains bit maps for digits, letters, 'word' chars, and
135white space. Each map is 32 bytes long and the bits run from the least
136significant end of each byte. */
137
138unsigned char pcre_cbits[] = {
139 0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
140 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
141 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
142 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
143
144 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
145 0xfe,0xff,0xff,0x07,0xfe,0xff,0xff,0x07,
146 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
147 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
148
149 0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03,
150 0xfe,0xff,0xff,0x87,0xfe,0xff,0xff,0x07,
151 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
152 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
153
154 0x00,0x3e,0x00,0x00,0x01,0x00,0x00,0x00,
155 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
156 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
157 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 };
158
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000159/* This table identifies various classes of character by individual bits:
Guido van Rossum50700601997-12-08 17:15:20 +0000160 0x01 white space character
161 0x02 letter
162 0x04 decimal digit
163 0x08 hexadecimal digit
164 0x10 alphanumeric or '_'
165 0x80 regular expression metacharacter or binary zero
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000166*/
167
168unsigned char pcre_ctypes[] = {
169 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
170 0x00,0x01,0x01,0x01,0x01,0x01,0x00,0x00, /* 8- 15 */
171 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
172 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
173 0x01,0x00,0x00,0x00,0x80,0x00,0x00,0x00, /* - ' */
174 0x80,0x80,0x80,0x80,0x00,0x00,0x80,0x00, /* ( - / */
Guido van Rossum50700601997-12-08 17:15:20 +0000175 0x3c,0x3c,0x3c,0x3c,0x3c,0x3c,0x3c,0x3c, /* 0 - 7 */
176 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x80, /* 8 - ? */
177 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* @ - G */
178 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* H - O */
179 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* P - W */
180 0x12,0x12,0x12,0x80,0x00,0x00,0x80,0x10, /* X - _ */
181 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* ` - g */
182 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* h - o */
183 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* p - w */
184 0x12,0x12,0x12,0x80,0x80,0x00,0x00,0x00, /* x -127 */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000185 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
186 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
187 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
188 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
189 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
190 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
191 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
192 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
193 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
194 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
195 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
196 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
197 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
198 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
199 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
200 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
201
Guido van Rossum50700601997-12-08 17:15:20 +0000202/* End of chartables.c */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000203/*************************************************
204* Perl-Compatible Regular Expressions *
205*************************************************/
206
207/*
208This is a library of functions to support regular expressions whose syntax
209and semantics are as close as possible to those of the Perl 5 language. See
210the file Tech.Notes for some information on the internals.
211
212Written by: Philip Hazel <ph10@cam.ac.uk>
213
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000214 Copyright (c) 1998 University of Cambridge
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000215
216-----------------------------------------------------------------------------
217Permission is granted to anyone to use this software for any purpose on any
218computer system, and to redistribute it freely, subject to the following
219restrictions:
220
2211. This software is distributed in the hope that it will be useful,
222 but WITHOUT ANY WARRANTY; without even the implied warranty of
223 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
224
2252. The origin of this software must not be misrepresented, either by
226 explicit claim or by omission.
227
2283. Altered versions must be plainly marked as such, and must not be
229 misrepresented as being the original software.
230-----------------------------------------------------------------------------
231*/
232
233
234/* Include the internals header, which itself includes Standard C headers plus
235the external pcre header. */
236
237
238
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000239
240/*************************************************
241* Create bitmap of starting chars *
242*************************************************/
243
244/* This function scans a compiled unanchored expression and attempts to build a
245bitmap of the set of initial characters. If it can't, it returns FALSE. As time
246goes by, we may be able to get more clever at doing this.
247
248Arguments:
249 code points to an expression
250 start_bits points to a 32-byte table, initialized to 0
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000251
252Returns: TRUE if table built, FALSE otherwise
253*/
254
255static BOOL
Guido van Rossum58132c61997-12-17 00:24:13 +0000256set_start_bits(const uschar *code, uschar *start_bits)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000257{
Guido van Rossum50700601997-12-08 17:15:20 +0000258register int c;
259
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000260do
261 {
Guido van Rossum58132c61997-12-17 00:24:13 +0000262 const uschar *tcode = code + 3;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000263 BOOL try_next = TRUE;
264
265 while (try_next)
266 {
267 try_next = FALSE;
268
269 if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
270 {
Guido van Rossum50700601997-12-08 17:15:20 +0000271 if (!set_start_bits(tcode, start_bits)) return FALSE;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000272 }
273
274 else switch(*tcode)
275 {
276 default:
277 return FALSE;
278
279 /* BRAZERO does the bracket, but carries on. */
280
281 case OP_BRAZERO:
282 case OP_BRAMINZERO:
Guido van Rossum50700601997-12-08 17:15:20 +0000283 if (!set_start_bits(++tcode, start_bits)) return FALSE;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000284 do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
285 tcode += 3;
286 try_next = TRUE;
287 break;
288
289 /* Single-char * or ? sets the bit and tries the next item */
290
291 case OP_STAR:
292 case OP_MINSTAR:
293 case OP_QUERY:
294 case OP_MINQUERY:
Guido van Rossum50700601997-12-08 17:15:20 +0000295 start_bits[tcode[1]/8] |= (1 << (tcode[1]&7));
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000296 tcode += 2;
297 try_next = TRUE;
298 break;
299
300 /* Single-char upto sets the bit and tries the next */
301
302 case OP_UPTO:
303 case OP_MINUPTO:
Guido van Rossum50700601997-12-08 17:15:20 +0000304 start_bits[tcode[3]/8] |= (1 << (tcode[3]&7));
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000305 tcode += 4;
306 try_next = TRUE;
307 break;
308
309 /* At least one single char sets the bit and stops */
310
311 case OP_EXACT: /* Fall through */
312 tcode++;
313
314 case OP_CHARS: /* Fall through */
315 tcode++;
316
317 case OP_PLUS:
318 case OP_MINPLUS:
Guido van Rossum50700601997-12-08 17:15:20 +0000319 start_bits[tcode[1]/8] |= (1 << (tcode[1]&7));
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000320 break;
321
322 /* Single character type sets the bits and stops */
323
324 case OP_NOT_DIGIT:
Guido van Rossum50700601997-12-08 17:15:20 +0000325 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_digit];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000326 break;
327
328 case OP_DIGIT:
Guido van Rossum50700601997-12-08 17:15:20 +0000329 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_digit];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000330 break;
331
332 case OP_NOT_WHITESPACE:
Guido van Rossum50700601997-12-08 17:15:20 +0000333 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_space];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000334 break;
335
336 case OP_WHITESPACE:
Guido van Rossum50700601997-12-08 17:15:20 +0000337 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_space];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000338 break;
339
340 case OP_NOT_WORDCHAR:
Guido van Rossum50700601997-12-08 17:15:20 +0000341 for (c = 0; c < 32; c++)
342 start_bits[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000343 break;
344
345 case OP_WORDCHAR:
Guido van Rossum50700601997-12-08 17:15:20 +0000346 for (c = 0; c < 32; c++)
347 start_bits[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000348 break;
349
350 /* One or more character type fudges the pointer and restarts, knowing
351 it will hit a single character type and stop there. */
352
353 case OP_TYPEPLUS:
354 case OP_TYPEMINPLUS:
355 tcode++;
356 try_next = TRUE;
357 break;
358
359 case OP_TYPEEXACT:
360 tcode += 3;
361 try_next = TRUE;
362 break;
363
364 /* Zero or more repeats of character types set the bits and then
365 try again. */
366
367 case OP_TYPEUPTO:
368 case OP_TYPEMINUPTO:
369 tcode += 2; /* Fall through */
370
371 case OP_TYPESTAR:
372 case OP_TYPEMINSTAR:
373 case OP_TYPEQUERY:
374 case OP_TYPEMINQUERY:
375 switch(tcode[1])
376 {
377 case OP_NOT_DIGIT:
Guido van Rossum50700601997-12-08 17:15:20 +0000378 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_digit];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000379 break;
380
381 case OP_DIGIT:
Guido van Rossum50700601997-12-08 17:15:20 +0000382 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_digit];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000383 break;
384
385 case OP_NOT_WHITESPACE:
Guido van Rossum50700601997-12-08 17:15:20 +0000386 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_space];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000387 break;
388
389 case OP_WHITESPACE:
Guido van Rossum50700601997-12-08 17:15:20 +0000390 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_space];
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000391 break;
392
393 case OP_NOT_WORDCHAR:
Guido van Rossum50700601997-12-08 17:15:20 +0000394 for (c = 0; c < 32; c++)
395 start_bits[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000396 break;
397
398 case OP_WORDCHAR:
Guido van Rossum50700601997-12-08 17:15:20 +0000399 for (c = 0; c < 32; c++)
400 start_bits[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000401 break;
402 }
403
404 tcode += 2;
405 try_next = TRUE;
406 break;
407
408 /* Character class: set the bits and either carry on or not,
409 according to the repeat count. */
410
411 case OP_CLASS:
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000412 case OP_NEGCLASS:
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000413 {
Guido van Rossum50700601997-12-08 17:15:20 +0000414 tcode++;
415 for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
416 tcode += 32;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000417 switch (*tcode)
418 {
419 case OP_CRSTAR:
420 case OP_CRMINSTAR:
421 case OP_CRQUERY:
422 case OP_CRMINQUERY:
423 tcode++;
424 try_next = TRUE;
425 break;
426
427 case OP_CRRANGE:
428 case OP_CRMINRANGE:
429 if (((tcode[1] << 8) + tcode[2]) == 0)
430 {
431 tcode += 5;
432 try_next = TRUE;
433 }
434 break;
435 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000436 }
437 break; /* End of class handling */
438
439 } /* End of switch */
440 } /* End of try_next loop */
441
442 code += (code[1] << 8) + code[2]; /* Advance to next branch */
443 }
444while (*code == OP_ALT);
445return TRUE;
446}
447
448
449
450/*************************************************
451* Study a compiled expression *
452*************************************************/
453
454/* This function is handed a compiled expression that it must study to produce
455information that will speed up the matching. It returns a pcre_extra block
456which then gets handed back to pcre_exec().
457
458Arguments:
459 re points to the compiled expression
460 options contains option bits
461 errorptr points to where to place error messages;
462 set NULL unless error
463
464Returns: pointer to a pcre_extra block,
465 NULL on error or if no optimization possible
466*/
467
468pcre_extra *
Guido van Rossum58132c61997-12-17 00:24:13 +0000469pcre_study(const pcre *external_re, int options, const char **errorptr)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000470{
471BOOL caseless;
472uschar start_bits[32];
473real_pcre_extra *extra;
Guido van Rossum58132c61997-12-17 00:24:13 +0000474const real_pcre *re = (const real_pcre *)external_re;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000475
476*errorptr = NULL;
477
478if (re == NULL || re->magic_number != MAGIC_NUMBER)
479 {
480 *errorptr = "argument is not a compiled regular expression";
481 return NULL;
482 }
483
484if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
485 {
486 *errorptr = "unknown or incorrect option bit(s) set";
487 return NULL;
488 }
489
Guido van Rossum50700601997-12-08 17:15:20 +0000490/* Caseless can either be from the compiled regex or from options. */
491
492caseless = ((re->options | options) & PCRE_CASELESS) != 0;
493
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000494/* For an anchored pattern, or an unchored pattern that has a first char, or a
495multiline pattern that matches only at "line starts", no further processing at
496present. */
497
498if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
499 return NULL;
500
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000501/* See if we can find a fixed set of initial characters for the pattern. */
502
Guido van Rossum50700601997-12-08 17:15:20 +0000503memset(start_bits, 0, 32 * sizeof(uschar));
504if (!set_start_bits(re->code, start_bits)) return NULL;
505
506/* If this studying is caseless, scan the created bit map and duplicate the
507bits for any letters. */
508
509if (caseless)
510 {
511 register int c;
512 for (c = 0; c < 256; c++)
513 {
514 if ((start_bits[c/8] & (1 << (c&7))) != 0 &&
515 (pcre_ctypes[c] & ctype_letter) != 0)
516 {
517 int d = pcre_fcc[c];
518 start_bits[d/8] |= (1 << (d&7));
519 }
520 }
521 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000522
523/* Get an "extra" block and put the information therein. */
524
525extra = (real_pcre_extra *)(pcre_malloc)(sizeof(real_pcre_extra));
526
527if (extra == NULL)
528 {
529 *errorptr = "failed to get memory";
530 return NULL;
531 }
Guido van Rossum50700601997-12-08 17:15:20 +0000532
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000533extra->options = PCRE_STUDY_MAPPED | (caseless? PCRE_STUDY_CASELESS : 0);
Guido van Rossum50700601997-12-08 17:15:20 +0000534memcpy(extra->start_bits, start_bits, sizeof(start_bits));
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000535
536return (pcre_extra *)extra;
537}
538
Guido van Rossum50700601997-12-08 17:15:20 +0000539/* End of study.c */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000540/*************************************************
541* Perl-Compatible Regular Expressions *
542*************************************************/
543
544/*
545This is a library of functions to support regular expressions whose syntax
546and semantics are as close as possible to those of the Perl 5 language. See
547the file Tech.Notes for some information on the internals.
548
549Written by: Philip Hazel <ph10@cam.ac.uk>
550
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000551 Copyright (c) 1998 University of Cambridge
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000552
553-----------------------------------------------------------------------------
554Permission is granted to anyone to use this software for any purpose on any
555computer system, and to redistribute it freely, subject to the following
556restrictions:
557
5581. This software is distributed in the hope that it will be useful,
559 but WITHOUT ANY WARRANTY; without even the implied warranty of
560 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
561
5622. The origin of this software must not be misrepresented, either by
563 explicit claim or by omission.
564
5653. Altered versions must be plainly marked as such, and must not be
566 misrepresented as being the original software.
567-----------------------------------------------------------------------------
568*/
569
570
571/* Define DEBUG to get debugging output on stdout. */
572
573/* #define DEBUG */
574
Guido van Rossum557dea11997-12-22 22:46:52 +0000575/* Use a macro for debugging printing, 'cause that eliminates the the use
576of #ifdef inline, and there are *still* stupid compilers about that don't like
577indented pre-processor statements. I suppose it's only been 10 years... */
578
579#ifdef DEBUG
580#define DPRINTF(p) printf p
581#else
582#define DPRINTF(p) /*nothing*/
583#endif
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000584
585/* Include the internals header, which itself includes Standard C headers plus
586the external pcre header. */
587
588
Guido van Rossum50700601997-12-08 17:15:20 +0000589
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000590
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000591#ifndef Py_eval_input
592/* For Python 1.4, graminit.h has to be explicitly included */
593#define Py_eval_input eval_input
Guido van Rossum50700601997-12-08 17:15:20 +0000594
595#endif /* FOR_PYTHON */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000596
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000597/* Allow compilation as C++ source code, should anybody want to do that. */
598
599#ifdef __cplusplus
600#define class pcre_class
601#endif
602
603
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000604/* Min and max values for the common repeats; for the maxima, 0 => infinity */
605
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000606static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
607static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000608
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000609/* Text forms of OP_ values and things, for debugging (not all used) */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000610
611#ifdef DEBUG
Guido van Rossum58132c61997-12-17 00:24:13 +0000612static const char *OP_names[] = {
613 "End", "\\A", "\\B", "\\b", "\\D", "\\d",
Guido van Rossum50700601997-12-08 17:15:20 +0000614 "\\S", "\\s", "\\W", "\\w", "Cut", "\\Z",
615 "localized \\B", "localized \\b", "localized \\W", "localized \\w",
616 "^", "$", "Any", "chars",
617 "not",
618 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000619 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
620 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
621 "*", "*?", "+", "+?", "?", "??", "{", "{",
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000622 "class", "negclass", "classL", "Ref",
Guido van Rossum50700601997-12-08 17:15:20 +0000623 "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", "Once",
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000624 "Brazero", "Braminzero", "Bra"
625};
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000626#endif
627
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000628/* Table for handling escaped characters in the range '0'-'z'. Positive returns
629are simple data values; negative values are for special things like \d and so
630on. Zero means further processing is needed (for things like \x), or the escape
631is invalid. */
632
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000633static const short int escapes[] = {
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000634 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
635 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
636 '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
637 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
638 0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
639 0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
640 '`', 7, -ESC_b, 0, -ESC_d, 0, '\f', 0, /* ` - g */
641 0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
642 0, 0, '\r', -ESC_s, '\t', 0, '\v', -ESC_w, /* p - w */
643 0, 0, 0 /* x - z */
644};
645
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000646/* Definition to allow mutual recursion */
647
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000648static BOOL
649compile_regex(int, int *, uschar **, const uschar **, const char **,
650 PyObject *);
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000651
652/* Structure for passing "static" information around between the functions
653doing the matching, so that they are thread-safe. */
654
655typedef struct match_data {
656 int errorcode; /* As it says */
657 int *offset_vector; /* Offset vector */
658 int offset_end; /* One past the end */
659 BOOL offset_overflow; /* Set if too many extractions */
660 BOOL caseless; /* Case-independent flag */
Guido van Rossum50700601997-12-08 17:15:20 +0000661 BOOL runtime_caseless; /* Caseless forced at run time */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000662 BOOL multiline; /* Multiline flag */
Guido van Rossum50700601997-12-08 17:15:20 +0000663 BOOL notbol; /* NOTBOL flag */
664 BOOL noteol; /* NOTEOL flag */
665 BOOL dotall; /* Dot matches any char */
666 BOOL endonly; /* Dollar not before final \n */
Guido van Rossum58132c61997-12-17 00:24:13 +0000667 const uschar *start_subject; /* Start of the subject string */
668 const uschar *end_subject; /* End of the subject string */
Guido van Rossum50700601997-12-08 17:15:20 +0000669 jmp_buf fail_env; /* Environment for longjump() break out */
Guido van Rossum58132c61997-12-17 00:24:13 +0000670 const uschar *end_match_ptr; /* Subject position at end match */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000671 int end_offset_top; /* Highwater mark at end of match */
Guido van Rossum50700601997-12-08 17:15:20 +0000672 jmp_buf error_env; /* For longjmp() if an error occurs deep inside a
673 matching operation */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000674 int length; /* Length of the allocated stacks */
675 int point; /* Point to add next item pushed onto stacks */
676 /* Pointers to the 6 stacks */
677 int *off_num, *offset_top, *r1, *r2;
Guido van Rossum58132c61997-12-17 00:24:13 +0000678 const uschar **eptr, **ecode;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000679} match_data;
680
681
682
683/*************************************************
Guido van Rossum50700601997-12-08 17:15:20 +0000684* Global variables *
685*************************************************/
686
687/* PCRE is thread-clean and doesn't use any global variables in the normal
688sense. However, it calls memory allocation and free functions via the two
689indirections below, which are can be changed by the caller, but are shared
690between all threads. */
691
692void *(*pcre_malloc)(size_t) = malloc;
693void (*pcre_free)(void *) = free;
694
695
696
697
698/*************************************************
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000699* Return version string *
700*************************************************/
701
Guido van Rossum58132c61997-12-17 00:24:13 +0000702const char *
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000703pcre_version(void)
704{
705return PCRE_VERSION;
706}
707
708
709
710
711/*************************************************
712* Return info about a compiled pattern *
713*************************************************/
714
715/* This function picks potentially useful data out of the private
716structure.
717
718Arguments:
719 external_re points to compiled code
720 optptr where to pass back the options
721 first_char where to pass back the first character,
722 or -1 if multiline and all branches start ^,
723 or -2 otherwise
724
725Returns: number of identifying extraction brackets
726 or negative values on error
727*/
728
729int
Guido van Rossum50700601997-12-08 17:15:20 +0000730pcre_info(const pcre *external_re, int *optptr, int *first_char)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000731{
Guido van Rossum58132c61997-12-17 00:24:13 +0000732const real_pcre *re = (real_pcre *)external_re;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000733if (re == NULL) return PCRE_ERROR_NULL;
734if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
735if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);
736if (first_char != NULL)
737 *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
738 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
739return re->top_bracket;
740}
741
742
743
744
745#ifdef DEBUG
746/*************************************************
747* Debugging function to print chars *
748*************************************************/
749
750/* Print a sequence of chars in printable format, stopping at the end of the
751subject if the requested.
752
753Arguments:
754 p points to characters
755 length number to print
756 is_subject TRUE if printing from within md->start_subject
757 md pointer to matching data block, if is_subject is TRUE
758
759Returns: nothing
760*/
761
Guido van Rossum557dea11997-12-22 22:46:52 +0000762static void
763pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000764{
765int c;
766if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
767while (length-- > 0)
768 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
769}
770#endif
771
772
773
774
775/*************************************************
776* Check subpattern for empty operand *
777*************************************************/
778
779/* This function checks a bracketed subpattern to see if any of the paths
780through it could match an empty string. This is used to diagnose an error if
781such a subpattern is followed by a quantifier with an unlimited upper bound.
782
783Argument:
784 code points to the opening bracket
785
786Returns: TRUE or FALSE
787*/
788
789static BOOL
790could_be_empty(uschar *code)
791{
792do {
793 uschar *cc = code + 3;
794
795 /* Scan along the opcodes for this branch; as soon as we find something
796 that matches a non-empty string, break out and advance to test the next
797 branch. If we get to the end of the branch, return TRUE for the whole
798 sub-expression. */
799
800 for (;;)
801 {
802 /* Test an embedded subpattern; if it could not be empty, break the
803 loop. Otherwise carry on in the branch. */
804
Guido van Rossum50700601997-12-08 17:15:20 +0000805 if ((int)(*cc) >= OP_BRA || (int)(*cc) == OP_ONCE)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000806 {
807 if (!could_be_empty(cc)) break;
808 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
809 cc += 3;
810 }
811
812 else switch (*cc)
813 {
814 /* Reached end of a branch: the subpattern may match the empty string */
815
816 case OP_ALT:
817 case OP_KET:
818 case OP_KETRMAX:
819 case OP_KETRMIN:
820 return TRUE;
821
Guido van Rossumd0f432b1998-02-20 21:45:14 +0000822 /* Skip over entire bracket groups with zero lower bound */
823
824 case OP_BRAZERO:
825 case OP_BRAMINZERO:
826 cc++;
827 /* Fall through */
828
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000829 /* Skip over assertive subpatterns */
830
831 case OP_ASSERT:
832 case OP_ASSERT_NOT:
833 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
834 cc += 3;
835 break;
836
837 /* Skip over things that don't match chars */
838
839 case OP_SOD:
840 case OP_EOD:
841 case OP_CIRC:
842 case OP_DOLL:
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000843 case OP_NOT_WORD_BOUNDARY:
844 case OP_WORD_BOUNDARY:
Guido van Rossum50700601997-12-08 17:15:20 +0000845 case OP_NOT_WORD_BOUNDARY_L:
846 case OP_WORD_BOUNDARY_L:
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000847 cc++;
848 break;
849
850 /* Skip over simple repeats with zero lower bound */
851
852 case OP_STAR:
853 case OP_MINSTAR:
854 case OP_QUERY:
855 case OP_MINQUERY:
Guido van Rossum50700601997-12-08 17:15:20 +0000856 case OP_NOTSTAR:
857 case OP_NOTMINSTAR:
858 case OP_NOTQUERY:
859 case OP_NOTMINQUERY:
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000860 case OP_TYPESTAR:
861 case OP_TYPEMINSTAR:
862 case OP_TYPEQUERY:
863 case OP_TYPEMINQUERY:
864 cc += 2;
865 break;
866
867 /* Skip over UPTOs (lower bound is zero) */
868
869 case OP_UPTO:
870 case OP_MINUPTO:
871 case OP_TYPEUPTO:
872 case OP_TYPEMINUPTO:
873 cc += 4;
874 break;
875
876 /* Check a class or a back reference for a zero minimum */
877
878 case OP_CLASS:
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000879 case OP_NEGCLASS:
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000880 case OP_REF:
Guido van Rossum50700601997-12-08 17:15:20 +0000881 case OP_CLASS_L:
882 switch(*cc)
883 {
884 case (OP_REF): cc += 2; break;
Guido van Rossum042ff9e1998-04-03 21:13:31 +0000885 case (OP_CLASS): case (OP_NEGCLASS): cc += 1+32; break;
Guido van Rossum50700601997-12-08 17:15:20 +0000886 case (OP_CLASS_L): cc += 1+1+32; break;
887 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000888
889 switch (*cc)
890 {
891 case OP_CRSTAR:
892 case OP_CRMINSTAR:
893 case OP_CRQUERY:
894 case OP_CRMINQUERY:
895 cc++;
896 break;
897
898 case OP_CRRANGE:
899 case OP_CRMINRANGE:
900 if ((cc[1] << 8) + cc[2] != 0) goto NEXT_BRANCH;
901 cc += 3;
902 break;
903
904 default:
905 goto NEXT_BRANCH;
906 }
907 break;
908
909 /* Anything else matches at least one character */
910
911 default:
912 goto NEXT_BRANCH;
913 }
914 }
915
916 NEXT_BRANCH:
917 code += (code[1] << 8) + code[2];
918 }
919while (*code == OP_ALT);
920
921/* No branches match the empty string */
922
923return FALSE;
924}
925
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000926/* Determine the length of a group ID in an expression like
927 (?P<foo_123>...)
928Arguments:
929 ptr pattern position pointer (say that 3 times fast)
930 finalchar the character that will mark the end of the ID
931 errorptr points to the pointer to the error message
932*/
933
934static int
Guido van Rossum58132c61997-12-17 00:24:13 +0000935get_group_id(const uschar *ptr, char finalchar, const char **errorptr)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000936{
Guido van Rossum58132c61997-12-17 00:24:13 +0000937 const uschar *start = ptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000938
939 /* If the first character is not in \w, or is in \w but is a digit,
940 report an error */
941 if (!(pcre_ctypes[*ptr] & ctype_word) ||
942 (pcre_ctypes[*ptr++] & ctype_digit))
943 {
944 *errorptr = "(?P identifier must start with a letter or underscore";
945 return 0;
946 }
947
948 /* Increment ptr until we either hit a null byte, the desired
949 final character, or a non-word character */
950 for(; (*ptr != 0) && (*ptr != finalchar) &&
951 (pcre_ctypes[*ptr] & ctype_word); ptr++)
952 {
Guido van Rossumc3861071997-10-08 02:07:40 +0000953 /* Empty loop body */
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000954 }
955 if (*ptr==finalchar)
956 return ptr-start;
957 if (*ptr==0)
958 {
959 *errorptr = "unterminated (?P identifier";
960 return 0;
961 }
962 *errorptr = "illegal character in (?P identifier";
963 return 0;
964}
965
966/*************************************************
967* Handle escapes *
968*************************************************/
969
970/* This function is called when a \ has been encountered. It either returns a
971positive value for a simple escape such as \n, or a negative value which
972encodes one of the more complicated things such as \d. On entry, ptr is
973pointing at the \. On exit, it is on the final character of the escape
974sequence.
975
976Arguments:
977 ptrptr points to the pattern position pointer
978 errorptr points to the pointer to the error message
Guido van Rossum50700601997-12-08 17:15:20 +0000979 bracount number of previous extracting brackets
980 options the options bits
981 isclass TRUE if inside a character class
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000982
983Returns: zero or positive => a data character
984 negative => a special escape sequence
985 on error, errorptr is set
986*/
987
988static int
Guido van Rossum58132c61997-12-17 00:24:13 +0000989check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
990 int options, BOOL isclass)
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000991{
Guido van Rossum58132c61997-12-17 00:24:13 +0000992const uschar *ptr = *ptrptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000993int c = *(++ptr) & 255; /* Ensure > 0 on signed-char systems */
994int i;
995
Guido van Rossum50700601997-12-08 17:15:20 +0000996if (c == 0) *errorptr = ERR1;
Guido van Rossum51b3aa31997-10-06 14:43:11 +0000997
998/* Digits or letters may have special meaning; all others are literals. */
999
1000else if (c < '0' || c > 'z') {}
1001
1002/* Do an initial lookup in a table. A non-zero result is something that can be
1003returned immediately. Otherwise further processing may be required. */
1004
1005else if ((i = escapes[c - '0']) != 0) c = i;
1006
1007/* Escapes that need further processing, or are illegal. */
1008
Guido van Rossum50700601997-12-08 17:15:20 +00001009else
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001010 {
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001011
Guido van Rossum50700601997-12-08 17:15:20 +00001012 switch (c)
1013 {
1014 /* The handling of escape sequences consisting of a string of digits
1015 starting with one that is not zero is not straightforward. By experiment,
1016 the way Perl works seems to be as follows:
1017
1018 Outside a character class, the digits are read as a decimal number. If the
1019 number is less than 10, or if there are that many previous extracting
1020 left brackets, then it is a back reference. Otherwise, up to three octal
1021 digits are read to form an escaped byte. Thus \123 is likely to be octal
1022 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
1023 value is greater than 377, the least significant 8 bits are taken. Inside a
1024 character class, \ followed by a digit is always an octal number. */
1025
1026 case '1': case '2': case '3': case '4': case '5':
1027 case '6': case '7': case '8': case '9':
1028
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001029 {
1030 /* PYTHON: Try to compute an octal value for a character */
Guido van Rossum042ff9e1998-04-03 21:13:31 +00001031 for(c=0, i=0; ptr[i]!=0 && i<3; i++)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001032 {
1033 if (( pcre_ctypes[ ptr[i] ] & ctype_odigit) != 0)
1034 c = c * 8 + ptr[i]-'0';
1035 else
Guido van Rossum042ff9e1998-04-03 21:13:31 +00001036 break; /* Non-octal character--break out of the loop */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001037 }
Guido van Rossum042ff9e1998-04-03 21:13:31 +00001038 /* It's a character if there were exactly 3 octal digits, or if
1039 we're inside a character class and there was at least one
1040 octal digit. */
1041 if ( (i == 3) || (isclass && i!=0) )
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001042 {
1043 ptr += i-1;
1044 break;
1045 }
1046 c = ptr[0]; /* Restore the first character after the \ */
1047 c -= '0'; i = 1;
1048 while (i<2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0)
1049 {
1050 c = c * 10 + ptr[1] - '0';
1051 ptr++; i++;
1052 }
1053 if (c > 255 - ESC_REF) *errorptr = "back reference too big";
1054 c = -(ESC_REF + c);
1055 }
1056 break;
1057
Guido van Rossum50700601997-12-08 17:15:20 +00001058 /* \0 always starts an octal number, but we may drop through to here with a
1059 larger first octal digit */
1060
1061 case '0':
1062 c -= '0';
1063 while(i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0 &&
1064 ptr[1] != '8' && ptr[1] != '9')
1065 c = c * 8 + *(++ptr) - '0';
1066 break;
1067
1068 /* Special escapes not starting with a digit are straightforward */
1069
1070 case 'x':
1071 c = 0;
1072 while ( (pcre_ctypes[ptr[1]] & ctype_xdigit) != 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001073 {
Guido van Rossum50700601997-12-08 17:15:20 +00001074 ptr++;
1075 c = c * 16 + pcre_lcc[*ptr] -
1076 (((pcre_ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
1077 c &= 255;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001078 }
1079 break;
1080
1081
Guido van Rossum50700601997-12-08 17:15:20 +00001082 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1083 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
1084 for Perl compatibility, it is a literal. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001085
Guido van Rossum50700601997-12-08 17:15:20 +00001086 default:
1087 if ((options & PCRE_EXTRA) != 0) switch(c)
1088 {
1089 case 'X':
1090 c = -ESC_X; /* This could be a lookup if it ever got into Perl */
1091 break;
1092
1093 default:
1094 *errorptr = ERR3;
1095 break;
1096 }
1097 break;
1098 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001099 }
1100
1101*ptrptr = ptr;
1102return c;
1103}
1104
1105
1106
1107/*************************************************
Guido van Rossum50700601997-12-08 17:15:20 +00001108* Check for counted repeat *
1109*************************************************/
1110
1111/* This function is called when a '{' is encountered in a place where it might
1112start a quantifier. It looks ahead to see if it really is a quantifier or not.
1113It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
1114where the ddds are digits.
1115
1116Arguments:
1117 p pointer to the first char after '{'
1118
1119Returns: TRUE or FALSE
1120*/
1121
1122static BOOL
Guido van Rossum58132c61997-12-17 00:24:13 +00001123is_counted_repeat(const uschar *p)
Guido van Rossum50700601997-12-08 17:15:20 +00001124{
1125if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
1126while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
1127if (*p == '}') return TRUE;
1128
1129if (*p++ != ',') return FALSE;
1130if (*p == '}') return TRUE;
1131
1132if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE;
1133while ((pcre_ctypes[*p] & ctype_digit) != 0) p++;
1134return (*p == '}');
1135}
1136
1137
1138
1139/*************************************************
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001140* Read repeat counts *
1141*************************************************/
1142
Guido van Rossum50700601997-12-08 17:15:20 +00001143/* Read an item of the form {n,m} and return the values. This is called only
1144after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1145so the syntax is guaranteed to be correct, but we need to check the values.
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001146
1147Arguments:
1148 p pointer to first char after '{'
1149 minp pointer to int for min
1150 maxp pointer to int for max
1151 returned as -1 if no max
1152 errorptr points to pointer to error message
1153
1154Returns: pointer to '}' on success;
1155 current ptr on error, with errorptr set
1156*/
1157
Guido van Rossum58132c61997-12-17 00:24:13 +00001158static const uschar *
1159read_repeat_counts(const uschar *p, int *minp, int *maxp, const char **errorptr)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001160{
1161int min = 0;
1162int max = -1;
1163
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001164while ((pcre_ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
1165
1166if (*p == '}') max = min; else
1167 {
Guido van Rossum50700601997-12-08 17:15:20 +00001168 if (*(++p) != '}')
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001169 {
1170 max = 0;
1171 while((pcre_ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001172 if (max < min)
1173 {
Guido van Rossum50700601997-12-08 17:15:20 +00001174 *errorptr = ERR4;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001175 return p;
1176 }
1177 }
1178 }
1179
1180/* Do paranoid checks, then fill in the required variables, and pass back the
1181pointer to the terminating '}'. */
1182
Guido van Rossum50700601997-12-08 17:15:20 +00001183if (min > 65535 || max > 65535)
1184 *errorptr = ERR5;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001185else
1186 {
1187 *minp = min;
1188 *maxp = max;
1189 }
1190return p;
1191}
1192
1193
1194
1195/*************************************************
1196* Compile one branch *
1197*************************************************/
1198
1199/* Scan the pattern, compiling it into the code vector.
1200
1201Arguments:
Guido van Rossum50700601997-12-08 17:15:20 +00001202 options the option bits
1203 bracket points to number of brackets used
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001204 code points to the pointer to the current code point
1205 ptrptr points to the current pattern pointer
1206 errorptr points to pointer to error message
1207
1208Returns: TRUE on success
1209 FALSE, with *errorptr set on error
1210*/
1211
1212static BOOL
Guido van Rossum50700601997-12-08 17:15:20 +00001213compile_branch(int options, int *brackets, uschar **codeptr,
Guido van Rossum58132c61997-12-17 00:24:13 +00001214 const uschar **ptrptr, const char **errorptr, PyObject *dictionary)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001215{
1216int repeat_type, op_type;
1217int repeat_min, repeat_max;
1218int bravalue, length;
Guido van Rossumdda66961998-05-07 15:32:44 +00001219int greedy_default, greedy_non_default;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001220register int c;
1221register uschar *code = *codeptr;
Guido van Rossum58132c61997-12-17 00:24:13 +00001222const uschar *ptr = *ptrptr;
1223const uschar *oldptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001224uschar *previous = NULL;
Guido van Rossum50700601997-12-08 17:15:20 +00001225uschar class[32];
1226uschar *class_flag; /* Pointer to the single-byte flag for OP_CLASS_L */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001227
Guido van Rossumdda66961998-05-07 15:32:44 +00001228/* Set up the default and non-default settings for greediness */
1229
1230greedy_default = ((options & PCRE_UNGREEDY) != 0);
1231greedy_non_default = greedy_default ^ 1;
1232
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001233/* Switch on next character until the end of the branch */
1234
1235for (;; ptr++)
1236 {
Guido van Rossum50700601997-12-08 17:15:20 +00001237 BOOL negate_class;
1238 int class_charcount;
1239 int class_lastchar;
1240
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001241 c = *ptr;
Guido van Rossum50700601997-12-08 17:15:20 +00001242 if ((options & PCRE_EXTENDED) != 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001243 {
1244 if ((pcre_ctypes[c] & ctype_space) != 0) continue;
1245 if (c == '#')
1246 {
1247 while ((c = *(++ptr)) != 0 && c != '\n');
1248 continue;
1249 }
1250 }
1251
1252 switch(c)
1253 {
1254 /* The branch terminates at end of string, |, or ). */
1255
1256 case 0:
1257 case '|':
1258 case ')':
1259 *codeptr = code;
1260 *ptrptr = ptr;
1261 return TRUE;
1262
1263 /* Handle single-character metacharacters */
1264
1265 case '^':
1266 previous = NULL;
1267 *code++ = OP_CIRC;
1268 break;
1269
1270 case '$':
1271 previous = NULL;
1272 *code++ = OP_DOLL;
1273 break;
1274
1275 case '.':
1276 previous = code;
1277 *code++ = OP_ANY;
1278 break;
1279
Guido van Rossum50700601997-12-08 17:15:20 +00001280 /* Character classes. These always build a 32-byte bitmap of the permitted
1281 characters, except in the special case where there is only one character.
1282 For negated classes, we build the map as usual, then invert it at the end.
1283 */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001284
1285 case '[':
Guido van Rossum50700601997-12-08 17:15:20 +00001286 previous = code;
1287 if (options & PCRE_LOCALE)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001288 {
Guido van Rossum50700601997-12-08 17:15:20 +00001289 *code++ = OP_CLASS_L;
1290 /* Set the flag for localized classes (like \w) to 0 */
1291 class_flag = code;
1292 *class_flag = 0;
1293 }
1294 else
1295 {
1296 *code++ = OP_CLASS;
1297 class_flag = NULL;
1298 }
1299
Guido van Rossum042ff9e1998-04-03 21:13:31 +00001300 /* If the first character is '^', set the negation flag, and use a
1301 different opcode. This only matters if caseless matching is specified at
1302 runtime. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001303
Guido van Rossum50700601997-12-08 17:15:20 +00001304 if ((c = *(++ptr)) == '^')
1305 {
1306 negate_class = TRUE;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00001307 if (*(code-1)==OP_CLASS) *(code-1) = OP_NEGCLASS;
Guido van Rossum50700601997-12-08 17:15:20 +00001308 c = *(++ptr);
1309 }
1310 else negate_class = FALSE;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001311
Guido van Rossum50700601997-12-08 17:15:20 +00001312 /* Keep a count of chars so that we can optimize the case of just a single
1313 character. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001314
Guido van Rossum50700601997-12-08 17:15:20 +00001315 class_charcount = 0;
1316 class_lastchar = -1;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001317
Guido van Rossum50700601997-12-08 17:15:20 +00001318 /* Initialize the 32-char bit map to all zeros. We have to build the
1319 map in a temporary bit of store, in case the class contains only 1
1320 character, because in that case the compiled code doesn't use the
1321 bit map. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001322
Guido van Rossum50700601997-12-08 17:15:20 +00001323 memset(class, 0, 32 * sizeof(uschar));
1324
1325 /* Process characters until ] is reached. By writing this as a "do" it
1326 means that an initial ] is taken as a data character. */
1327
1328 do
1329 {
1330 if (c == 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001331 {
Guido van Rossum50700601997-12-08 17:15:20 +00001332 *errorptr = ERR6;
1333 goto FAILED;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001334 }
1335
Guido van Rossum50700601997-12-08 17:15:20 +00001336 /* Backslash may introduce a single character, or it may introduce one
1337 of the specials, which just set a flag. Escaped items are checked for
1338 validity in the pre-compiling pass. The sequence \b is a special case.
Guido van Rossum58132c61997-12-17 00:24:13 +00001339 Inside a class (and only there) it is treated as backspace. Elsewhere
Guido van Rossum50700601997-12-08 17:15:20 +00001340 it marks a word boundary. Other escapes have preset maps ready to
1341 or into the one we are building. We assume they have more than one
1342 character in them, so set class_count bigger than one. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001343
Guido van Rossum50700601997-12-08 17:15:20 +00001344 if (c == '\\')
1345 {
1346 c = check_escape(&ptr, errorptr, *brackets, options, TRUE);
1347 if (-c == ESC_b) c = '\b';
1348 else if (c < 0)
1349 {
1350 class_charcount = 10;
1351 switch (-c)
1352 {
1353 case ESC_d:
Guido van Rossum50700601997-12-08 17:15:20 +00001354 {
1355 for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_digit];
1356 }
1357 continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001358
Guido van Rossum50700601997-12-08 17:15:20 +00001359 case ESC_D:
Guido van Rossum50700601997-12-08 17:15:20 +00001360 {
1361 for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_digit];
1362 }
1363 continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001364
Guido van Rossum50700601997-12-08 17:15:20 +00001365 case ESC_w:
1366 if (options & PCRE_LOCALE)
1367 {
1368 *class_flag |= 1;
1369 }
1370 else
1371 {
1372 for (c = 0; c < 32; c++)
1373 class[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
1374 }
1375 continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001376
Guido van Rossum50700601997-12-08 17:15:20 +00001377 case ESC_W:
1378 if (options & PCRE_LOCALE)
1379 {
1380 *class_flag |= 2;
1381 }
1382 else
1383 {
1384 for (c = 0; c < 32; c++)
1385 class[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
1386 }
1387 continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001388
Guido van Rossum50700601997-12-08 17:15:20 +00001389 case ESC_s:
Guido van Rossum50700601997-12-08 17:15:20 +00001390 {
1391 for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_space];
1392 }
1393 continue;
1394
1395 case ESC_S:
Guido van Rossum50700601997-12-08 17:15:20 +00001396 {
1397 for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_space];
1398 }
1399 continue;
1400
1401 default:
1402 *errorptr = ERR7;
1403 goto FAILED;
1404 }
1405 }
1406 /* Fall through if single character */
1407 }
1408
1409 /* A single character may be followed by '-' to form a range. However,
1410 Perl does not permit ']' to be the end of the range. A '-' character
1411 here is treated as a literal. */
1412
1413 if (ptr[1] == '-' && ptr[2] != ']')
1414 {
1415 int d;
1416 ptr += 2;
1417 d = *ptr;
1418
1419 if (d == 0)
1420 {
1421 *errorptr = ERR6;
1422 goto FAILED;
1423 }
1424
1425 /* The second part of a range can be a single-character escape, but
1426 not any of the other escapes. */
1427
1428 if (d == '\\')
1429 {
1430 d = check_escape(&ptr, errorptr, *brackets, options, TRUE);
1431 if (d < 0)
1432 {
1433 if (d == -ESC_b) d = '\b'; else
1434 {
1435 *errorptr = ERR7;
1436 goto FAILED;
1437 }
1438 }
1439 }
1440
1441 if (d < c)
1442 {
1443 *errorptr = ERR8;
1444 goto FAILED;
1445 }
1446
1447 for (; c <= d; c++)
1448 {
1449 class[c/8] |= (1 << (c&7));
1450 if ((options & PCRE_CASELESS) != 0)
1451 {
1452 int uc = pcre_fcc[c]; /* flip case */
1453 class[uc/8] |= (1 << (uc&7));
1454 }
1455 class_charcount++; /* in case a one-char range */
1456 class_lastchar = c;
1457 }
1458 continue; /* Go get the next char in the class */
1459 }
1460
1461 /* Handle a lone single character - we can get here for a normal
1462 non-escape char, or after \ that introduces a single character. */
1463
1464 class [c/8] |= (1 << (c&7));
1465 if ((options & PCRE_CASELESS) != 0)
1466 {
1467 c = pcre_fcc[c]; /* flip case */
1468 class[c/8] |= (1 << (c&7));
1469 }
1470 class_charcount++;
1471 class_lastchar = c;
1472 }
1473
1474 /* Loop until ']' reached; the check for end of string happens inside the
1475 loop. This "while" is the end of the "do" above. */
1476
1477 while ((c = *(++ptr)) != ']');
1478
1479 /* If class_charcount is 1 and class_lastchar is not negative, we saw
1480 precisely one character. This doesn't need the whole 32-byte bit map.
1481 We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
1482 it's negative. */
1483
1484 if (class_charcount == 1 && class_lastchar >= 0)
1485 {
1486 if (negate_class)
1487 {
1488 code[-1] = OP_NOT;
1489 }
1490 else
1491 {
1492 code[-1] = OP_CHARS;
1493 *code++ = 1;
1494 }
1495 *code++ = class_lastchar;
1496 }
1497
1498 /* Otherwise, negate the 32-byte map if necessary, and copy it into
1499 the code vector. */
1500
1501 else
1502 {
1503 /* If this is a localized opcode, bump the code pointer up */
1504 if (class_flag) code++;
1505 if (negate_class)
1506 {
1507 if (class_flag) *class_flag = (*class_flag) ^ 63;
1508 for (c = 0; c < 32; c++) code[c] = ~class[c];
1509 }
1510 else
1511 memcpy(code, class, 32);
1512 code += 32;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001513 }
1514 break;
1515
1516 /* Various kinds of repeat */
1517
1518 case '{':
Guido van Rossum50700601997-12-08 17:15:20 +00001519 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001520 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
1521 if (*errorptr != NULL) goto FAILED;
1522 goto REPEAT;
1523
1524 case '*':
1525 repeat_min = 0;
1526 repeat_max = -1;
1527 goto REPEAT;
1528
1529 case '+':
1530 repeat_min = 1;
1531 repeat_max = -1;
1532 goto REPEAT;
1533
1534 case '?':
1535 repeat_min = 0;
1536 repeat_max = 1;
1537
1538 REPEAT:
1539 if (previous == NULL)
1540 {
Guido van Rossum50700601997-12-08 17:15:20 +00001541 *errorptr = ERR9;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001542 goto FAILED;
1543 }
1544
Guido van Rossumdda66961998-05-07 15:32:44 +00001545 /* If the next character is '?' this is a minimizing repeat, by default,
1546 but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001547 next character. */
1548
Guido van Rossumdda66961998-05-07 15:32:44 +00001549 if (ptr[1] == '?')
1550 { repeat_type = greedy_non_default; ptr++; }
1551 else repeat_type = greedy_default;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001552
Guido van Rossum50700601997-12-08 17:15:20 +00001553 /* If the maximum is zero then the minimum must also be zero; Perl allows
1554 this case, so we do too - by simply omitting the item altogether. */
1555
1556 if (repeat_max == 0) code = previous;
1557
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001558 /* If previous was a string of characters, chop off the last one and use it
1559 as the subject of the repeat. If there was only one character, we can
1560 abolish the previous item altogether. */
1561
Guido van Rossum50700601997-12-08 17:15:20 +00001562 else if (*previous == OP_CHARS)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001563 {
1564 int len = previous[1];
1565 if (len == 1)
1566 {
1567 c = previous[2];
1568 code = previous;
1569 }
1570 else
1571 {
1572 c = previous[len+1];
1573 previous[1]--;
1574 code--;
1575 }
1576 op_type = 0; /* Use single-char op codes */
1577 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
1578 }
1579
Guido van Rossum50700601997-12-08 17:15:20 +00001580 /* If previous was a single negated character ([^a] or similar), we use
1581 one of the special opcodes, replacing it. The code is shared with single-
1582 character repeats by adding a suitable offset into repeat_type. */
1583
1584 else if ((int)*previous == OP_NOT)
1585 {
1586 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
1587 c = previous[1];
1588 code = previous;
1589 goto OUTPUT_SINGLE_REPEAT;
1590 }
1591
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001592 /* If previous was a character type match (\d or similar), abolish it and
1593 create a suitable repeat item. The code is shared with single-character
1594 repeats by adding a suitable offset into repeat_type. */
1595
Guido van Rossum50700601997-12-08 17:15:20 +00001596 else if ((int)*previous < OP_CIRC || *previous == OP_ANY)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001597 {
1598 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
1599 c = *previous;
1600 code = previous;
1601
1602 OUTPUT_SINGLE_REPEAT:
1603 repeat_type += op_type; /* Combine both values for many cases */
1604
1605 /* A minimum of zero is handled either as the special case * or ?, or as
1606 an UPTO, with the maximum given. */
1607
1608 if (repeat_min == 0)
1609 {
1610 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
1611 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
1612 else
1613 {
1614 *code++ = OP_UPTO + repeat_type;
1615 *code++ = repeat_max >> 8;
1616 *code++ = (repeat_max & 255);
1617 }
1618 }
1619
1620 /* The case {1,} is handled as the special case + */
1621
1622 else if (repeat_min == 1 && repeat_max == -1)
1623 *code++ = OP_PLUS + repeat_type;
1624
1625 /* The case {n,n} is just an EXACT, while the general case {n,m} is
1626 handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
1627
1628 else
1629 {
1630 if (repeat_min != 1)
1631 {
1632 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
1633 *code++ = repeat_min >> 8;
1634 *code++ = (repeat_min & 255);
1635 }
1636
1637 /* If the mininum is 1 and the previous item was a character string,
1638 we either have to put back the item that got cancelled if the string
1639 length was 1, or add the character back onto the end of a longer
Guido van Rossumdda66961998-05-07 15:32:44 +00001640 string. For a character type nothing need be done; it will just get
1641 put back naturally. Note that the final character is always going to
1642 get added below. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001643
1644 else if (*previous == OP_CHARS)
1645 {
1646 if (code == previous) code += 2; else previous[1]++;
1647 }
1648
Guido van Rossumdda66961998-05-07 15:32:44 +00001649 /* For a single negated character we also have to put back the
1650 item that got cancelled. */
1651
1652 else if (*previous == OP_NOT) code++;
1653
Guido van Rossum557dea11997-12-22 22:46:52 +00001654 /* If the maximum is unlimited, insert an OP_STAR. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001655
Guido van Rossum557dea11997-12-22 22:46:52 +00001656 if (repeat_max < 0)
1657 {
1658 *code++ = c;
1659 *code++ = OP_STAR + repeat_type;
1660 }
1661
1662 /* Else insert an UPTO if the max is greater than the min. */
1663
1664 else if (repeat_max != repeat_min)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001665 {
1666 *code++ = c;
1667 repeat_max -= repeat_min;
1668 *code++ = OP_UPTO + repeat_type;
1669 *code++ = repeat_max >> 8;
1670 *code++ = (repeat_max & 255);
1671 }
1672 }
1673
1674 /* The character or character type itself comes last in all cases. */
1675
1676 *code++ = c;
1677 }
1678
1679 /* If previous was a character class or a back reference, we put the repeat
1680 stuff after it. */
1681
Guido van Rossum042ff9e1998-04-03 21:13:31 +00001682 else if (*previous == OP_CLASS || *previous == OP_NEGCLASS ||
1683 *previous==OP_CLASS_L || *previous == OP_REF)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001684 {
1685 if (repeat_min == 0 && repeat_max == -1)
1686 *code++ = OP_CRSTAR + repeat_type;
1687 else if (repeat_min == 1 && repeat_max == -1)
1688 *code++ = OP_CRPLUS + repeat_type;
1689 else if (repeat_min == 0 && repeat_max == 1)
1690 *code++ = OP_CRQUERY + repeat_type;
1691 else
1692 {
1693 *code++ = OP_CRRANGE + repeat_type;
1694 *code++ = repeat_min >> 8;
1695 *code++ = repeat_min & 255;
1696 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
1697 *code++ = repeat_max >> 8;
1698 *code++ = repeat_max & 255;
1699 }
1700 }
1701
1702 /* If previous was a bracket group, we may have to replicate it in certain
1703 cases. If the maximum repeat count is unlimited, check that the bracket
1704 group cannot match the empty string, and diagnose an error if it can. */
1705
1706 else if ((int)*previous >= OP_BRA)
1707 {
1708 int i;
Guido van Rossum557dea11997-12-22 22:46:52 +00001709 int len = code - previous;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001710
1711 if (repeat_max == -1 && could_be_empty(previous))
1712 {
Guido van Rossum50700601997-12-08 17:15:20 +00001713 *errorptr = ERR10;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001714 goto FAILED;
1715 }
1716
1717 /* If the minimum is greater than zero, and the maximum is unlimited or
1718 equal to the minimum, the first copy remains where it is, and is
1719 replicated up to the minimum number of times. This case includes the +
1720 repeat, but of course no replication is needed in that case. */
1721
1722 if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))
1723 {
1724 for (i = 1; i < repeat_min; i++)
1725 {
Guido van Rossum557dea11997-12-22 22:46:52 +00001726 memcpy(code, previous, len);
1727 code += len;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001728 }
1729 }
1730
1731 /* If the minimum is zero, stick BRAZERO in front of the first copy.
1732 Then, if there is a fixed upper limit, replicated up to that many times,
1733 sticking BRAZERO in front of all the optional ones. */
1734
1735 else
1736 {
1737 if (repeat_min == 0)
1738 {
Guido van Rossum557dea11997-12-22 22:46:52 +00001739 memmove(previous+1, previous, len);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001740 code++;
1741 *previous++ = OP_BRAZERO + repeat_type;
1742 }
1743
1744 for (i = 1; i < repeat_min; i++)
1745 {
Guido van Rossum557dea11997-12-22 22:46:52 +00001746 memcpy(code, previous, len);
1747 code += len;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001748 }
1749
1750 for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)
1751 {
1752 *code++ = OP_BRAZERO + repeat_type;
Guido van Rossum557dea11997-12-22 22:46:52 +00001753 memcpy(code, previous, len);
1754 code += len;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001755 }
1756 }
1757
1758 /* If the maximum is unlimited, set a repeater in the final copy. */
1759
1760 if (repeat_max == -1) code[-3] = OP_KETRMAX + repeat_type;
1761 }
1762
1763 /* Else there's some kind of shambles */
1764
1765 else
1766 {
Guido van Rossum50700601997-12-08 17:15:20 +00001767 *errorptr = ERR11;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001768 goto FAILED;
1769 }
1770
1771 /* In all case we no longer have a previous item. */
1772
1773 previous = NULL;
1774 break;
1775
1776
1777 /* Start of nested bracket sub-expression, or comment or lookahead.
1778 First deal with special things that can come after a bracket; all are
1779 introduced by ?, and the appearance of any of them means that this is not a
1780 referencing group. They were checked for validity in the first pass over
1781 the string, so we don't have to check for syntax errors here. */
1782
1783 case '(':
1784 previous = code; /* Only real brackets can be repeated */
1785 if (*(++ptr) == '?')
1786 {
1787 bravalue = OP_BRA;
1788
1789 switch (*(++ptr))
1790 {
1791 case '#':
1792 case 'i':
Guido van Rossumbd49ac41997-12-10 23:05:53 +00001793 case 'L':
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001794 case 'm':
1795 case 's':
1796 case 'x':
1797 ptr++;
1798 while (*ptr != ')') ptr++;
1799 previous = NULL;
1800 continue;
1801
1802 case ':': /* Non-extracting bracket */
1803 ptr++;
1804 break;
1805
1806 case '=': /* Assertions can't be repeated */
1807 bravalue = OP_ASSERT;
1808 ptr++;
1809 previous = NULL;
1810 break;
1811
1812 case '!':
1813 bravalue = OP_ASSERT_NOT;
1814 ptr++;
1815 previous = NULL;
1816 break;
1817
1818 case ('P'):
1819 ptr++;
1820 if (*ptr=='<')
1821 {
1822 /* (?P<groupname>...) */
1823 int idlen;
1824 PyObject *string, *intobj;
1825
1826 ptr++;
1827 idlen = get_group_id(ptr, '>', errorptr);
1828 if (*errorptr) {
1829 goto FAILED;
1830 }
Guido van Rossum57ba4f31997-12-02 20:40:28 +00001831 string = PyString_FromStringAndSize((char*)ptr, idlen);
Guido van Rossum50700601997-12-08 17:15:20 +00001832 intobj = PyInt_FromLong( brackets[0] + 1 );
Guido van Rossum58132c61997-12-17 00:24:13 +00001833 if (intobj == NULL || string == NULL)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001834 {
1835 Py_XDECREF(string);
1836 Py_XDECREF(intobj);
1837 *errorptr = "exception raised";
1838 goto FAILED;
1839 }
1840 PyDict_SetItem(dictionary, string, intobj);
Guido van Rossum58132c61997-12-17 00:24:13 +00001841 Py_DECREF(string); Py_DECREF(intobj); /* XXX DECREF commented out! */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001842 ptr += idlen+1; /* Point to rest of expression */
1843 goto do_grouping_bracket;
1844 }
1845 if (*ptr=='=')
1846 {
1847 /* (?P=groupname) */
1848 int idlen, refnum;
1849 PyObject *string, *intobj;
1850
1851 ptr++;
1852 idlen = get_group_id(ptr, ')', errorptr);
1853 if (*errorptr) {
1854 goto FAILED;
1855 }
Guido van Rossum50700601997-12-08 17:15:20 +00001856 string = PyString_FromStringAndSize((char *)ptr, idlen);
Guido van Rossumc3861071997-10-08 02:07:40 +00001857 if (string==NULL) {
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001858 *errorptr = "exception raised";
1859 goto FAILED;
1860 }
1861 intobj = PyDict_GetItem(dictionary, string);
1862 if (intobj==NULL) {
Guido van Rossumc3861071997-10-08 02:07:40 +00001863 Py_DECREF(string);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001864 *errorptr = "?P= group identifier isn't defined";
1865 goto FAILED;
1866 }
1867
1868 refnum = PyInt_AsLong(intobj);
Guido van Rossum1eadb411997-12-15 17:33:24 +00001869 Py_DECREF(string);
Guido van Rossum58132c61997-12-17 00:24:13 +00001870 /* The caller doesn't own the reference to the value
1871 returned from PyDict_GetItem, so intobj is not
1872 DECREF'ed. */
1873
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001874 *code++ = OP_REF;
1875 *code++ = refnum;
1876 /* The continue will cause the top-level for() loop to
1877 be resumed, so ptr will be immediately incremented.
1878 Therefore, the following line adds just idlen, not
1879 idlen+1 */
1880 ptr += idlen;
1881 continue;
1882 }
1883 /* The character after ?P is neither < nor =, so
1884 report an error. Add more Python-extensions here. */
1885 *errorptr="unknown after (?P";
1886 goto FAILED;
1887 break;
Guido van Rossum50700601997-12-08 17:15:20 +00001888
1889 case '>': /* "Match once" brackets */
1890 if ((options & PCRE_EXTRA) != 0) /* Not yet standard */
1891 {
1892 bravalue = OP_ONCE;
1893 ptr++;
1894 previous = NULL;
1895 break;
1896 }
1897 /* Else fall through */
1898
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001899 default:
Guido van Rossum50700601997-12-08 17:15:20 +00001900 *errorptr = ERR12;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001901 goto FAILED;
1902 }
1903 }
1904
1905 /* Else we have a referencing group */
1906
1907 else
1908 {
1909 do_grouping_bracket:
Guido van Rossum50700601997-12-08 17:15:20 +00001910 if (++(*brackets) > EXTRACT_MAX)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001911 {
Guido van Rossum50700601997-12-08 17:15:20 +00001912 *errorptr = ERR13;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001913 goto FAILED;
1914 }
Guido van Rossum50700601997-12-08 17:15:20 +00001915 bravalue = OP_BRA + *brackets;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001916 }
1917
1918 /* Process nested bracketed re; at end pointer is on the bracket. We copy
1919 code into a non-register variable in order to be able to pass its address
1920 because some compilers complain otherwise. */
1921
1922 *code = bravalue;
1923 {
1924 uschar *mcode = code;
Guido van Rossum50700601997-12-08 17:15:20 +00001925 if (!compile_regex(options, brackets, &mcode, &ptr, errorptr, dictionary))
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001926 goto FAILED;
1927 code = mcode;
1928 }
1929
1930 if (*ptr != ')')
1931 {
Guido van Rossum50700601997-12-08 17:15:20 +00001932 *errorptr = ERR14;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001933 goto FAILED;
1934 }
1935 break;
1936
1937 /* Check \ for being a real metacharacter; if not, fall through and handle
1938 it as a data character at the start of a string. Escape items are checked
1939 for validity in the pre-compiling pass. */
1940
1941 case '\\':
1942 oldptr = ptr;
Guido van Rossum50700601997-12-08 17:15:20 +00001943 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001944
1945 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1946 are arranged to be the negation of the corresponding OP_values. For the
1947 back references, the values are ESC_REF plus the reference number. Only
1948 back references and those types that consume a character may be repeated.
1949 We can test for values between ESC_b and ESC_Z for the latter; this may
1950 have to change if any new ones are ever created. */
1951
1952 if (c < 0)
1953 {
1954 if (-c >= ESC_REF)
1955 {
Guido van Rossum50700601997-12-08 17:15:20 +00001956 int refnum = -c - ESC_REF;
1957 if (*brackets < refnum)
1958 {
1959 *errorptr = ERR15;
1960 goto FAILED;
1961 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001962 previous = code;
1963 *code++ = OP_REF;
1964 *code++ = refnum;
1965 }
1966 else
1967 {
Guido van Rossum50700601997-12-08 17:15:20 +00001968 previous = (-c > ESC_b && -c < ESC_X)? code : NULL;
1969 if ( (options & PCRE_LOCALE) != 0)
1970 {
1971 switch (c)
1972 {
1973 case (-ESC_b): c = -OP_WORD_BOUNDARY_L; break;
1974 case (-ESC_B): c = -OP_NOT_WORD_BOUNDARY_L; break;
1975 case (-ESC_w): c = -OP_WORDCHAR_L; break;
1976 case (-ESC_W): c = -OP_NOT_WORDCHAR_L; break;
1977 }
1978 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001979 *code++ = -c;
1980 }
1981 continue;
1982 }
1983
Guido van Rossum58132c61997-12-17 00:24:13 +00001984 /* Data character: Reset and fall through */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001985
1986 ptr = oldptr;
1987 c = '\\';
1988
1989 /* Handle a run of data characters until a metacharacter is encountered.
1990 The first character is guaranteed not to be whitespace or # when the
1991 extended flag is set. */
1992
Guido van Rossum50700601997-12-08 17:15:20 +00001993 NORMAL_CHAR:
Guido van Rossum51b3aa31997-10-06 14:43:11 +00001994 default:
1995 previous = code;
1996 *code = OP_CHARS;
1997 code += 2;
1998 length = 0;
1999
2000 do
2001 {
Guido van Rossum50700601997-12-08 17:15:20 +00002002 if ((options & PCRE_EXTENDED) != 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002003 {
2004 if ((pcre_ctypes[c] & ctype_space) != 0) continue;
2005 if (c == '#')
2006 {
2007 while ((c = *(++ptr)) != 0 && c != '\n');
2008 if (c == 0) break;
2009 continue;
2010 }
2011 }
2012
2013 /* Backslash may introduce a data char or a metacharacter. Escaped items
2014 are checked for validity in the pre-compiling pass. Stop the string
2015 before a metaitem. */
2016
2017 if (c == '\\')
2018 {
2019 oldptr = ptr;
Guido van Rossum50700601997-12-08 17:15:20 +00002020 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002021 if (c < 0) { ptr = oldptr; break; }
2022 }
2023
2024 /* Ordinary character or single-char escape */
2025
2026 *code++ = c;
2027 length++;
2028 }
2029
2030 /* This "while" is the end of the "do" above. */
2031
2032 while (length < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
2033
2034 /* Compute the length and set it in the data vector, and advance to
2035 the next state. */
2036
2037 previous[1] = length;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00002038 if (length < 255) ptr--;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002039 break;
2040 }
2041 } /* end of big loop */
2042
2043/* Control never reaches here by falling through, only by a goto for all the
2044error states. Pass back the position in the pattern so that it can be displayed
2045to the user for diagnosing the error. */
2046
2047FAILED:
2048*ptrptr = ptr;
2049return FALSE;
2050}
2051
2052
2053
2054
2055/*************************************************
2056* Compile sequence of alternatives *
2057*************************************************/
2058
2059/* On entry, ptr is pointing past the bracket character, but on return
2060it points to the closing bracket, or vertical bar, or end of string.
2061The code variable is pointing at the byte into which the BRA operator has been
2062stored.
2063
2064Argument:
Guido van Rossum50700601997-12-08 17:15:20 +00002065 options the option bits
2066 brackets -> int containing the number of extracting brackets used
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002067 codeptr -> the address of the current code pointer
2068 ptrptr -> the address of the current pattern pointer
2069 errorptr -> pointer to error message
2070
2071Returns: TRUE on success
2072*/
2073
2074static BOOL
Guido van Rossum50700601997-12-08 17:15:20 +00002075compile_regex(int options, int *brackets, uschar **codeptr,
Guido van Rossum58132c61997-12-17 00:24:13 +00002076 const uschar **ptrptr, const char **errorptr, PyObject *dictionary)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002077{
Guido van Rossum58132c61997-12-17 00:24:13 +00002078const uschar *ptr = *ptrptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002079uschar *code = *codeptr;
2080uschar *start_bracket = code;
2081
2082for (;;)
2083 {
2084 int length;
2085 uschar *last_branch = code;
2086
2087 code += 3;
Guido van Rossum50700601997-12-08 17:15:20 +00002088 if (!compile_branch(options, brackets, &code, &ptr, errorptr, dictionary))
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002089 {
2090 *ptrptr = ptr;
2091 return FALSE;
2092 }
2093
2094 /* Fill in the length of the last branch */
2095
2096 length = code - last_branch;
2097 last_branch[1] = length >> 8;
2098 last_branch[2] = length & 255;
2099
2100 /* Reached end of expression, either ')' or end of pattern. Insert a
2101 terminating ket and the length of the whole bracketed item, and return,
2102 leaving the pointer at the terminating char. */
2103
2104 if (*ptr != '|')
2105 {
2106 length = code - start_bracket;
2107 *code++ = OP_KET;
2108 *code++ = length >> 8;
2109 *code++ = length & 255;
2110 *codeptr = code;
2111 *ptrptr = ptr;
2112 return TRUE;
2113 }
2114
2115 /* Another branch follows; insert an "or" node and advance the pointer. */
2116
2117 *code = OP_ALT;
2118 ptr++;
2119 }
2120/* Control never reaches here */
2121}
2122
2123
2124
2125/*************************************************
2126* Check for anchored expression *
2127*************************************************/
2128
2129/* Try to find out if this is an anchored regular expression. Consider each
2130alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
2131all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
2132it's anchored. However, if this is a multiline pattern, then only OP_SOD
2133counts, since OP_CIRC can match in the middle.
2134
2135A branch is also implicitly anchored if it starts with .* because that will try
2136the rest of the pattern at all possible matching points, so there is no point
2137trying them again.
2138
2139Argument: points to start of expression (the bracket)
2140Returns: TRUE or FALSE
2141*/
2142
2143static BOOL
Guido van Rossum58132c61997-12-17 00:24:13 +00002144is_anchored(register const uschar *code, BOOL multiline)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002145{
2146do {
2147 int op = (int)code[3];
Guido van Rossum50700601997-12-08 17:15:20 +00002148 if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002149 { if (!is_anchored(code+3, multiline)) return FALSE; }
2150 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
2151 { if (code[4] != OP_ANY) return FALSE; }
2152 else if (op != OP_SOD && (multiline || op != OP_CIRC)) return FALSE;
2153 code += (code[1] << 8) + code[2];
2154 }
2155while (*code == OP_ALT);
2156return TRUE;
2157}
2158
2159
2160
2161/*************************************************
2162* Check for start with \n line expression *
2163*************************************************/
2164
2165/* This is called for multiline expressions to try to find out if every branch
2166starts with ^ so that "first char" processing can be done to speed things up.
2167
2168Argument: points to start of expression (the bracket)
2169Returns: TRUE or FALSE
2170*/
2171
2172static BOOL
Guido van Rossum58132c61997-12-17 00:24:13 +00002173is_startline(const uschar *code)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002174{
2175do {
2176 if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
2177 { if (!is_startline(code+3)) return FALSE; }
2178 else if (code[3] != OP_CIRC) return FALSE;
2179 code += (code[1] << 8) + code[2];
2180 }
2181while (*code == OP_ALT);
2182return TRUE;
2183}
2184
2185
2186
2187/*************************************************
2188* Check for fixed first char *
2189*************************************************/
2190
2191/* Try to find out if there is a fixed first character. This is called for
2192unanchored expressions, as it speeds up their processing quite considerably.
2193Consider each alternative branch. If they all start with the same char, or with
2194a bracket all of whose alternatives start with the same char (recurse ad lib),
2195then we return that char, otherwise -1.
2196
2197Argument: points to start of expression (the bracket)
2198Returns: -1 or the fixed first char
2199*/
2200
2201static int
2202find_firstchar(uschar *code)
2203{
2204register int c = -1;
2205do
2206 {
2207 register int charoffset = 4;
2208
2209 if ((int)code[3] >= OP_BRA || code[3] == OP_ASSERT)
2210 {
2211 register int d;
2212 if ((d = find_firstchar(code+3)) < 0) return -1;
2213 if (c < 0) c = d; else if (c != d) return -1;
2214 }
2215
2216 else switch(code[3])
2217 {
2218 default:
2219 return -1;
2220
2221 case OP_EXACT: /* Fall through */
2222 charoffset++;
2223
2224 case OP_CHARS: /* Fall through */
2225 charoffset++;
2226
2227 case OP_PLUS:
2228 case OP_MINPLUS:
2229 if (c < 0) c = code[charoffset]; else if (c != code[charoffset]) return -1;
2230 break;
2231 }
2232 code += (code[1] << 8) + code[2];
2233 }
2234while (*code == OP_ALT);
2235return c;
2236}
2237
2238
2239
2240/*************************************************
2241* Compile a Regular Expression *
2242*************************************************/
2243
2244/* This function takes a string and returns a pointer to a block of store
2245holding a compiled version of the expression.
2246
2247Arguments:
2248 pattern the regular expression
2249 options various option bits
2250 errorptr pointer to pointer to error text
2251 erroroffset ptr offset in pattern where error was detected
2252
2253Returns: pointer to compiled data block, or NULL on error,
2254 with errorptr and erroroffset set
2255*/
2256
2257pcre *
Guido van Rossum58132c61997-12-17 00:24:13 +00002258pcre_compile(const char *pattern, int options, const char **errorptr,
Guido van Rossum50700601997-12-08 17:15:20 +00002259 int *erroroffset, PyObject *dictionary)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002260{
2261real_pcre *re;
2262int spaces = 0;
2263int length = 3; /* For initial BRA plus length */
2264int runlength;
2265int c, size;
Guido van Rossum50700601997-12-08 17:15:20 +00002266int bracount = 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002267int brastack[200];
Guido van Rossum50700601997-12-08 17:15:20 +00002268int top_backref = 0;
Guido van Rossum58132c61997-12-17 00:24:13 +00002269unsigned int brastackptr = 0;
2270uschar *code;
2271const uschar *ptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002272
2273#ifdef DEBUG
2274uschar *code_base, *code_end;
2275#endif
2276
Guido van Rossum50700601997-12-08 17:15:20 +00002277/* We can't pass back an error message if errorptr is NULL; I guess the best we
2278can do is just return NULL. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002279
Guido van Rossum50700601997-12-08 17:15:20 +00002280if (errorptr == NULL) return NULL;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002281*errorptr = NULL;
Guido van Rossum50700601997-12-08 17:15:20 +00002282
2283/* However, we can give a message for this error */
2284
2285if (erroroffset == NULL)
2286 {
2287 *errorptr = ERR16;
2288 return NULL;
2289 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002290*erroroffset = 0;
2291
2292if ((options & ~PUBLIC_OPTIONS) != 0)
2293 {
Guido van Rossum50700601997-12-08 17:15:20 +00002294 *errorptr = ERR17;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002295 return NULL;
2296 }
2297
Guido van Rossum557dea11997-12-22 22:46:52 +00002298DPRINTF(("------------------------------------------------------------------\n"));
2299DPRINTF(("%s\n", pattern));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002300
2301/* The first thing to do is to make a pass over the pattern to compute the
2302amount of store required to hold the compiled code. This does not have to be
2303perfect as long as errors are overestimates. At the same time we can detect any
2304internal flag settings. Make an attempt to correct for any counted white space
2305if an "extended" flag setting appears late in the pattern. We can't be so
2306clever for #-comments. */
2307
Guido van Rossum58132c61997-12-17 00:24:13 +00002308ptr = (const uschar *)(pattern - 1);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002309while ((c = *(++ptr)) != 0)
2310 {
Guido van Rossum50700601997-12-08 17:15:20 +00002311 int min, max;
2312 int class_charcount;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002313
2314 if ((pcre_ctypes[c] & ctype_space) != 0)
2315 {
Guido van Rossum50700601997-12-08 17:15:20 +00002316 if ((options & PCRE_EXTENDED) != 0) continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002317 spaces++;
2318 }
2319
Guido van Rossum50700601997-12-08 17:15:20 +00002320 if (c == '#' && (options & PCRE_EXTENDED) != 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002321 {
2322 while ((c = *(++ptr)) != 0 && c != '\n');
2323 continue;
2324 }
2325
2326 switch(c)
2327 {
2328 /* A backslashed item may be an escaped "normal" character or a
2329 character type. For a "normal" character, put the pointers and
2330 character back so that tests for whitespace etc. in the input
2331 are done correctly. */
2332
2333 case '\\':
2334 {
Guido van Rossum58132c61997-12-17 00:24:13 +00002335 const uschar *save_ptr = ptr;
Guido van Rossum50700601997-12-08 17:15:20 +00002336 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002337 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2338 if (c >= 0)
2339 {
2340 ptr = save_ptr;
2341 c = '\\';
2342 goto NORMAL_CHAR;
2343 }
2344 }
2345 length++;
2346
2347 /* A back reference needs an additional char, plus either one or 5
Guido van Rossum50700601997-12-08 17:15:20 +00002348 bytes for a repeat. We also need to keep the value of the highest
2349 back reference. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002350
2351 if (c <= -ESC_REF)
2352 {
Guido van Rossum50700601997-12-08 17:15:20 +00002353 int refnum = -c - ESC_REF;
2354 if (refnum > top_backref) top_backref = refnum;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002355 length++; /* For single back reference */
Guido van Rossum50700601997-12-08 17:15:20 +00002356 if (ptr[1] == '{' && is_counted_repeat(ptr+2))
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002357 {
2358 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
2359 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2360 if ((min == 0 && (max == 1 || max == -1)) ||
2361 (min == 1 && max == -1))
2362 length++;
2363 else length += 5;
2364 if (ptr[1] == '?') ptr++;
2365 }
2366 }
2367 continue;
2368
2369 case '^':
2370 case '.':
2371 case '$':
2372 case '*': /* These repeats won't be after brackets; */
2373 case '+': /* those are handled separately */
2374 case '?':
2375 length++;
2376 continue;
2377
2378 /* This covers the cases of repeats after a single char, metachar, class,
2379 or back reference. */
2380
2381 case '{':
Guido van Rossum50700601997-12-08 17:15:20 +00002382 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002383 ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);
2384 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2385 if ((min == 0 && (max == 1 || max == -1)) ||
2386 (min == 1 && max == -1))
2387 length++;
2388 else
2389 {
2390 length--; /* Uncount the original char or metachar */
2391 if (min == 1) length++; else if (min > 0) length += 4;
2392 if (max > 0) length += 4; else length += 2;
2393 }
2394 if (ptr[1] == '?') ptr++;
2395 continue;
2396
2397 /* An alternation contains an offset to the next branch or ket. */
2398 case '|':
2399 length += 3;
2400 continue;
2401
Guido van Rossum50700601997-12-08 17:15:20 +00002402 /* A character class uses 33 characters. Don't worry about character types
2403 that aren't allowed in classes - they'll get picked up during the compile.
2404 A character class that contains only one character uses 2 or 3 bytes,
2405 depending on whether it is negated or not. Notice this where we can. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002406
2407 case '[':
Guido van Rossum50700601997-12-08 17:15:20 +00002408 class_charcount = 0;
2409 if (*(++ptr) == '^') ptr++;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002410 do
2411 {
Guido van Rossum50700601997-12-08 17:15:20 +00002412 if (*ptr == '\\')
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002413 {
Guido van Rossum557dea11997-12-22 22:46:52 +00002414 int ch = check_escape(&ptr, errorptr, bracount, options, TRUE);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002415 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
Guido van Rossum557dea11997-12-22 22:46:52 +00002416 if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002417 }
Guido van Rossum50700601997-12-08 17:15:20 +00002418 else class_charcount++;
2419 ptr++;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002420 }
2421 while (*ptr != 0 && *ptr != ']');
2422
Guido van Rossum50700601997-12-08 17:15:20 +00002423 /* Repeats for negated single chars are handled by the general code */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002424
Guido van Rossum50700601997-12-08 17:15:20 +00002425 if (class_charcount == 1) length += 3; else
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002426 {
Guido van Rossum50700601997-12-08 17:15:20 +00002427 length += 33;
2428 if (options & PCRE_LOCALE) length++; /* Add a byte for the localization flag */
2429
2430 /* A repeat needs either 1 or 5 bytes. */
2431
Guido van Rossum557dea11997-12-22 22:46:52 +00002432 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))
Guido van Rossum50700601997-12-08 17:15:20 +00002433 {
2434 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
2435 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2436 if ((min == 0 && (max == 1 || max == -1)) ||
2437 (min == 1 && max == -1))
2438 length++;
2439 else length += 5;
2440 if (ptr[1] == '?') ptr++;
2441 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002442 }
2443 continue;
2444
2445 /* Brackets may be genuine groups or special things */
2446
2447 case '(':
2448
2449 /* Handle special forms of bracket, which all start (? */
2450
2451 if (ptr[1] == '?') switch (c = ptr[2])
2452 {
2453 /* Skip over comments entirely */
2454 case '#':
2455 ptr += 3;
2456 while (*ptr != 0 && *ptr != ')') ptr++;
2457 if (*ptr == 0)
2458 {
Guido van Rossum50700601997-12-08 17:15:20 +00002459 *errorptr = ERR18;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002460 goto PCRE_ERROR_RETURN;
2461 }
2462 continue;
2463
2464 /* Non-referencing groups and lookaheads just move the pointer on, and
Guido van Rossum50700601997-12-08 17:15:20 +00002465 then behave like a non-special bracket, except that they don't increment
2466 the count of extracting brackets. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002467
2468 case ':':
2469 case '=':
2470 case '!':
2471 ptr += 2;
2472 break;
2473
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002474 case ('P'):
2475 {
2476 int idlen;
2477 switch (*ptr++) {
2478 case ('<'):
2479 idlen = get_group_id(ptr++, '>', errorptr);
2480 if (*errorptr) goto PCRE_ERROR_RETURN;
2481 ptr += idlen+1;
2482 break;
2483 case ('='):
2484 idlen = get_group_id(ptr++, ')', errorptr);
2485 if (*errorptr) goto PCRE_ERROR_RETURN;
2486 ptr += idlen+1;
2487 length++;
2488 break;
2489 }
2490 }
2491 break;
2492
Guido van Rossum50700601997-12-08 17:15:20 +00002493 /* Ditto for the "once only" bracket, allowed only if the extra bit
2494 is set. */
2495
2496 case '>':
2497 if ((options & PCRE_EXTRA) != 0)
2498 {
2499 ptr += 2;
2500 break;
2501 }
Guido van Rossumdda66961998-05-07 15:32:44 +00002502 /* Else fall through */
Guido van Rossum50700601997-12-08 17:15:20 +00002503
2504 /* Else loop setting valid options until ) is met. Anything else is an
2505 error. */
2506
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002507 default:
2508 ptr += 2;
2509 for (;; ptr++)
2510 {
2511 if ((c = *ptr) == 'i')
2512 {
2513 options |= PCRE_CASELESS;
2514 continue;
2515 }
Guido van Rossumbd49ac41997-12-10 23:05:53 +00002516 else if ((c = *ptr) == 'L')
Guido van Rossum50700601997-12-08 17:15:20 +00002517 {
2518 options |= PCRE_LOCALE;
2519 continue;
2520 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002521 else if ((c = *ptr) == 'm')
2522 {
2523 options |= PCRE_MULTILINE;
2524 continue;
2525 }
Guido van Rossum50700601997-12-08 17:15:20 +00002526 else if (c == 's')
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002527 {
2528 options |= PCRE_DOTALL;
2529 continue;
2530 }
2531 else if (c == 'x')
2532 {
2533 options |= PCRE_EXTENDED;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002534 length -= spaces; /* Already counted spaces */
2535 continue;
2536 }
2537 else if (c == ')') break;
2538
Guido van Rossum50700601997-12-08 17:15:20 +00002539 *errorptr = ERR12;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002540 goto PCRE_ERROR_RETURN;
2541 }
2542 continue; /* End of this bracket handling */
2543 }
2544
Guido van Rossum50700601997-12-08 17:15:20 +00002545 /* Extracting brackets must be counted so we can process escapes in a
2546 Perlish way. */
2547
2548 else bracount++;
2549
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002550 /* Non-special forms of bracket. Save length for computing whole length
2551 at end if there's a repeat that requires duplication of the group. */
2552
2553 if (brastackptr >= sizeof(brastack)/sizeof(int))
2554 {
Guido van Rossum50700601997-12-08 17:15:20 +00002555 *errorptr = ERR19;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002556 goto PCRE_ERROR_RETURN;
2557 }
2558
2559 brastack[brastackptr++] = length;
2560 length += 3;
2561 continue;
2562
2563 /* Handle ket. Look for subsequent max/min; for certain sets of values we
Guido van Rossum557dea11997-12-22 22:46:52 +00002564 have to replicate this bracket up to that many times. If brastackptr is
2565 0 this is an unmatched bracket which will generate an error, but take care
2566 not to try to access brastack[-1]. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002567
2568 case ')':
2569 length += 3;
2570 {
Guido van Rossum557dea11997-12-22 22:46:52 +00002571 int minval = 1;
2572 int maxval = 1;
2573 int duplength = (brastackptr > 0)? length - brastack[--brastackptr] : 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002574
2575 /* Leave ptr at the final char; for read_repeat_counts this happens
2576 automatically; for the others we need an increment. */
2577
Guido van Rossum50700601997-12-08 17:15:20 +00002578 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002579 {
Guido van Rossum557dea11997-12-22 22:46:52 +00002580 ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002581 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2582 }
Guido van Rossum557dea11997-12-22 22:46:52 +00002583 else if (c == '*') { minval = 0; maxval = -1; ptr++; }
2584 else if (c == '+') { maxval = -1; ptr++; }
2585 else if (c == '?') { minval = 0; ptr++; }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002586
Guido van Rossum557dea11997-12-22 22:46:52 +00002587 /* If there is a minimum > 1 we have to replicate up to minval-1 times;
2588 if there is a limited maximum we have to replicate up to maxval-1 times
2589 and allow for a BRAZERO item before each optional copy, as we also have
2590 to do before the first copy if the minimum is zero. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002591
Guido van Rossum557dea11997-12-22 22:46:52 +00002592 if (minval == 0) length++;
2593 else if (minval > 1) length += (minval - 1) * duplength;
2594 if (maxval > minval) length += (maxval - minval) * (duplength + 1);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002595 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002596 continue;
2597
2598 /* Non-special character. For a run of such characters the length required
2599 is the number of characters + 2, except that the maximum run length is 255.
2600 We won't get a skipped space or a non-data escape or the start of a #
2601 comment as the first character, so the length can't be zero. */
2602
2603 NORMAL_CHAR:
2604 default:
2605 length += 2;
2606 runlength = 0;
2607 do
2608 {
2609 if ((pcre_ctypes[c] & ctype_space) != 0)
2610 {
Guido van Rossum50700601997-12-08 17:15:20 +00002611 if ((options & PCRE_EXTENDED) != 0) continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002612 spaces++;
2613 }
2614
Guido van Rossum50700601997-12-08 17:15:20 +00002615 if (c == '#' && (options & PCRE_EXTENDED) != 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002616 {
2617 while ((c = *(++ptr)) != 0 && c != '\n');
2618 continue;
2619 }
2620
2621 /* Backslash may introduce a data char or a metacharacter; stop the
2622 string before the latter. */
2623
2624 if (c == '\\')
2625 {
Guido van Rossum58132c61997-12-17 00:24:13 +00002626 const uschar *saveptr = ptr;
Guido van Rossum50700601997-12-08 17:15:20 +00002627 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002628 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2629 if (c < 0) { ptr = saveptr; break; }
2630 }
2631
2632 /* Ordinary character or single-char escape */
2633
2634 runlength++;
2635 }
2636
2637 /* This "while" is the end of the "do" above. */
2638
2639 while (runlength < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0);
2640
2641 ptr--;
2642 length += runlength;
2643 continue;
2644 }
2645 }
2646
2647length += 4; /* For final KET and END */
2648
2649if (length > 65539)
2650 {
Guido van Rossum50700601997-12-08 17:15:20 +00002651 *errorptr = ERR20;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002652 return NULL;
2653 }
2654
2655/* Compute the size of data block needed and get it, either from malloc or
Guido van Rossum557dea11997-12-22 22:46:52 +00002656externally provided function. We specify "code[0]" in the offsetof() expression
2657rather than just "code", because it has been reported that one broken compiler
2658fails on "code" because it is also an independent variable. It should make no
2659difference to the value of the offsetof(). */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002660
Guido van Rossum557dea11997-12-22 22:46:52 +00002661size = length + offsetof(real_pcre, code[0]);
Guido van Rossum50700601997-12-08 17:15:20 +00002662re = (real_pcre *)(pcre_malloc)(size+50);
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002663
2664if (re == NULL)
2665 {
Guido van Rossum50700601997-12-08 17:15:20 +00002666 *errorptr = ERR21;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002667 return NULL;
2668 }
2669
Guido van Rossum557dea11997-12-22 22:46:52 +00002670/* Put in the magic number and the options. */
2671
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002672re->magic_number = MAGIC_NUMBER;
2673re->options = options;
2674
2675/* Set up a starting, non-extracting bracket, then compile the expression. On
2676error, *errorptr will be set non-NULL, so we don't need to look at the result
2677of the function here. */
2678
Guido van Rossum58132c61997-12-17 00:24:13 +00002679ptr = (const uschar *)pattern;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002680code = re->code;
2681*code = OP_BRA;
Guido van Rossum50700601997-12-08 17:15:20 +00002682bracount = 0;
2683(void)compile_regex(options, &bracount, &code, &ptr, errorptr, dictionary);
2684re->top_bracket = bracount;
2685re->top_backref = top_backref;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002686
2687/* If not reached end of pattern on success, there's an excess bracket. */
2688
Guido van Rossum50700601997-12-08 17:15:20 +00002689if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
2690
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002691/* Fill in the terminating state and check for disastrous overflow, but
2692if debugging, leave the test till after things are printed out. */
2693
2694*code++ = OP_END;
2695
Guido van Rossum50700601997-12-08 17:15:20 +00002696
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002697#ifndef DEBUG
Guido van Rossum50700601997-12-08 17:15:20 +00002698if (code - re->code > length) *errorptr = ERR23;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002699#endif
2700
2701/* Failed to compile */
2702
2703if (*errorptr != NULL)
2704 {
2705 (pcre_free)(re);
2706 PCRE_ERROR_RETURN:
Guido van Rossum58132c61997-12-17 00:24:13 +00002707 *erroroffset = ptr - (const uschar *)pattern;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002708 return NULL;
2709 }
2710
2711/* If the anchored option was not passed, set flag if we can determine that it
2712is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if
2713we can determine what the first character has to be, because that speeds up
2714unanchored matches no end. In the case of multiline matches, an alternative is
2715to set the PCRE_STARTLINE flag if all branches start with ^. */
2716
2717if ((options & PCRE_ANCHORED) == 0)
2718 {
2719 if (is_anchored(re->code, (options & PCRE_MULTILINE) != 0))
2720 re->options |= PCRE_ANCHORED;
2721 else
2722 {
Guido van Rossum557dea11997-12-22 22:46:52 +00002723 int ch = find_firstchar(re->code);
2724 if (ch >= 0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002725 {
Guido van Rossum557dea11997-12-22 22:46:52 +00002726 re->first_char = ch;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002727 re->options |= PCRE_FIRSTSET;
2728 }
2729 else if (is_startline(re->code))
2730 re->options |= PCRE_STARTLINE;
2731 }
2732 }
2733
2734/* Print out the compiled data for debugging */
2735
2736#ifdef DEBUG
2737
Guido van Rossum50700601997-12-08 17:15:20 +00002738printf("Length = %d top_bracket = %d top_backref=%d\n",
2739 length, re->top_bracket, re->top_backref);
2740
2741if (re->options != 0)
2742 {
Guido van Rossumdda66961998-05-07 15:32:44 +00002743 printf("%s%s%s%s%s%s%s%s\n",
Guido van Rossum50700601997-12-08 17:15:20 +00002744 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
2745 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
2746 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
2747 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
2748 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
2749 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
Guido van Rossumdda66961998-05-07 15:32:44 +00002750 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
2751 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
Guido van Rossum50700601997-12-08 17:15:20 +00002752 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002753
2754if ((re->options & PCRE_FIRSTSET) != 0)
2755 {
2756 if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
2757 else printf("First char = \\x%02x\n", re->first_char);
2758 }
2759
2760code_end = code;
2761code_base = code = re->code;
2762
2763while (code < code_end)
2764 {
2765 int charlength;
2766
2767 printf("%3d ", code - code_base);
2768
2769 if (*code >= OP_BRA)
2770 {
2771 printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
2772 code += 2;
2773 }
2774
2775 else switch(*code)
2776 {
2777 case OP_CHARS:
2778 charlength = *(++code);
2779 printf("%3d ", charlength);
2780 while (charlength-- > 0)
2781 if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
2782 break;
2783
2784 case OP_KETRMAX:
2785 case OP_KETRMIN:
2786 case OP_ALT:
2787 case OP_KET:
2788 case OP_ASSERT:
2789 case OP_ASSERT_NOT:
Guido van Rossum50700601997-12-08 17:15:20 +00002790 case OP_ONCE:
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002791 printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2792 code += 2;
2793 break;
2794
2795 case OP_STAR:
2796 case OP_MINSTAR:
2797 case OP_PLUS:
2798 case OP_MINPLUS:
2799 case OP_QUERY:
2800 case OP_MINQUERY:
2801 case OP_TYPESTAR:
2802 case OP_TYPEMINSTAR:
2803 case OP_TYPEPLUS:
2804 case OP_TYPEMINPLUS:
2805 case OP_TYPEQUERY:
2806 case OP_TYPEMINQUERY:
2807 if (*code >= OP_TYPESTAR)
2808 printf(" %s", OP_names[code[1]]);
2809 else if (isprint(c = code[1])) printf(" %c", c);
2810 else printf(" \\x%02x", c);
2811 printf("%s", OP_names[*code++]);
2812 break;
2813
2814 case OP_EXACT:
2815 case OP_UPTO:
2816 case OP_MINUPTO:
2817 if (isprint(c = code[3])) printf(" %c{", c);
2818 else printf(" \\x%02x{", c);
Guido van Rossum557dea11997-12-22 22:46:52 +00002819 if (*code != OP_EXACT) printf("0,");
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002820 printf("%d}", (code[1] << 8) + code[2]);
2821 if (*code == OP_MINUPTO) printf("?");
2822 code += 3;
2823 break;
2824
2825 case OP_TYPEEXACT:
2826 case OP_TYPEUPTO:
2827 case OP_TYPEMINUPTO:
2828 printf(" %s{", OP_names[code[3]]);
2829 if (*code != OP_TYPEEXACT) printf(",");
2830 printf("%d}", (code[1] << 8) + code[2]);
2831 if (*code == OP_TYPEMINUPTO) printf("?");
2832 code += 3;
2833 break;
2834
Guido van Rossum50700601997-12-08 17:15:20 +00002835 case OP_NOT:
2836 if (isprint(c = *(++code))) printf(" [^%c]", c);
2837 else printf(" [^\\x%02x]", c);
2838 break;
2839
2840 case OP_NOTSTAR:
2841 case OP_NOTMINSTAR:
2842 case OP_NOTPLUS:
2843 case OP_NOTMINPLUS:
2844 case OP_NOTQUERY:
2845 case OP_NOTMINQUERY:
2846 if (isprint(c = code[1])) printf(" [^%c]", c);
2847 else printf(" [^\\x%02x]", c);
2848 printf("%s", OP_names[*code++]);
2849 break;
2850
2851 case OP_NOTEXACT:
2852 case OP_NOTUPTO:
2853 case OP_NOTMINUPTO:
2854 if (isprint(c = code[3])) printf(" [^%c]{", c);
2855 else printf(" [^\\x%02x]{", c);
2856 if (*code != OP_NOTEXACT) printf(",");
2857 printf("%d}", (code[1] << 8) + code[2]);
2858 if (*code == OP_NOTMINUPTO) printf("?");
2859 code += 3;
2860 break;
2861
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002862 case OP_REF:
2863 printf(" \\%d", *(++code));
Guido van Rossum557dea11997-12-22 22:46:52 +00002864 code ++;
2865 goto CLASS_REF_REPEAT;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002866
2867 case OP_CLASS:
Guido van Rossum042ff9e1998-04-03 21:13:31 +00002868 case OP_NEGCLASS:
Guido van Rossum50700601997-12-08 17:15:20 +00002869 case OP_CLASS_L:
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002870 {
2871 int i, min, max;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002872
Guido van Rossum50700601997-12-08 17:15:20 +00002873 if (*code==OP_CLASS_L)
2874 {
2875 code++;
2876 printf("Locflag = %i ", *code++);
Guido van Rossum042ff9e1998-04-03 21:13:31 +00002877 printf(" [");
Guido van Rossum50700601997-12-08 17:15:20 +00002878 }
2879 else
Guido van Rossum042ff9e1998-04-03 21:13:31 +00002880 {
2881 if (*code++ == OP_CLASS) printf(" [");
2882 else printf(" ^[");
2883 }
2884
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002885
Guido van Rossum50700601997-12-08 17:15:20 +00002886 for (i = 0; i < 256; i++)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002887 {
Guido van Rossum50700601997-12-08 17:15:20 +00002888 if ((code[i/8] & (1 << (i&7))) != 0)
2889 {
2890 int j;
2891 for (j = i+1; j < 256; j++)
2892 if ((code[j/8] & (1 << (j&7))) == 0) break;
2893 if (i == '-' || i == ']') printf("\\");
2894 if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
2895 if (--j > i)
2896 {
2897 printf("-");
2898 if (j == '-' || j == ']') printf("\\");
2899 if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
2900 }
2901 i = j;
2902 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002903 }
2904 printf("]");
Guido van Rossum50700601997-12-08 17:15:20 +00002905 code += 32;
2906 /* code ++;*/
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002907
Guido van Rossum557dea11997-12-22 22:46:52 +00002908 CLASS_REF_REPEAT:
2909
Guido van Rossum50700601997-12-08 17:15:20 +00002910 switch(*code)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002911 {
2912 case OP_CRSTAR:
2913 case OP_CRMINSTAR:
2914 case OP_CRPLUS:
2915 case OP_CRMINPLUS:
2916 case OP_CRQUERY:
2917 case OP_CRMINQUERY:
2918 printf("%s", OP_names[*code]);
2919 break;
2920
2921 case OP_CRRANGE:
2922 case OP_CRMINRANGE:
2923 min = (code[1] << 8) + code[2];
2924 max = (code[3] << 8) + code[4];
2925 if (max == 0) printf("{%d,}", min);
2926 else printf("{%d,%d}", min, max);
2927 if (*code == OP_CRMINRANGE) printf("?");
2928 code += 4;
2929 break;
2930
2931 default:
2932 code--;
2933 }
2934 }
2935 break;
2936
2937 /* Anything else is just a one-node item */
2938
2939 default:
2940 printf(" %s", OP_names[*code]);
2941 break;
2942 }
2943
2944 code++;
2945 printf("\n");
2946 }
2947printf("------------------------------------------------------------------\n");
2948
2949/* This check is done here in the debugging case so that the code that
2950was compiled can be seen. */
2951
2952if (code - re->code > length)
2953 {
Guido van Rossum50700601997-12-08 17:15:20 +00002954 printf("length=%i, code length=%i\n", length, code-re->code);
2955 *errorptr = ERR23;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002956 (pcre_free)(re);
2957 *erroroffset = ptr - (uschar *)pattern;
2958 return NULL;
2959 }
2960#endif
2961
2962return (pcre *)re;
2963}
2964
2965
2966
2967/*************************************************
2968* Match a character type *
2969*************************************************/
2970
2971/* Not used in all the places it might be as it's sometimes faster
2972to put the code inline.
2973
2974Arguments:
2975 type the character type
2976 c the character
Guido van Rossum50700601997-12-08 17:15:20 +00002977 dotall the dotall flag
Guido van Rossum51b3aa31997-10-06 14:43:11 +00002978
2979Returns: TRUE if character is of the type
2980*/
2981
2982static BOOL
2983match_type(int type, int c, BOOL dotall)
2984{
2985
2986#ifdef DEBUG
2987if (isprint(c)) printf("matching subject %c against ", c);
2988 else printf("matching subject \\x%02x against ", c);
2989printf("%s\n", OP_names[type]);
2990#endif
2991
2992switch(type)
2993 {
2994 case OP_ANY: return dotall || c != '\n';
2995 case OP_NOT_DIGIT: return (pcre_ctypes[c] & ctype_digit) == 0;
2996 case OP_DIGIT: return (pcre_ctypes[c] & ctype_digit) != 0;
2997 case OP_NOT_WHITESPACE: return (pcre_ctypes[c] & ctype_space) == 0;
2998 case OP_WHITESPACE: return (pcre_ctypes[c] & ctype_space) != 0;
2999 case OP_NOT_WORDCHAR: return (pcre_ctypes[c] & ctype_word) == 0;
3000 case OP_WORDCHAR: return (pcre_ctypes[c] & ctype_word) != 0;
Guido van Rossum58132c61997-12-17 00:24:13 +00003001 case OP_NOT_WORDCHAR_L: return (c!='_' && !isalnum(c));
3002 case OP_WORDCHAR_L: return (c=='_' || isalnum(c));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003003 }
3004return FALSE;
3005}
3006
3007
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003008
3009/*************************************************
3010* Match a back-reference *
3011*************************************************/
3012
3013/* If a back reference hasn't been set, the match fails.
3014
3015Arguments:
3016 number reference number
3017 eptr points into the subject
3018 length length to be matched
3019 md points to match data block
3020
3021Returns: TRUE if matched
3022*/
3023
3024static BOOL
Guido van Rossum58132c61997-12-17 00:24:13 +00003025match_ref(int number, register const uschar *eptr, int length, match_data *md)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003026{
Guido van Rossum58132c61997-12-17 00:24:13 +00003027const uschar *p = md->start_subject + md->offset_vector[number];
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003028
3029#ifdef DEBUG
3030if (eptr >= md->end_subject)
3031 printf("matching subject <null>");
3032else
3033 {
3034 printf("matching subject ");
3035 pchars(eptr, length, TRUE, md);
3036 }
3037printf(" against backref ");
3038pchars(p, length, FALSE, md);
3039printf("\n");
3040#endif
3041
3042/* Always fail if not enough characters left */
3043
3044if (length > md->end_subject - p) return FALSE;
3045
Guido van Rossum58132c61997-12-17 00:24:13 +00003046/* Separate the caseless case for speed */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003047
3048if (md->caseless)
3049 { while (length-- > 0) if (pcre_lcc[*p++] != pcre_lcc[*eptr++]) return FALSE; }
3050else
3051 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
3052
3053return TRUE;
3054}
3055
3056static int free_stack(match_data *md)
3057{
3058/* Free any stack space that was allocated by the call to match(). */
3059if (md->off_num) free(md->off_num);
3060if (md->offset_top) free(md->offset_top);
3061if (md->r1) free(md->r1);
3062if (md->r2) free(md->r2);
Guido van Rossum39b0f891998-04-10 21:52:06 +00003063if (md->eptr) free((char *)md->eptr);
3064if (md->ecode) free((char *)md->ecode);
Guido van Rossumc3861071997-10-08 02:07:40 +00003065return 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003066}
3067
3068static int grow_stack(match_data *md)
3069{
Guido van Rossum50700601997-12-08 17:15:20 +00003070 if (md->length != 0)
3071 {
3072 md->length = md->length + md->length/2;
3073 }
3074 else
3075 {
3076 int string_len = md->end_subject - md->start_subject + 1;
3077 if (string_len < 80) {md->length = string_len; }
3078 else {md->length = 80;}
3079 }
3080 PyMem_RESIZE(md->offset_top, int, md->length);
Guido van Rossum58132c61997-12-17 00:24:13 +00003081 PyMem_RESIZE(md->eptr, const uschar *, md->length);
3082 PyMem_RESIZE(md->ecode, const uschar *, md->length);
Guido van Rossum50700601997-12-08 17:15:20 +00003083 PyMem_RESIZE(md->off_num, int, md->length);
3084 PyMem_RESIZE(md->r1, int, md->length);
3085 PyMem_RESIZE(md->r2, int, md->length);
3086 if (md->offset_top == NULL || md->eptr == NULL || md->ecode == NULL ||
3087 md->off_num == NULL || md->r1 == NULL || md->r2 == NULL)
3088 {
Guido van Rossumdda66961998-05-07 15:32:44 +00003089 PyErr_NoMemory();
Guido van Rossum50700601997-12-08 17:15:20 +00003090 longjmp(md->error_env, 1);
3091 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003092 return 0;
3093}
3094
Guido van Rossum50700601997-12-08 17:15:20 +00003095
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003096/*************************************************
3097* Match from current position *
3098*************************************************/
3099
3100/* On entry ecode points to the first opcode, and eptr to the first character.
3101
3102Arguments:
3103 eptr pointer in subject
3104 ecode position in code
3105 offset_top current top pointer
3106 md pointer to "static" info for the match
3107
3108Returns: TRUE if matched
3109*/
3110
3111static BOOL
Guido van Rossum58132c61997-12-17 00:24:13 +00003112match(register const uschar *eptr, register const uschar *ecode, int offset_top,
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003113 match_data *md)
3114{
3115 int save_stack_position = md->point;
3116match_loop:
3117
3118#define SUCCEED goto succeed
3119#define FAIL goto fail
3120
3121for (;;)
3122 {
3123 int min, max, ctype;
3124 register int i;
3125 register int c;
Guido van Rossum58132c61997-12-17 00:24:13 +00003126 BOOL minimize = FALSE;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003127
3128 /* Opening bracket. Check the alternative branches in turn, failing if none
3129 match. We have to set the start offset if required and there is space
3130 in the offset vector so that it is available for subsequent back references
3131 if the bracket matches. However, if the bracket fails, we must put back the
3132 previous value of both offsets in case they were set by a previous copy of
3133 the same bracket. Don't worry about setting the flag for the error case here;
3134 that is handled in the code for KET. */
3135
3136 if ((int)*ecode >= OP_BRA)
3137 {
3138 int number = (*ecode - OP_BRA) << 1;
Guido van Rossum58132c61997-12-17 00:24:13 +00003139 int save_offset1 = 0, save_offset2 = 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003140
Guido van Rossum557dea11997-12-22 22:46:52 +00003141 DPRINTF(("start bracket %d\n", number/2));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003142
3143 if (number > 0 && number < md->offset_end)
3144 {
3145 save_offset1 = md->offset_vector[number];
3146 save_offset2 = md->offset_vector[number+1];
3147 md->offset_vector[number] = eptr - md->start_subject;
3148
Guido van Rossum557dea11997-12-22 22:46:52 +00003149 DPRINTF(("saving %d %d\n", save_offset1, save_offset2));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003150 }
3151
3152 /* Recurse for all the alternatives. */
3153
3154 do
3155 {
3156 if (match(eptr, ecode+3, offset_top, md)) SUCCEED;
3157 ecode += (ecode[1] << 8) + ecode[2];
3158 }
3159 while (*ecode == OP_ALT);
3160
Guido van Rossum557dea11997-12-22 22:46:52 +00003161 DPRINTF(("bracket %d failed\n", number/2));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003162
3163 if (number > 0 && number < md->offset_end)
3164 {
3165 md->offset_vector[number] = save_offset1;
3166 md->offset_vector[number+1] = save_offset2;
3167 }
3168
3169 FAIL;
3170 }
3171
3172 /* Other types of node can be handled by a switch */
3173
3174 switch(*ecode)
3175 {
3176 case OP_END:
3177 md->end_match_ptr = eptr; /* Record where we ended */
3178 md->end_offset_top = offset_top; /* and how many extracts were taken */
3179 SUCCEED;
3180
Guido van Rossum50700601997-12-08 17:15:20 +00003181 /* The equivalent of Prolog's "cut" - if the rest doesn't match, the
3182 whole thing doesn't match, so we have to get out via a longjmp(). */
3183
3184 case OP_CUT:
3185 if (match(eptr, ecode+1, offset_top, md)) SUCCEED;
3186 longjmp(md->fail_env, 1);
3187
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003188 /* Assertion brackets. Check the alternative branches in turn - the
3189 matching won't pass the KET for an assertion. If any one branch matches,
3190 the assertion is true. */
3191
3192 case OP_ASSERT:
3193 do
3194 {
3195 if (match(eptr, ecode+3, offset_top, md)) break;
3196 ecode += (ecode[1] << 8) + ecode[2];
3197 }
3198 while (*ecode == OP_ALT);
3199 if (*ecode == OP_KET) FAIL;
3200
3201 /* Continue from after the assertion, updating the offsets high water
3202 mark, since extracts may have been taken during the assertion. */
3203
3204 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3205 ecode += 3;
3206 offset_top = md->end_offset_top;
3207 continue;
3208
3209 /* Negative assertion: all branches must fail to match */
3210
3211 case OP_ASSERT_NOT:
3212 do
3213 {
3214 if (match(eptr, ecode+3, offset_top, md)) FAIL;
3215 ecode += (ecode[1] << 8) + ecode[2];
3216 }
3217 while (*ecode == OP_ALT);
3218 ecode += 3;
3219 continue;
3220
Guido van Rossum50700601997-12-08 17:15:20 +00003221 /* "Once" brackets are like assertion brackets except that after a match,
3222 the point in the subject string is not moved back. Thus there can never be
3223 a move back into the brackets. Check the alternative branches in turn - the
3224 matching won't pass the KET for this kind of subpattern. If any one branch
3225 matches, we carry on, leaving the subject pointer. */
3226
3227 case OP_ONCE:
3228 do
3229 {
3230 if (match(eptr, ecode+3, offset_top, md)) break;
3231 ecode += (ecode[1] << 8) + ecode[2];
3232 }
3233 while (*ecode == OP_ALT);
Guido van Rossum557dea11997-12-22 22:46:52 +00003234 if (*ecode == OP_KET) FAIL;
Guido van Rossum50700601997-12-08 17:15:20 +00003235
3236 /* Continue as from after the assertion, updating the offsets high water
3237 mark, since extracts may have been taken. */
3238
3239 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3240 ecode += 3;
3241 offset_top = md->end_offset_top;
3242 eptr = md->end_match_ptr;
3243 continue;
3244
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003245 /* An alternation is the end of a branch; scan along to find the end of the
3246 bracketed group and go to there. */
3247
3248 case OP_ALT:
3249 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3250 break;
3251
3252 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
3253 that it may occur zero times. It may repeat infinitely, or not at all -
3254 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
3255 repeat limits are compiled as a number of copies, with the optional ones
3256 preceded by BRAZERO or BRAMINZERO. */
3257
3258 case OP_BRAZERO:
3259 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003260 const uschar *next = ecode+1;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003261 if (match(eptr, next, offset_top, md)) SUCCEED;
3262 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3263 ecode = next + 3;
3264 }
3265 break;
3266
3267 case OP_BRAMINZERO:
3268 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003269 const uschar *next = ecode+1;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003270 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3271 if (match(eptr, next+3, offset_top, md)) SUCCEED;
3272 ecode++;
3273 }
3274 break;;
3275
3276 /* End of a group, repeated or non-repeating. If we are at the end of
3277 an assertion "group", stop matching and SUCCEED, but record the
3278 current high water mark for use by positive assertions. */
3279
3280 case OP_KET:
3281 case OP_KETRMIN:
3282 case OP_KETRMAX:
3283 {
Guido van Rossum50700601997-12-08 17:15:20 +00003284 int number;
Guido van Rossum58132c61997-12-17 00:24:13 +00003285 const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003286
Guido van Rossum50700601997-12-08 17:15:20 +00003287 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT || *prev == OP_ONCE)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003288 {
Guido van Rossum50700601997-12-08 17:15:20 +00003289 md->end_match_ptr = eptr; /* For ONCE */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003290 md->end_offset_top = offset_top;
3291 SUCCEED;
3292 }
3293
3294 /* In all other cases we have to check the group number back at the
3295 start and if necessary complete handling an extraction by setting the
3296 final offset and bumping the high water mark. */
3297
3298 number = (*prev - OP_BRA) << 1;
3299
Guido van Rossum557dea11997-12-22 22:46:52 +00003300 DPRINTF(("end bracket %d\n", number/2));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003301
3302 if (number > 0)
3303 {
3304 if (number >= md->offset_end) md->offset_overflow = TRUE; else
3305 {
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003306 md->offset_vector[number+1] = eptr - md->start_subject;
3307 if (offset_top <= number) offset_top = number + 2;
3308 }
3309 }
3310
3311 /* For a non-repeating ket, just advance to the next node and continue at
3312 this level. */
3313
3314 if (*ecode == OP_KET)
3315 {
3316 ecode += 3;
3317 break;
3318 }
3319
3320 /* The repeating kets try the rest of the pattern or restart from the
3321 preceding bracket, in the appropriate order. */
3322
3323 if (*ecode == OP_KETRMIN)
3324 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003325 const uschar *ptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003326 if (match(eptr, ecode+3, offset_top, md)) goto succeed;
3327 /* Handle alternation inside the BRA...KET; push the additional
Guido van Rossum58132c61997-12-17 00:24:13 +00003328 alternatives onto the stack */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003329 ptr=prev;
3330 do {
3331 ptr += (ptr[1]<<8)+ ptr[2];
3332 if (*ptr==OP_ALT)
3333 {
Guido van Rossum50700601997-12-08 17:15:20 +00003334 if (md->length == md->point)
3335 {
3336 grow_stack(md);
3337 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003338 md->offset_top[md->point] = offset_top;
3339 md->eptr[md->point] = eptr;
3340 md->ecode[md->point] = ptr+3;
3341 md->r1[md->point] = 0;
3342 md->r2[md->point] = 0;
3343 md->off_num[md->point] = 0;
3344 md->point++;
3345 }
3346 } while (*ptr==OP_ALT);
3347 ecode=prev+3; goto match_loop;
3348 }
3349 else /* OP_KETRMAX */
3350 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003351 const uschar *ptr;
3352 /*int points_pushed=0;*/
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003353
3354 /* Push one failure point, that will resume matching at the code after
3355 the KETRMAX opcode. */
Guido van Rossum50700601997-12-08 17:15:20 +00003356 if (md->length == md->point)
3357 {
3358 grow_stack(md);
3359 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003360 md->offset_top[md->point] = offset_top;
3361 md->eptr[md->point] = eptr;
3362 md->ecode[md->point] = ecode+3;
3363 md->r1[md->point] = md->offset_vector[number];
3364 md->r2[md->point] = md->offset_vector[number+1];
3365 md->off_num[md->point] = number;
3366 md->point++;
3367
3368 md->offset_vector[number] = eptr - md->start_subject;
3369 /* Handle alternation inside the BRA...KET; push each of the
Guido van Rossum58132c61997-12-17 00:24:13 +00003370 additional alternatives onto the stack */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003371 ptr=prev;
3372 do {
3373 ptr += (ptr[1]<<8)+ ptr[2];
3374 if (*ptr==OP_ALT)
3375 {
Guido van Rossum50700601997-12-08 17:15:20 +00003376 if (md->length == md->point)
3377 if (md->length == md->point)
3378 {
3379 grow_stack(md);
3380 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003381 md->offset_top[md->point] = offset_top;
3382 md->eptr[md->point] = eptr;
3383 md->ecode[md->point] = ptr+3;
3384 md->r1[md->point] = 0;
3385 md->r2[md->point] = 0;
3386 md->off_num[md->point] = 0;
3387 md->point++;
Guido van Rossum58132c61997-12-17 00:24:13 +00003388 /*points_pushed++;*/
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003389 }
3390 } while (*ptr==OP_ALT);
3391 /* Jump to the first (or only) alternative and resume trying to match */
3392 ecode=prev+3; goto match_loop;
3393 }
3394 }
Guido van Rossum58132c61997-12-17 00:24:13 +00003395 break;
3396
Guido van Rossum50700601997-12-08 17:15:20 +00003397 /* Start of subject unless notbol, or after internal newline if multiline */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003398
3399 case OP_CIRC:
Guido van Rossum50700601997-12-08 17:15:20 +00003400 if (md->notbol && eptr == md->start_subject) FAIL;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003401 if (md->multiline)
3402 {
3403 if (eptr != md->start_subject && eptr[-1] != '\n') FAIL;
3404 ecode++;
3405 break;
3406 }
3407 /* ... else fall through */
3408
3409 /* Start of subject assertion */
3410
3411 case OP_SOD:
3412 if (eptr != md->start_subject) FAIL;
3413 ecode++;
3414 break;
3415
Guido van Rossum50700601997-12-08 17:15:20 +00003416 /* Assert before internal newline if multiline, or before
3417 a terminating newline unless endonly is set, else end of subject unless
3418 noteol is set. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003419
3420 case OP_DOLL:
Guido van Rossum50700601997-12-08 17:15:20 +00003421 if (md->noteol && eptr >= md->end_subject) FAIL;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003422 if (md->multiline)
3423 {
3424 if (eptr < md->end_subject && *eptr != '\n') FAIL;
3425 ecode++;
3426 break;
3427 }
Guido van Rossum50700601997-12-08 17:15:20 +00003428 else if (!md->endonly)
3429 {
3430 if (eptr < md->end_subject - 1 ||
3431 (eptr == md->end_subject - 1 && *eptr != '\n')) FAIL;
3432 ecode++;
3433 break;
3434 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003435 /* ... else fall through */
3436
3437 /* End of subject assertion */
3438
3439 case OP_EOD:
3440 if (eptr < md->end_subject) FAIL;
3441 ecode++;
3442 break;
3443
3444 /* Word boundary assertions */
3445
3446 case OP_NOT_WORD_BOUNDARY:
3447 case OP_WORD_BOUNDARY:
3448 {
3449 BOOL prev_is_word = (eptr != md->start_subject) &&
3450 ((pcre_ctypes[eptr[-1]] & ctype_word) != 0);
3451 BOOL cur_is_word = (eptr < md->end_subject) &&
3452 ((pcre_ctypes[*eptr] & ctype_word) != 0);
3453 if ((*ecode++ == OP_WORD_BOUNDARY)?
3454 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3455 FAIL;
3456 }
3457 break;
3458
Guido van Rossum50700601997-12-08 17:15:20 +00003459 case OP_NOT_WORD_BOUNDARY_L:
3460 case OP_WORD_BOUNDARY_L:
3461 {
3462 BOOL prev_is_word = (eptr != md->start_subject) &&
Guido van Rossum58132c61997-12-17 00:24:13 +00003463 (isalnum(eptr[-1]) || eptr[-1]=='_');
Guido van Rossum50700601997-12-08 17:15:20 +00003464 BOOL cur_is_word = (eptr < md->end_subject) &&
Guido van Rossum58132c61997-12-17 00:24:13 +00003465 (isalnum(*eptr) || *eptr=='_');
Guido van Rossum50700601997-12-08 17:15:20 +00003466 if ((*ecode++ == OP_WORD_BOUNDARY_L)?
3467 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3468 FAIL;
3469 }
3470 break;
3471
3472
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003473 /* Match a single character type; inline for speed */
3474
3475 case OP_ANY:
3476 if (!md->dotall && eptr < md->end_subject && *eptr == '\n') FAIL;
3477 if (eptr++ >= md->end_subject) FAIL;
3478 ecode++;
3479 break;
3480
3481 case OP_NOT_DIGIT:
3482 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) != 0)
3483 FAIL;
3484 ecode++;
3485 break;
3486
3487 case OP_DIGIT:
3488 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) == 0)
3489 FAIL;
3490 ecode++;
3491 break;
3492
3493 case OP_NOT_WHITESPACE:
3494 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) != 0)
3495 FAIL;
3496 ecode++;
3497 break;
3498
3499 case OP_WHITESPACE:
3500 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) == 0)
3501 FAIL;
3502 ecode++;
3503 break;
3504
3505 case OP_NOT_WORDCHAR:
3506 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) != 0)
3507 FAIL;
3508 ecode++;
3509 break;
3510
3511 case OP_WORDCHAR:
3512 if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) == 0)
3513 FAIL;
3514 ecode++;
3515 break;
3516
Guido van Rossum50700601997-12-08 17:15:20 +00003517 case OP_NOT_WORDCHAR_L:
Guido van Rossum58132c61997-12-17 00:24:13 +00003518 if (eptr >= md->end_subject || (*eptr=='_' || isalnum(*eptr) ))
Guido van Rossum557dea11997-12-22 22:46:52 +00003519 FAIL;
Guido van Rossum50700601997-12-08 17:15:20 +00003520 eptr++;
3521 ecode++;
3522 break;
3523
3524 case OP_WORDCHAR_L:
Guido van Rossum58132c61997-12-17 00:24:13 +00003525 if (eptr >= md->end_subject || (*eptr!='_' && !isalnum(*eptr) ))
Guido van Rossum557dea11997-12-22 22:46:52 +00003526 FAIL;
Guido van Rossum50700601997-12-08 17:15:20 +00003527 eptr++;
3528 ecode++;
3529 break;
3530
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003531 /* Match a back reference, possibly repeatedly. Look past the end of the
3532 item to see if there is repeat information following. The code is similar
3533 to that for character classes, but repeated for efficiency. Then obey
3534 similar code to character type repeats - written out again for speed.
3535 However, if the referenced string is the empty string, always treat
3536 it as matched, any number of times (otherwise there could be infinite
3537 loops). */
3538
3539 case OP_REF:
3540 {
3541 int length;
3542 int number = ecode[1] << 1; /* Doubled reference number */
3543 ecode += 2; /* Advance past the item */
3544
3545 if (number >= offset_top || md->offset_vector[number] < 0)
3546 {
3547 md->errorcode = PCRE_ERROR_BADREF;
3548 FAIL;
3549 }
3550
3551 length = md->offset_vector[number+1] - md->offset_vector[number];
3552
3553 switch (*ecode)
3554 {
3555 case OP_CRSTAR:
3556 case OP_CRMINSTAR:
3557 case OP_CRPLUS:
3558 case OP_CRMINPLUS:
3559 case OP_CRQUERY:
3560 case OP_CRMINQUERY:
3561 c = *ecode++ - OP_CRSTAR;
3562 minimize = (c & 1) != 0;
3563 min = rep_min[c]; /* Pick up values from tables; */
3564 max = rep_max[c]; /* zero for max => infinity */
3565 if (max == 0) max = INT_MAX;
3566 break;
3567
3568 case OP_CRRANGE:
3569 case OP_CRMINRANGE:
3570 minimize = (*ecode == OP_CRMINRANGE);
3571 min = (ecode[1] << 8) + ecode[2];
3572 max = (ecode[3] << 8) + ecode[4];
3573 if (max == 0) max = INT_MAX;
3574 ecode += 5;
3575 break;
3576
3577 default: /* No repeat follows */
3578 if (!match_ref(number, eptr, length, md)) FAIL;
3579 eptr += length;
3580 continue; /* With the main loop */
3581 }
3582
3583 /* If the length of the reference is zero, just continue with the
3584 main loop. */
3585
3586 if (length == 0) continue;
3587
3588 /* First, ensure the minimum number of matches are present. We get back
3589 the length of the reference string explicitly rather than passing the
3590 address of eptr, so that eptr can be a register variable. */
3591
3592 for (i = 1; i <= min; i++)
3593 {
3594 if (!match_ref(number, eptr, length, md)) FAIL;
3595 eptr += length;
3596 }
3597
3598 /* If min = max, continue at the same level without recursion.
3599 They are not both allowed to be zero. */
3600
3601 if (min == max) continue;
3602
3603 /* If minimizing, keep trying and advancing the pointer */
3604
3605 if (minimize)
3606 {
3607 for (i = min;; i++)
3608 {
3609 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3610 if (i >= max || !match_ref(number, eptr, length, md))
3611 FAIL;
3612 eptr += length;
3613 }
3614 /* Control never gets here */
3615 }
3616
3617 /* If maximizing, find the longest string and work backwards */
3618
3619 else
3620 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003621 const uschar *pp = eptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003622 for (i = min; i < max; i++)
3623 {
3624 if (!match_ref(number, eptr, length, md)) break;
3625 eptr += length;
3626 }
3627 while (eptr >= pp)
3628 {
3629 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3630 eptr -= length;
3631 }
3632 FAIL;
3633 }
3634 }
3635 /* Control never gets here */
3636
3637 /* Match a character class, possibly repeatedly. Look past the end of the
3638 item to see if there is repeat information following. Then obey similar
Guido van Rossum50700601997-12-08 17:15:20 +00003639 code to character type repeats - written out again for speed. If caseless
3640 matching was set at runtime but not at compile time, we have to check both
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003641 versions of a character, and we have to behave differently for positive and
3642 negative classes. This is the only time where OP_CLASS and OP_NEGCLASS are
3643 treated differently. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003644
3645 case OP_CLASS:
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003646 case OP_NEGCLASS:
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003647 {
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003648 BOOL nasty_case = *ecode == OP_NEGCLASS && md->runtime_caseless;
Guido van Rossum58132c61997-12-17 00:24:13 +00003649 const uschar *data = ecode + 1; /* Save for matching */
3650 ecode += 33; /* Advance past the item */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003651
3652 switch (*ecode)
3653 {
3654 case OP_CRSTAR:
3655 case OP_CRMINSTAR:
3656 case OP_CRPLUS:
3657 case OP_CRMINPLUS:
3658 case OP_CRQUERY:
3659 case OP_CRMINQUERY:
3660 c = *ecode++ - OP_CRSTAR;
3661 minimize = (c & 1) != 0;
3662 min = rep_min[c]; /* Pick up values from tables; */
3663 max = rep_max[c]; /* zero for max => infinity */
3664 if (max == 0) max = INT_MAX;
3665 break;
3666
3667 case OP_CRRANGE:
3668 case OP_CRMINRANGE:
3669 minimize = (*ecode == OP_CRMINRANGE);
3670 min = (ecode[1] << 8) + ecode[2];
3671 max = (ecode[3] << 8) + ecode[4];
3672 if (max == 0) max = INT_MAX;
3673 ecode += 5;
3674 break;
3675
3676 default: /* No repeat follows */
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003677 min = max = 1;
3678 break;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003679 }
3680
3681 /* First, ensure the minimum number of matches are present. */
3682
3683 for (i = 1; i <= min; i++)
Guido van Rossum50700601997-12-08 17:15:20 +00003684 {
3685 if (eptr >= md->end_subject) FAIL;
3686 c = *eptr++;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003687
3688 /* Either not runtime caseless, or it was a positive class. For
3689 runtime caseless, continue if either case is in the map. */
3690
3691 if (!nasty_case)
Guido van Rossum50700601997-12-08 17:15:20 +00003692 {
Guido van Rossum50700601997-12-08 17:15:20 +00003693 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003694 if (md->runtime_caseless)
3695 {
3696 c = pcre_fcc[c];
3697 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3698 }
Guido van Rossum50700601997-12-08 17:15:20 +00003699 }
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003700
3701 /* Runtime caseless and it was a negative class. Continue only if
3702 both cases are in the map. */
3703
3704 else
3705 {
3706 if ((data[c/8] & (1 << (c&7))) == 0) FAIL;
3707 c = pcre_fcc[c];
3708 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3709 }
3710
3711 FAIL;
Guido van Rossum50700601997-12-08 17:15:20 +00003712 }
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003713
3714 /* If max == min we can continue with the main loop without the
3715 need to recurse. */
3716
3717 if (min == max) continue;
3718
3719 /* If minimizing, keep testing the rest of the expression and advancing
3720 the pointer while it matches the class. */
3721
3722 if (minimize)
3723 {
3724 for (i = min;; i++)
3725 {
3726 if (match(eptr, ecode, offset_top, md)) SUCCEED;
Guido van Rossum50700601997-12-08 17:15:20 +00003727 if (i >= max || eptr >= md->end_subject) FAIL;
3728 c = *eptr++;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003729
3730 /* Either not runtime caseless, or it was a positive class. For
3731 runtime caseless, continue if either case is in the map. */
3732
3733 if (!nasty_case)
Guido van Rossum50700601997-12-08 17:15:20 +00003734 {
Guido van Rossum50700601997-12-08 17:15:20 +00003735 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003736 if (md->runtime_caseless)
3737 {
3738 c = pcre_fcc[c];
3739 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3740 }
Guido van Rossum50700601997-12-08 17:15:20 +00003741 }
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003742
3743 /* Runtime caseless and it was a negative class. Continue only if
3744 both cases are in the map. */
3745
3746 else
3747 {
3748 if ((data[c/8] & (1 << (c&7))) == 0) return FALSE;
3749 c = pcre_fcc[c];
3750 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3751 }
3752
Guido van Rossum50700601997-12-08 17:15:20 +00003753 FAIL;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003754 }
3755 /* Control never gets here */
3756 }
3757
3758 /* If maximizing, find the longest possible run, then work backwards. */
3759
3760 else
3761 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003762 const uschar *pp = eptr;
Guido van Rossum50700601997-12-08 17:15:20 +00003763 for (i = min; i < max; eptr++, i++)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003764 {
Guido van Rossum50700601997-12-08 17:15:20 +00003765 if (eptr >= md->end_subject) break;
3766 c = *eptr;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003767
3768 /* Either not runtime caseless, or it was a positive class. For
3769 runtime caseless, continue if either case is in the map. */
3770
3771 if (!nasty_case)
Guido van Rossum50700601997-12-08 17:15:20 +00003772 {
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003773 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3774 if (md->runtime_caseless)
3775 {
3776 c = pcre_fcc[c];
3777 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3778 }
3779 }
3780
3781 /* Runtime caseless and it was a negative class. Continue only if
3782 both cases are in the map. */
3783
3784 else
3785 {
3786 if ((data[c/8] & (1 << (c&7))) == 0) break;
Guido van Rossum50700601997-12-08 17:15:20 +00003787 c = pcre_fcc[c];
3788 if ((data[c/8] & (1 << (c&7))) != 0) continue;
3789 }
Guido van Rossum042ff9e1998-04-03 21:13:31 +00003790
Guido van Rossum50700601997-12-08 17:15:20 +00003791 break;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003792 }
Guido van Rossum50700601997-12-08 17:15:20 +00003793
3794 while (eptr >= pp)
3795 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
3796 FAIL;
3797 }
3798 }
3799 /* Control never gets here */
3800
3801 /* OP_CLASS_L opcode: handles localized character classes */
3802
3803 case OP_CLASS_L:
3804 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003805 const uschar *data = ecode + 1; /* Save for matching */
3806 const uschar locale_flag = *data;
Guido van Rossum50700601997-12-08 17:15:20 +00003807 ecode++; data++; /* The localization support adds an extra byte */
3808
3809 ecode += 33; /* Advance past the item */
3810
3811 switch (*ecode)
3812 {
3813 case OP_CRSTAR:
3814 case OP_CRMINSTAR:
3815 case OP_CRPLUS:
3816 case OP_CRMINPLUS:
3817 case OP_CRQUERY:
3818 case OP_CRMINQUERY:
3819 c = *ecode++ - OP_CRSTAR;
3820 minimize = (c & 1) != 0;
3821 min = rep_min[c]; /* Pick up values from tables; */
3822 max = rep_max[c]; /* zero for max => infinity */
3823 if (max == 0) max = INT_MAX;
3824 break;
3825
3826 case OP_CRRANGE:
3827 case OP_CRMINRANGE:
3828 minimize = (*ecode == OP_CRMINRANGE);
3829 min = (ecode[1] << 8) + ecode[2];
3830 max = (ecode[3] << 8) + ecode[4];
3831 if (max == 0) max = INT_MAX;
3832 ecode += 5;
3833 break;
3834
3835 default: /* No repeat follows */
3836 if (eptr >= md->end_subject) FAIL;
3837 c = *eptr++;
3838 if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */
Guido van Rossum58132c61997-12-17 00:24:13 +00003839 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3840 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003841#if 0
3842 if ( (locale_flag & 4) && isdigit(c) ) continue; /* Locale \d */
3843 if ( (locale_flag & 8) && !isdigit(c) ) continue; /* Locale \D */
3844 if ( (locale_flag & 16) && isspace(c) ) continue; /* Locale \s */
3845 if ( (locale_flag & 32) && !isspace(c) ) continue; /* Locale \S */
3846#endif
3847
3848 if (md->runtime_caseless)
3849 {
3850 c = pcre_fcc[c];
3851 if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */
3852
Guido van Rossum58132c61997-12-17 00:24:13 +00003853 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3854 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003855 }
3856 FAIL;
3857 }
3858
3859 /* First, ensure the minimum number of matches are present. */
3860
3861 for (i = 1; i <= min; i++)
3862 {
3863 if (eptr >= md->end_subject) FAIL;
3864 c = *eptr++;
3865 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum58132c61997-12-17 00:24:13 +00003866 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3867 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003868
3869 if (md->runtime_caseless)
3870 {
3871 c = pcre_fcc[c];
3872 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum58132c61997-12-17 00:24:13 +00003873 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3874 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003875 }
3876 FAIL;
3877 }
3878
3879 /* If max == min we can continue with the main loop without the
3880 need to recurse. */
3881
3882 if (min == max) continue;
3883
3884 /* If minimizing, keep testing the rest of the expression and advancing
3885 the pointer while it matches the class. */
3886
3887 if (minimize)
3888 {
3889 for (i = min;; i++)
3890 {
3891 if (match(eptr, ecode, offset_top, md)) SUCCEED;
3892 if (i >= max || eptr >= md->end_subject) FAIL;
3893 c = *eptr++;
3894 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum58132c61997-12-17 00:24:13 +00003895 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3896 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003897
3898 if (md->runtime_caseless)
3899 {
3900 c = pcre_fcc[c];
3901 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum58132c61997-12-17 00:24:13 +00003902 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3903 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003904 }
3905 FAIL;
3906 }
3907 /* Control never gets here */
3908 }
3909
3910 /* If maximizing, find the longest possible run, then work backwards. */
3911
3912 else
3913 {
Guido van Rossum58132c61997-12-17 00:24:13 +00003914 const uschar *pp = eptr;
Guido van Rossum50700601997-12-08 17:15:20 +00003915 for (i = min; i < max; eptr++, i++)
3916 {
3917 if (eptr >= md->end_subject) break;
3918 c = *eptr;
3919 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum58132c61997-12-17 00:24:13 +00003920 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3921 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003922 if (md->runtime_caseless)
3923 {
3924 c = pcre_fcc[c];
3925 if ((data[c/8] & (1 << (c&7))) != 0) continue;
Guido van Rossum58132c61997-12-17 00:24:13 +00003926 if ( (locale_flag & 1) && (isalnum(c) || c=='_') ) continue; /* Locale \w */
3927 if ( (locale_flag & 2) && (!isalnum(c) && c!='_') ) continue; /* Locale \W */
Guido van Rossum50700601997-12-08 17:15:20 +00003928 }
3929 break;
3930 }
3931
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003932 while (eptr >= pp)
3933 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
3934 FAIL;
3935 }
3936 }
3937 /* Control never gets here */
3938
3939 /* Match a run of characters */
3940
3941 case OP_CHARS:
3942 {
3943 register int length = ecode[1];
3944 ecode += 2;
3945
Guido van Rossum557dea11997-12-22 22:46:52 +00003946#ifdef DEBUG /* Sigh. Some compilers never learn. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003947 if (eptr >= md->end_subject)
3948 printf("matching subject <null> against pattern ");
3949 else
3950 {
3951 printf("matching subject ");
3952 pchars(eptr, length, TRUE, md);
3953 printf(" against pattern ");
3954 }
3955 pchars(ecode, length, FALSE, md);
3956 printf("\n");
Guido van Rossum57ba4f31997-12-02 20:40:28 +00003957#endif
Guido van Rossum51b3aa31997-10-06 14:43:11 +00003958
3959 if (length > md->end_subject - eptr) FAIL;
3960 if (md->caseless)
3961 {
3962 while (length-- > 0) if (pcre_lcc[*ecode++] != pcre_lcc[*eptr++]) FAIL;
3963 }
3964 else
3965 {
3966 while (length-- > 0) if (*ecode++ != *eptr++) FAIL;
3967 }
3968 }
3969 break;
3970
3971 /* Match a single character repeatedly; different opcodes share code. */
3972
3973 case OP_EXACT:
3974 min = max = (ecode[1] << 8) + ecode[2];
3975 ecode += 3;
3976 goto REPEATCHAR;
3977
3978 case OP_UPTO:
3979 case OP_MINUPTO:
3980 min = 0;
3981 max = (ecode[1] << 8) + ecode[2];
3982 minimize = *ecode == OP_MINUPTO;
3983 ecode += 3;
3984 goto REPEATCHAR;
3985
3986 case OP_STAR:
3987 case OP_MINSTAR:
3988 case OP_PLUS:
3989 case OP_MINPLUS:
3990 case OP_QUERY:
3991 case OP_MINQUERY:
3992 c = *ecode++ - OP_STAR;
3993 minimize = (c & 1) != 0;
3994 min = rep_min[c]; /* Pick up values from tables; */
3995 max = rep_max[c]; /* zero for max => infinity */
3996 if (max == 0) max = INT_MAX;
3997
3998 /* Common code for all repeated single-character matches. We can give
3999 up quickly if there are fewer than the minimum number of characters left in
4000 the subject. */
4001
4002 REPEATCHAR:
4003 if (min > md->end_subject - eptr) FAIL;
4004 c = *ecode++;
4005
4006 /* The code is duplicated for the caseless and caseful cases, for speed,
4007 since matching characters is likely to be quite common. First, ensure the
4008 minimum number of matches are present. If min = max, continue at the same
4009 level without recursing. Otherwise, if minimizing, keep trying the rest of
4010 the expression and advancing one matching character if failing, up to the
4011 maximum. Alternatively, if maximizing, find the maximum number of
4012 characters and work backwards. */
4013
Guido van Rossum557dea11997-12-22 22:46:52 +00004014 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
4015 max, eptr));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004016
4017 if (md->caseless)
4018 {
4019 c = pcre_lcc[c];
4020 for (i = 1; i <= min; i++) if (c != pcre_lcc[*eptr++]) FAIL;
4021 if (min == max) continue;
4022 if (minimize)
4023 {
4024 for (i = min;; i++)
4025 {
4026 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4027 if (i >= max || eptr >= md->end_subject || c != pcre_lcc[*eptr++])
4028 FAIL;
4029 }
4030 /* Control never gets here */
4031 }
4032 else
4033 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004034 const uschar *pp = eptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004035 for (i = min; i < max; i++)
4036 {
4037 if (eptr >= md->end_subject || c != pcre_lcc[*eptr]) break;
4038 eptr++;
4039 }
4040 while (eptr >= pp)
4041 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4042 FAIL;
4043 }
Guido van Rossum50700601997-12-08 17:15:20 +00004044 /* Control never gets here */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004045 }
4046
4047 /* Caseful comparisons */
4048
4049 else
4050 {
4051 for (i = 1; i <= min; i++) if (c != *eptr++) FAIL;
4052 if (min == max) continue;
4053 if (minimize)
4054 {
4055 for (i = min;; i++)
4056 {
4057 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4058 if (i >= max || eptr >= md->end_subject || c != *eptr++) FAIL;
4059 }
4060 /* Control never gets here */
4061 }
4062 else
4063 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004064 const uschar *pp = eptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004065 for (i = min; i < max; i++)
4066 {
4067 if (eptr >= md->end_subject || c != *eptr) break;
4068 eptr++;
4069 }
4070 while (eptr >= pp)
4071 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4072 FAIL;
4073 }
4074 }
4075 /* Control never gets here */
4076
Guido van Rossum50700601997-12-08 17:15:20 +00004077 /* Match a negated single character */
4078
4079 case OP_NOT:
Guido van Rossum557dea11997-12-22 22:46:52 +00004080 if (eptr >= md->end_subject) FAIL;
Guido van Rossum50700601997-12-08 17:15:20 +00004081 ecode++;
4082 if (md->caseless)
4083 {
4084 if (pcre_lcc[*ecode++] == pcre_lcc[*eptr++]) FAIL;
4085 }
4086 else
4087 {
4088 if (*ecode++ == *eptr++) FAIL;
4089 }
4090 break;
4091
4092 /* Match a negated single character repeatedly. This is almost a repeat of
4093 the code for a repeated single character, but I haven't found a nice way of
4094 commoning these up that doesn't require a test of the positive/negative
4095 option for each character match. Maybe that wouldn't add very much to the
4096 time taken, but character matching *is* what this is all about... */
4097
4098 case OP_NOTEXACT:
4099 min = max = (ecode[1] << 8) + ecode[2];
4100 ecode += 3;
4101 goto REPEATNOTCHAR;
4102
4103 case OP_NOTUPTO:
4104 case OP_NOTMINUPTO:
4105 min = 0;
4106 max = (ecode[1] << 8) + ecode[2];
4107 minimize = *ecode == OP_NOTMINUPTO;
4108 ecode += 3;
4109 goto REPEATNOTCHAR;
4110
4111 case OP_NOTSTAR:
4112 case OP_NOTMINSTAR:
4113 case OP_NOTPLUS:
4114 case OP_NOTMINPLUS:
4115 case OP_NOTQUERY:
4116 case OP_NOTMINQUERY:
4117 c = *ecode++ - OP_NOTSTAR;
4118 minimize = (c & 1) != 0;
4119 min = rep_min[c]; /* Pick up values from tables; */
4120 max = rep_max[c]; /* zero for max => infinity */
4121 if (max == 0) max = INT_MAX;
4122
4123 /* Common code for all repeated single-character matches. We can give
4124 up quickly if there are fewer than the minimum number of characters left in
4125 the subject. */
4126
4127 REPEATNOTCHAR:
4128 if (min > md->end_subject - eptr) FAIL;
4129 c = *ecode++;
4130
4131 /* The code is duplicated for the caseless and caseful cases, for speed,
4132 since matching characters is likely to be quite common. First, ensure the
4133 minimum number of matches are present. If min = max, continue at the same
4134 level without recursing. Otherwise, if minimizing, keep trying the rest of
4135 the expression and advancing one matching character if failing, up to the
4136 maximum. Alternatively, if maximizing, find the maximum number of
4137 characters and work backwards. */
4138
Guido van Rossum557dea11997-12-22 22:46:52 +00004139 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
4140 max, eptr));
Guido van Rossum50700601997-12-08 17:15:20 +00004141
4142 if (md->caseless)
4143 {
4144 c = pcre_lcc[c];
4145 for (i = 1; i <= min; i++) if (c == pcre_lcc[*eptr++]) FAIL;
4146 if (min == max) continue;
4147 if (minimize)
4148 {
4149 for (i = min;; i++)
4150 {
4151 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4152 if (i >= max || eptr >= md->end_subject || c == pcre_lcc[*eptr++])
4153 FAIL;
4154 }
4155 /* Control never gets here */
4156 }
4157 else
4158 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004159 const uschar *pp = eptr;
Guido van Rossum50700601997-12-08 17:15:20 +00004160 for (i = min; i < max; i++)
4161 {
4162 if (eptr >= md->end_subject || c == pcre_lcc[*eptr]) break;
4163 eptr++;
4164 }
4165 while (eptr >= pp)
4166 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4167 FAIL;
4168 }
4169 /* Control never gets here */
4170 }
4171
4172 /* Caseful comparisons */
4173
4174 else
4175 {
4176 for (i = 1; i <= min; i++) if (c == *eptr++) FAIL;
4177 if (min == max) continue;
4178 if (minimize)
4179 {
4180 for (i = min;; i++)
4181 {
4182 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4183 if (i >= max || eptr >= md->end_subject || c == *eptr++) FAIL;
4184 }
4185 /* Control never gets here */
4186 }
4187 else
4188 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004189 const uschar *pp = eptr;
Guido van Rossum50700601997-12-08 17:15:20 +00004190 for (i = min; i < max; i++)
4191 {
4192 if (eptr >= md->end_subject || c == *eptr) break;
4193 eptr++;
4194 }
4195 while (eptr >= pp)
4196 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4197 FAIL;
4198 }
4199 }
4200 /* Control never gets here */
4201
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004202 /* Match a single character type repeatedly; several different opcodes
4203 share code. This is very similar to the code for single characters, but we
4204 repeat it in the interests of efficiency. */
4205
4206 case OP_TYPEEXACT:
4207 min = max = (ecode[1] << 8) + ecode[2];
4208 minimize = TRUE;
4209 ecode += 3;
4210 goto REPEATTYPE;
4211
4212 case OP_TYPEUPTO:
4213 case OP_TYPEMINUPTO:
4214 min = 0;
4215 max = (ecode[1] << 8) + ecode[2];
4216 minimize = *ecode == OP_TYPEMINUPTO;
4217 ecode += 3;
4218 goto REPEATTYPE;
4219
4220 case OP_TYPESTAR:
4221 case OP_TYPEMINSTAR:
4222 case OP_TYPEPLUS:
4223 case OP_TYPEMINPLUS:
4224 case OP_TYPEQUERY:
4225 case OP_TYPEMINQUERY:
4226 c = *ecode++ - OP_TYPESTAR;
4227 minimize = (c & 1) != 0;
4228 min = rep_min[c]; /* Pick up values from tables; */
4229 max = rep_max[c]; /* zero for max => infinity */
4230 if (max == 0) max = INT_MAX;
4231
4232 /* Common code for all repeated single character type matches */
4233
4234 REPEATTYPE:
4235 ctype = *ecode++; /* Code for the character type */
4236
4237 /* First, ensure the minimum number of matches are present. Use inline
4238 code for maximizing the speed, and do the type test once at the start
4239 (i.e. keep it out of the loop). Also test that there are at least the
4240 minimum number of characters before we start. */
4241
4242 if (min > md->end_subject - eptr) FAIL;
4243 if (min > 0) switch(ctype)
4244 {
4245 case OP_ANY:
4246 if (!md->dotall)
4247 { for (i = 1; i <= min; i++) if (*eptr++ == '\n') FAIL; }
4248 else eptr += min;
4249 break;
4250
4251 case OP_NOT_DIGIT:
4252 for (i = 1; i <= min; i++)
4253 if ((pcre_ctypes[*eptr++] & ctype_digit) != 0) FAIL;
4254 break;
4255
4256 case OP_DIGIT:
4257 for (i = 1; i <= min; i++)
4258 if ((pcre_ctypes[*eptr++] & ctype_digit) == 0) FAIL;
4259 break;
4260
4261 case OP_NOT_WHITESPACE:
4262 for (i = 1; i <= min; i++)
4263 if ((pcre_ctypes[*eptr++] & ctype_space) != 0) FAIL;
4264 break;
4265
4266 case OP_WHITESPACE:
4267 for (i = 1; i <= min; i++)
4268 if ((pcre_ctypes[*eptr++] & ctype_space) == 0) FAIL;
4269 break;
4270
4271 case OP_NOT_WORDCHAR:
4272 for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) != 0)
4273 FAIL;
4274 break;
4275
4276 case OP_WORDCHAR:
4277 for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) == 0)
4278 FAIL;
4279 break;
Guido van Rossum50700601997-12-08 17:15:20 +00004280
4281 case OP_NOT_WORDCHAR_L:
Guido van Rossum58132c61997-12-17 00:24:13 +00004282 for (i = 1; i <= min; i++, eptr++) if (*eptr=='_' || isalnum(*eptr))
Guido van Rossum557dea11997-12-22 22:46:52 +00004283 FAIL;
Guido van Rossum50700601997-12-08 17:15:20 +00004284 break;
4285
4286 case OP_WORDCHAR_L:
Guido van Rossum58132c61997-12-17 00:24:13 +00004287 for (i = 1; i <= min; i++, eptr++) if (*eptr!='_' && !isalnum(*eptr))
Guido van Rossum557dea11997-12-22 22:46:52 +00004288 FAIL;
Guido van Rossum50700601997-12-08 17:15:20 +00004289 break;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004290 }
4291
4292 /* If min = max, continue at the same level without recursing */
4293
4294 if (min == max) continue;
4295
4296 /* If minimizing, we have to test the rest of the pattern before each
4297 subsequent match, so inlining isn't much help; just use the function. */
4298
4299 if (minimize)
4300 {
4301 for (i = min;; i++)
4302 {
4303 if (match(eptr, ecode, offset_top, md)) SUCCEED;
4304 if (i >= max || eptr >= md->end_subject ||
4305 !match_type(ctype, *eptr++, md->dotall))
4306 FAIL;
4307 }
4308 /* Control never gets here */
4309 }
4310
4311 /* If maximizing it is worth using inline code for speed, doing the type
4312 test once at the start (i.e. keep it out of the loop). */
4313
4314 else
4315 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004316 const uschar *pp = eptr;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004317 switch(ctype)
4318 {
4319 case OP_ANY:
4320 if (!md->dotall)
4321 {
4322 for (i = min; i < max; i++)
4323 {
4324 if (eptr >= md->end_subject || *eptr == '\n') break;
4325 eptr++;
4326 }
4327 }
4328 else
4329 {
4330 c = max - min;
4331 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
4332 eptr += c;
4333 }
4334 break;
4335
4336 case OP_NOT_DIGIT:
4337 for (i = min; i < max; i++)
4338 {
4339 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) != 0)
4340 break;
4341 eptr++;
4342 }
4343 break;
4344
4345 case OP_DIGIT:
4346 for (i = min; i < max; i++)
4347 {
4348 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) == 0)
4349 break;
4350 eptr++;
4351 }
4352 break;
4353
4354 case OP_NOT_WHITESPACE:
4355 for (i = min; i < max; i++)
4356 {
4357 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) != 0)
4358 break;
4359 eptr++;
4360 }
4361 break;
4362
4363 case OP_WHITESPACE:
4364 for (i = min; i < max; i++)
4365 {
4366 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) == 0)
4367 break;
4368 eptr++;
4369 }
4370 break;
4371
4372 case OP_NOT_WORDCHAR:
4373 for (i = min; i < max; i++)
4374 {
4375 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) != 0)
4376 break;
4377 eptr++;
4378 }
4379 break;
4380
4381 case OP_WORDCHAR:
4382 for (i = min; i < max; i++)
4383 {
Guido van Rossum50700601997-12-08 17:15:20 +00004384 if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) == 0)
4385 break;
4386 eptr++;
4387 }
4388 break;
4389 case OP_NOT_WORDCHAR_L:
4390 for (i = min; i < max; i++)
4391 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004392 if (eptr >= md->end_subject || (*eptr=='_' || isalnum(*eptr) ) )
Guido van Rossum50700601997-12-08 17:15:20 +00004393 break;
4394 eptr++;
4395 }
4396 break;
4397
4398 case OP_WORDCHAR_L:
4399 for (i = min; i < max; i++)
4400 {
Guido van Rossum58132c61997-12-17 00:24:13 +00004401 if (eptr >= md->end_subject || (*eptr!='_' && !isalnum(*eptr) ) )
Guido van Rossum50700601997-12-08 17:15:20 +00004402 break;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004403 eptr++;
4404 }
4405 break;
4406 }
4407
4408 while (eptr >= pp)
4409 if (match(eptr--, ecode, offset_top, md)) SUCCEED;
4410 FAIL;
4411 }
4412 /* Control never gets here */
4413
4414 /* There's been some horrible disaster. */
4415
4416 default:
Guido van Rossum557dea11997-12-22 22:46:52 +00004417 DPRINTF(("Unknown opcode %d\n", *ecode));
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004418 md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
4419 FAIL;
4420 }
4421
4422 /* Do not stick any code in here without much thought; it is assumed
4423 that "continue" in the code above comes out to here to repeat the main
4424 loop. */
4425
4426 } /* End of main loop */
4427/* Control never reaches here */
4428
4429fail:
4430 if (md->point > save_stack_position)
4431 {
4432 /* If there are still points remaining on the stack, pop the next one off */
Guido van Rossumc3861071997-10-08 02:07:40 +00004433 int off_num;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004434
4435 md->point--;
4436 offset_top = md->offset_top[md->point];
4437 eptr = md->eptr[md->point];
4438 ecode = md->ecode[md->point];
4439 off_num = md->off_num[md->point];
4440 md->offset_vector[off_num] = md->r1[md->point];
4441 md->offset_vector[off_num+1] = md->r2[md->point];
4442 goto match_loop;
4443 }
4444 /* Failure, and nothing left on the stack, so end this function call */
4445
4446 /* Restore the top of the stack to where it was before this function
4447 call. This lets us use one stack for everything; recursive calls
4448 can push and pop information, and may increase the stack. When
4449 the call returns, the parent function can resume pushing and
4450 popping wherever it was. */
4451
4452 md->point = save_stack_position;
4453 return FALSE;
4454
4455succeed:
4456 return TRUE;
4457}
4458
4459
Guido van Rossum50700601997-12-08 17:15:20 +00004460
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004461/*************************************************
Guido van Rossum557dea11997-12-22 22:46:52 +00004462* Segregate setjmp() *
4463*************************************************/
4464
4465/* The -Wall option of gcc gives warnings for all local variables when setjmp()
4466is used, even if the coding conforms to the rules of ANSI C. To avoid this, we
4467hide it in a separate function. This is called only when PCRE_EXTRA is set,
4468since it's needed only for the extension \X option, and with any luck, a good
4469compiler will spot the tail recursion and compile it efficiently.
4470
4471Arguments:
4472 eptr pointer in subject
4473 ecode position in code
4474 offset_top current top pointer
4475 md pointer to "static" info for the match
4476
4477Returns: TRUE if matched
4478*/
4479
4480static BOOL
4481match_with_setjmp(const uschar *eptr, const uschar *ecode, int offset_top,
4482 match_data *match_block)
4483{
4484return setjmp(match_block->fail_env) == 0 &&
4485 match(eptr, ecode, offset_top, match_block);
4486}
4487
4488
4489
4490/*************************************************
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004491* Execute a Regular Expression *
4492*************************************************/
4493
4494/* This function applies a compiled re to a subject string and picks out
4495portions of the string if it matches. Two elements in the vector are set for
4496each substring: the offsets to the start and end of the substring.
4497
4498Arguments:
Guido van Rossum50700601997-12-08 17:15:20 +00004499 external_re points to the compiled expression
4500 external_extra points to "hints" from pcre_study() or is NULL
4501 subject points to the subject string
4502 length length of subject string (may contain binary zeros)
4503 options option bits
4504 offsets points to a vector of ints to be filled in with offsets
4505 offsetcount the number of elements in the vector
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004506
Guido van Rossum50700601997-12-08 17:15:20 +00004507Returns: > 0 => success; value is the number of elements filled in
4508 = 0 => success, but offsets is not big enough
4509 -1 => failed to match
4510 < -1 => some kind of unexpected problem
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004511*/
4512
4513int
Guido van Rossum50700601997-12-08 17:15:20 +00004514pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
Guido van Rossum816671c1998-03-10 04:55:29 +00004515 const char *subject, int length, int start_pos, int options,
4516 int *offsets, int offsetcount)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004517{
Guido van Rossum58132c61997-12-17 00:24:13 +00004518 /* The "volatile" directives are to make gcc -Wall stop complaining
4519 that these variables can be clobbered by the longjmp. Hopefully
4520 they won't cost too much performance. */
Guido van Rossum042ff9e1998-04-03 21:13:31 +00004521volatile int resetcount, ocount;
4522volatile int first_char = -1;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004523match_data match_block;
Guido van Rossum557dea11997-12-22 22:46:52 +00004524const uschar *start_bits = NULL;
Guido van Rossum816671c1998-03-10 04:55:29 +00004525const uschar *start_match = (const uschar *)subject + start_pos;
Guido van Rossum58132c61997-12-17 00:24:13 +00004526const uschar *end_subject;
4527const real_pcre *re = (const real_pcre *)external_re;
4528const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00004529volatile BOOL using_temporary_offsets = FALSE;
4530volatile BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4531volatile BOOL startline = (re->options & PCRE_STARTLINE) != 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004532
4533if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4534
4535if (re == NULL || subject == NULL ||
4536 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4537if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
4538
Guido van Rossum58132c61997-12-17 00:24:13 +00004539match_block.start_subject = (const uschar *)subject;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004540match_block.end_subject = match_block.start_subject + length;
4541end_subject = match_block.end_subject;
4542
Guido van Rossum50700601997-12-08 17:15:20 +00004543match_block.caseless = ((re->options | options) & PCRE_CASELESS) != 0;
4544match_block.runtime_caseless = match_block.caseless &&
4545 (re->options & PCRE_CASELESS) == 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004546
Guido van Rossum50700601997-12-08 17:15:20 +00004547match_block.multiline = ((re->options | options) & PCRE_MULTILINE) != 0;
4548match_block.dotall = ((re->options | options) & PCRE_DOTALL) != 0;
4549match_block.endonly = ((re->options | options) & PCRE_DOLLAR_ENDONLY) != 0;
4550
4551match_block.notbol = (options & PCRE_NOTBOL) != 0;
4552match_block.noteol = (options & PCRE_NOTEOL) != 0;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004553
4554match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
4555
4556/* Set the stack state to empty */
4557 match_block.off_num = match_block.offset_top = NULL;
4558 match_block.r1 = match_block.r2 = NULL;
4559 match_block.eptr = match_block.ecode = NULL;
4560 match_block.point = match_block.length = 0;
4561
Guido van Rossum50700601997-12-08 17:15:20 +00004562/* If the expression has got more back references than the offsets supplied can
4563hold, we get a temporary bit of working store to use during the matching.
Guido van Rossum557dea11997-12-22 22:46:52 +00004564Otherwise, we can use the vector supplied, rounding down its size to a multiple
4565of 2. */
Guido van Rossum50700601997-12-08 17:15:20 +00004566
Guido van Rossum557dea11997-12-22 22:46:52 +00004567ocount = offsetcount & (-2);
4568if (re->top_backref > 0 && re->top_backref >= ocount/2)
Guido van Rossum50700601997-12-08 17:15:20 +00004569 {
4570 ocount = re->top_backref * 2 + 2;
Guido van Rossum042ff9e1998-04-03 21:13:31 +00004571 match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
Guido van Rossum50700601997-12-08 17:15:20 +00004572 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
Guido van Rossum557dea11997-12-22 22:46:52 +00004573 using_temporary_offsets = TRUE;
4574 DPRINTF(("Got memory to hold back references\n"));
Guido van Rossum50700601997-12-08 17:15:20 +00004575 }
4576else match_block.offset_vector = offsets;
4577
4578match_block.offset_end = ocount;
4579match_block.offset_overflow = FALSE;
4580
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004581/* Compute the minimum number of offsets that we need to reset each time. Doing
4582this makes a huge difference to execution time when there aren't many brackets
4583in the pattern. */
4584
4585resetcount = 2 + re->top_bracket * 2;
Guido van Rossum50700601997-12-08 17:15:20 +00004586if (resetcount > offsetcount) resetcount = ocount;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004587
4588/* If MULTILINE is set at exec time but was not set at compile time, and the
4589anchored flag is set, we must re-check because a setting provoked by ^ in the
4590pattern is not right in multi-line mode. Calling is_anchored() again here does
4591the right check, because multiline is now set. If it now yields FALSE, the
4592expression must have had ^ starting some of its branches. Check to see if
4593that is true for *all* branches, and if so, set the startline flag. */
4594
Guido van Rossum557dea11997-12-22 22:46:52 +00004595if (match_block.multiline && anchored && (re->options & PCRE_MULTILINE) == 0 &&
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004596 !is_anchored(re->code, match_block.multiline))
4597 {
4598 anchored = FALSE;
4599 if (is_startline(re->code)) startline = TRUE;
4600 }
4601
4602/* Set up the first character to match, if available. The first_char value is
4603never set for an anchored regular expression, but the anchoring may be forced
4604at run time, so we have to test for anchoring. The first char may be unset for
4605an unanchored pattern, of course. If there's no first char and the pattern was
4606studied, the may be a bitmap of possible first characters. However, we can
4607use this only if the caseless state of the studying was correct. */
4608
4609if (!anchored)
4610 {
4611 if ((re->options & PCRE_FIRSTSET) != 0)
4612 {
4613 first_char = re->first_char;
4614 if (match_block.caseless) first_char = pcre_lcc[first_char];
4615 }
4616 else
4617 if (!startline && extra != NULL &&
4618 (extra->options & PCRE_STUDY_MAPPED) != 0 &&
4619 ((extra->options & PCRE_STUDY_CASELESS) != 0) == match_block.caseless)
4620 start_bits = extra->start_bits;
4621 }
4622
4623/* Loop for unanchored matches; for anchored regexps the loop runs just once. */
4624
4625do
4626 {
Guido van Rossum557dea11997-12-22 22:46:52 +00004627 int rc;
Guido van Rossum50700601997-12-08 17:15:20 +00004628 register int *iptr = match_block.offset_vector;
4629 register int *iend = iptr + resetcount;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004630
4631 /* Reset the maximum number of extractions we might see. */
4632
4633 while (iptr < iend) *iptr++ = -1;
4634
4635 /* Advance to a unique first char if possible */
4636
4637 if (first_char >= 0)
4638 {
4639 if (match_block.caseless)
4640 while (start_match < end_subject && pcre_lcc[*start_match] != first_char)
4641 start_match++;
4642 else
4643 while (start_match < end_subject && *start_match != first_char)
4644 start_match++;
4645 }
4646
4647 /* Or to just after \n for a multiline match if possible */
4648
4649 else if (startline)
4650 {
4651 if (start_match > match_block.start_subject)
4652 {
4653 while (start_match < end_subject && start_match[-1] != '\n')
4654 start_match++;
4655 }
4656 }
4657
4658 /* Or to a non-unique first char */
4659
4660 else if (start_bits != NULL)
4661 {
4662 while (start_match < end_subject)
4663 {
4664 register int c = *start_match;
Guido van Rossum50700601997-12-08 17:15:20 +00004665 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004666 }
4667 }
4668
Guido van Rossum557dea11997-12-22 22:46:52 +00004669#ifdef DEBUG /* Sigh. Some compilers never learn. */
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004670 printf(">>>> Match against: ");
4671 pchars(start_match, end_subject - start_match, TRUE, &match_block);
4672 printf("\n");
Guido van Rossum57ba4f31997-12-02 20:40:28 +00004673#endif
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004674
4675 /* When a match occurs, substrings will be set for all internal extractions;
4676 we just need to set up the whole thing as substring 0 before returning. If
Guido van Rossum50700601997-12-08 17:15:20 +00004677 there were too many extractions, set the return code to zero. In the case
4678 where we had to get some local store to hold offsets for backreferences, copy
4679 those back references that we can. In this case there need not be overflow
4680 if certain parts of the pattern were not used.
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004681
Guido van Rossum50700601997-12-08 17:15:20 +00004682 Before starting the match, we have to set up a longjmp() target to enable
Guido van Rossum557dea11997-12-22 22:46:52 +00004683 the "cut" operation to fail a match completely without backtracking. This
4684 is done in a separate function to avoid compiler warnings. We need not do
4685 it unless PCRE_EXTRA is set, since only in that case is the "cut" operation
4686 enabled. */
Guido van Rossum50700601997-12-08 17:15:20 +00004687
4688 /* To handle errors such as running out of memory for the failure
4689 stack, we need to save this location via setjmp(), so
4690 error-handling code can call longjmp() to jump out of deeply-nested code. */
4691 if (setjmp(match_block.error_env)==0)
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004692 {
Guido van Rossum50700601997-12-08 17:15:20 +00004693
Guido van Rossum557dea11997-12-22 22:46:52 +00004694 if ((re->options & PCRE_EXTRA) != 0)
Guido van Rossum50700601997-12-08 17:15:20 +00004695 {
Guido van Rossum557dea11997-12-22 22:46:52 +00004696 if (!match_with_setjmp(start_match, re->code, 2, &match_block))
4697 continue;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004698 }
Guido van Rossum557dea11997-12-22 22:46:52 +00004699 else if (!match(start_match, re->code, 2, &match_block)) continue;
4700
4701 /* Copy the offset information from temporary store if necessary */
4702
4703 if (using_temporary_offsets)
4704 {
4705 if (offsetcount >= 4)
4706 {
4707 memcpy(offsets + 2, match_block.offset_vector + 2,
4708 (offsetcount - 2) * sizeof(int));
4709 DPRINTF(("Copied offsets from temporary memory\n"));
4710 }
4711 if (match_block.end_offset_top > offsetcount)
4712 match_block.offset_overflow = TRUE;
4713
4714 DPRINTF(("Freeing temporary memory\n"));
4715 (pcre_free)(match_block.offset_vector);
4716 }
4717
4718 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
4719
4720 if (match_block.offset_end < 2) rc = 0; else
4721 {
4722 offsets[0] = start_match - match_block.start_subject;
4723 offsets[1] = match_block.end_match_ptr - match_block.start_subject;
4724 }
4725
4726 DPRINTF((">>>> returning %d\n", rc));
4727 free_stack(&match_block);
4728 return rc;
Guido van Rossum50700601997-12-08 17:15:20 +00004729 } /* End of (if setjmp(match_block.error_env)...) */
Guido van Rossum042ff9e1998-04-03 21:13:31 +00004730 free_stack(&match_block);
4731
Guido van Rossum50700601997-12-08 17:15:20 +00004732 /* Return an error code; pcremodule.c will preserve the exception */
4733 if (PyErr_Occurred()) return PCRE_ERROR_NOMEMORY;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004734 }
4735while (!anchored &&
4736 match_block.errorcode == PCRE_ERROR_NOMATCH &&
4737 start_match++ < end_subject);
4738
Guido van Rossum557dea11997-12-22 22:46:52 +00004739if (using_temporary_offsets)
4740 {
4741 DPRINTF(("Freeing temporary memory\n"));
4742 (pcre_free)(match_block.offset_vector);
4743 }
4744
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004745#ifdef DEBUG
4746printf(">>>> returning %d\n", match_block.errorcode);
4747#endif
Guido van Rossum50700601997-12-08 17:15:20 +00004748
Guido van Rossum557dea11997-12-22 22:46:52 +00004749 return match_block.errorcode;
Guido van Rossum51b3aa31997-10-06 14:43:11 +00004750}
4751
4752/* End of pcre.c */