blob: e6a541703d2db0172a5b146df8c2991a8258b99e [file] [log] [blame]
Guido van Rossum004c1e11997-05-09 02:35:58 +00001/* regexpr.c
2 *
3 * Author: Tatu Ylonen <ylo@ngs.fi>
4 *
5 * Copyright (c) 1991 Tatu Ylonen, Espoo, Finland
6 *
7 * Permission to use, copy, modify, distribute, and sell this software
8 * and its documentation for any purpose is hereby granted without
9 * fee, provided that the above copyright notice appear in all copies.
10 * This software is provided "as is" without express or implied
11 * warranty.
12 *
13 * Created: Thu Sep 26 17:14:05 1991 ylo
14 * Last modified: Mon Nov 4 17:06:48 1991 ylo
15 * Ported to Think C: 19 Jan 1992 guido@cwi.nl
16 *
17 * This code draws many ideas from the regular expression packages by
18 * Henry Spencer of the University of Toronto and Richard Stallman of
19 * the Free Software Foundation.
20 *
21 * Emacs-specific code and syntax table code is almost directly borrowed
22 * from GNU regexp.
23 *
24 * Bugs fixed and lots of reorganization by Jeffrey C. Ollie, April
25 * 1997 Thanks for bug reports and ideas from Andrew Kuchling, Tim
26 * Peters, Guido van Rossum, Ka-Ping Yee, Sjoerd Mullender, and
27 * probably one or two others that I'm forgetting.
28 *
29 * $Id$ */
Guido van Rossumb674c3b1992-01-19 16:32:47 +000030
Guido van Rossum95e80531997-08-13 22:34:14 +000031#include "Python.h"
Guido van Rossumb674c3b1992-01-19 16:32:47 +000032#include "regexpr.h"
Guido van Rossumb674c3b1992-01-19 16:32:47 +000033
Guido van Rossumdb25f321997-07-10 14:31:32 +000034/* The original code blithely assumed that sizeof(short) == 2. Not
35 * always true. Original instances of "(short)x" were replaced by
36 * SHORT(x), where SHORT is #defined below. */
37
38#define SHORT(x) ((x) & 0x8000 ? (x) - 0x10000 : (x))
39
Guido van Rossum004c1e11997-05-09 02:35:58 +000040/* The stack implementation is taken from an idea by Andrew Kuchling.
41 * It's a doubly linked list of arrays. The advantages of this over a
42 * simple linked list are that the number of mallocs required are
43 * reduced. It also makes it possible to statically allocate enough
44 * space so that small patterns don't ever need to call malloc.
45 *
46 * The advantages over a single array is that is periodically
47 * realloced when more space is needed is that we avoid ever copying
48 * the stack. */
49
50/* item_t is the basic stack element. Defined as a union of
51 * structures so that both registers, failure points, and counters can
52 * be pushed/popped from the stack. There's nothing built into the
53 * item to keep track of whether a certain stack item is a register, a
54 * failure point, or a counter. */
55
56typedef union item_t
57{
Guido van Rossumdb25f321997-07-10 14:31:32 +000058 struct
59 {
60 int num;
61 int level;
Guido van Rossum95e80531997-08-13 22:34:14 +000062 unsigned char *start;
63 unsigned char *end;
Guido van Rossumdb25f321997-07-10 14:31:32 +000064 } reg;
65 struct
66 {
67 int count;
68 int level;
69 int phantom;
Guido van Rossum95e80531997-08-13 22:34:14 +000070 unsigned char *code;
71 unsigned char *text;
Guido van Rossumdb25f321997-07-10 14:31:32 +000072 } fail;
73 struct
74 {
75 int num;
76 int level;
77 int count;
78 } cntr;
Guido van Rossum004c1e11997-05-09 02:35:58 +000079} item_t;
80
81#define STACK_PAGE_SIZE 256
82#define NUM_REGISTERS 256
83
84/* A 'page' of stack items. */
85
86typedef struct item_page_t
87{
Guido van Rossumdb25f321997-07-10 14:31:32 +000088 item_t items[STACK_PAGE_SIZE];
89 struct item_page_t *prev;
90 struct item_page_t *next;
Guido van Rossum004c1e11997-05-09 02:35:58 +000091} item_page_t;
92
93
94typedef struct match_state
95{
Guido van Rossumdb25f321997-07-10 14:31:32 +000096 /* The number of registers that have been pushed onto the stack
97 * since the last failure point. */
Guido van Rossum004c1e11997-05-09 02:35:58 +000098
Guido van Rossumdb25f321997-07-10 14:31:32 +000099 int count;
100
101 /* Used to control when registers need to be pushed onto the
102 * stack. */
103
104 int level;
105
106 /* The number of failure points on the stack. */
107
108 int point;
109
110 /* Storage for the registers. Each register consists of two
111 * pointers to characters. So register N is represented as
112 * start[N] and end[N]. The pointers must be converted to
113 * offsets from the beginning of the string before returning the
114 * registers to the calling program. */
115
Guido van Rossum95e80531997-08-13 22:34:14 +0000116 unsigned char *start[NUM_REGISTERS];
117 unsigned char *end[NUM_REGISTERS];
Guido van Rossumdb25f321997-07-10 14:31:32 +0000118
119 /* Keeps track of whether a register has changed recently. */
120
121 int changed[NUM_REGISTERS];
122
123 /* Structure to encapsulate the stack. */
124 struct
125 {
Thomas Wouters7e474022000-07-16 12:04:32 +0000126 /* index into the current page. If index == 0 and you need
Guido van Rossumdb25f321997-07-10 14:31:32 +0000127 * to pop an item, move to the previous page and set index
128 * = STACK_PAGE_SIZE - 1. Otherwise decrement index to
129 * push a page. If index == STACK_PAGE_SIZE and you need
130 * to push a page move to the next page and set index =
131 * 0. If there is no new next page, allocate a new page
132 * and link it in. Otherwise, increment index to push a
133 * page. */
134
135 int index;
136 item_page_t *current; /* Pointer to the current page. */
137 item_page_t first; /* First page is statically allocated. */
138 } stack;
Guido van Rossum004c1e11997-05-09 02:35:58 +0000139} match_state;
140
Guido van Rossumdb25f321997-07-10 14:31:32 +0000141/* Initialize a state object */
142
143/* #define NEW_STATE(state) \ */
144/* memset(&state, 0, (void *)(&state.stack) - (void *)(&state)); \ */
145/* state.stack.current = &state.stack.first; \ */
146/* state.stack.first.prev = NULL; \ */
147/* state.stack.first.next = NULL; \ */
148/* state.stack.index = 0; \ */
149/* state.level = 1 */
150
151#define NEW_STATE(state, nregs) \
152{ \
153 int i; \
154 for (i = 0; i < nregs; i++) \
155 { \
156 state.start[i] = NULL; \
157 state.end[i] = NULL; \
158 state.changed[i] = 0; \
159 } \
160 state.stack.current = &state.stack.first; \
161 state.stack.first.prev = NULL; \
162 state.stack.first.next = NULL; \
163 state.stack.index = 0; \
164 state.level = 1; \
165 state.count = 0; \
166 state.level = 0; \
167 state.point = 0; \
168}
169
170/* Free any memory that might have been malloc'd */
171
172#define FREE_STATE(state) \
173while(state.stack.first.next != NULL) \
174{ \
175 state.stack.current = state.stack.first.next; \
176 state.stack.first.next = state.stack.current->next; \
177 free(state.stack.current); \
178}
179
Guido van Rossum004c1e11997-05-09 02:35:58 +0000180/* Discard the top 'count' stack items. */
181
182#define STACK_DISCARD(stack, count, on_error) \
183stack.index -= count; \
184while (stack.index < 0) \
185{ \
Guido van Rossumfaf49081997-07-15 01:47:08 +0000186 if (stack.current->prev == NULL) \
187 on_error; \
188 stack.current = stack.current->prev; \
189 stack.index += STACK_PAGE_SIZE; \
Guido van Rossum004c1e11997-05-09 02:35:58 +0000190}
191
192/* Store a pointer to the previous item on the stack. Used to pop an
193 * item off of the stack. */
194
195#define STACK_PREV(stack, top, on_error) \
196if (stack.index == 0) \
197{ \
Guido van Rossumfaf49081997-07-15 01:47:08 +0000198 if (stack.current->prev == NULL) \
199 on_error; \
200 stack.current = stack.current->prev; \
201 stack.index = STACK_PAGE_SIZE - 1; \
Guido van Rossum004c1e11997-05-09 02:35:58 +0000202} \
203else \
Guido van Rossumfaf49081997-07-15 01:47:08 +0000204{ \
205 stack.index--; \
206} \
Guido van Rossum004c1e11997-05-09 02:35:58 +0000207top = &(stack.current->items[stack.index])
208
209/* Store a pointer to the next item on the stack. Used to push an item
210 * on to the stack. */
211
212#define STACK_NEXT(stack, top, on_error) \
213if (stack.index == STACK_PAGE_SIZE) \
214{ \
Guido van Rossumfaf49081997-07-15 01:47:08 +0000215 if (stack.current->next == NULL) \
216 { \
217 stack.current->next = (item_page_t *)malloc(sizeof(item_page_t)); \
218 if (stack.current->next == NULL) \
219 on_error; \
220 stack.current->next->prev = stack.current; \
221 stack.current->next->next = NULL; \
222 } \
223 stack.current = stack.current->next; \
224 stack.index = 0; \
Guido van Rossum004c1e11997-05-09 02:35:58 +0000225} \
226top = &(stack.current->items[stack.index++])
227
228/* Store a pointer to the item that is 'count' items back in the
229 * stack. STACK_BACK(stack, top, 1, on_error) is equivalent to
230 * STACK_TOP(stack, top, on_error). */
231
232#define STACK_BACK(stack, top, count, on_error) \
233{ \
Guido van Rossumfaf49081997-07-15 01:47:08 +0000234 int index; \
235 item_page_t *current; \
236 current = stack.current; \
237 index = stack.index - (count); \
238 while (index < 0) \
239 { \
240 if (current->prev == NULL) \
241 on_error; \
242 current = current->prev; \
243 index += STACK_PAGE_SIZE; \
244 } \
245 top = &(current->items[index]); \
Guido van Rossum004c1e11997-05-09 02:35:58 +0000246}
247
248/* Store a pointer to the top item on the stack. Execute the
249 * 'on_error' code if there are no items on the stack. */
250
251#define STACK_TOP(stack, top, on_error) \
252if (stack.index == 0) \
253{ \
Guido van Rossumfaf49081997-07-15 01:47:08 +0000254 if (stack.current->prev == NULL) \
255 on_error; \
256 top = &(stack.current->prev->items[STACK_PAGE_SIZE - 1]); \
Guido van Rossum004c1e11997-05-09 02:35:58 +0000257} \
258else \
Guido van Rossumfaf49081997-07-15 01:47:08 +0000259{ \
260 top = &(stack.current->items[stack.index - 1]); \
261}
Guido van Rossum004c1e11997-05-09 02:35:58 +0000262
263/* Test to see if the stack is empty */
264
265#define STACK_EMPTY(stack) ((stack.index == 0) && \
266 (stack.current->prev == NULL))
267
Guido van Rossum004c1e11997-05-09 02:35:58 +0000268/* Return the start of register 'reg' */
269
270#define GET_REG_START(state, reg) (state.start[reg])
271
272/* Return the end of register 'reg' */
273
274#define GET_REG_END(state, reg) (state.end[reg])
275
276/* Set the start of register 'reg'. If the state of the register needs
277 * saving, push it on the stack. */
278
279#define SET_REG_START(state, reg, text, on_error) \
280if(state.changed[reg] < state.level) \
281{ \
Guido van Rossumfaf49081997-07-15 01:47:08 +0000282 item_t *item; \
283 STACK_NEXT(state.stack, item, on_error); \
284 item->reg.num = reg; \
285 item->reg.start = state.start[reg]; \
286 item->reg.end = state.end[reg]; \
287 item->reg.level = state.changed[reg]; \
288 state.changed[reg] = state.level; \
289 state.count++; \
Guido van Rossum004c1e11997-05-09 02:35:58 +0000290} \
291state.start[reg] = text
292
293/* Set the end of register 'reg'. If the state of the register needs
294 * saving, push it on the stack. */
295
296#define SET_REG_END(state, reg, text, on_error) \
297if(state.changed[reg] < state.level) \
298{ \
Guido van Rossumfaf49081997-07-15 01:47:08 +0000299 item_t *item; \
300 STACK_NEXT(state.stack, item, on_error); \
301 item->reg.num = reg; \
302 item->reg.start = state.start[reg]; \
303 item->reg.end = state.end[reg]; \
304 item->reg.level = state.changed[reg]; \
305 state.changed[reg] = state.level; \
306 state.count++; \
Guido van Rossum004c1e11997-05-09 02:35:58 +0000307} \
308state.end[reg] = text
309
310#define PUSH_FAILURE(state, xcode, xtext, on_error) \
311{ \
Guido van Rossumfaf49081997-07-15 01:47:08 +0000312 item_t *item; \
313 STACK_NEXT(state.stack, item, on_error); \
314 item->fail.code = xcode; \
315 item->fail.text = xtext; \
316 item->fail.count = state.count; \
317 item->fail.level = state.level; \
318 item->fail.phantom = 0; \
319 state.count = 0; \
320 state.level++; \
321 state.point++; \
Guido van Rossum004c1e11997-05-09 02:35:58 +0000322}
323
324/* Update the last failure point with a new position in the text. */
325
Guido van Rossum004c1e11997-05-09 02:35:58 +0000326#define UPDATE_FAILURE(state, xtext, on_error) \
327{ \
Guido van Rossumfaf49081997-07-15 01:47:08 +0000328 item_t *item; \
329 STACK_BACK(state.stack, item, state.count + 1, on_error); \
330 if (!item->fail.phantom) \
331 { \
332 item_t *item2; \
333 STACK_NEXT(state.stack, item2, on_error); \
334 item2->fail.code = item->fail.code; \
335 item2->fail.text = xtext; \
336 item2->fail.count = state.count; \
337 item2->fail.level = state.level; \
338 item2->fail.phantom = 1; \
339 state.count = 0; \
340 state.level++; \
341 state.point++; \
342 } \
343 else \
344 { \
345 STACK_DISCARD(state.stack, state.count, on_error); \
346 STACK_TOP(state.stack, item, on_error); \
347 item->fail.text = xtext; \
348 state.count = 0; \
349 state.level++; \
350 } \
Guido van Rossum004c1e11997-05-09 02:35:58 +0000351}
352
353#define POP_FAILURE(state, xcode, xtext, on_empty, on_error) \
354{ \
Guido van Rossumfaf49081997-07-15 01:47:08 +0000355 item_t *item; \
356 do \
357 { \
358 while(state.count > 0) \
359 { \
360 STACK_PREV(state.stack, item, on_error); \
361 state.start[item->reg.num] = item->reg.start; \
362 state.end[item->reg.num] = item->reg.end; \
363 state.changed[item->reg.num] = item->reg.level; \
364 state.count--; \
365 } \
366 STACK_PREV(state.stack, item, on_empty); \
367 xcode = item->fail.code; \
368 xtext = item->fail.text; \
369 state.count = item->fail.count; \
370 state.level = item->fail.level; \
371 state.point--; \
372 } \
373 while (item->fail.text == NULL); \
Guido van Rossum004c1e11997-05-09 02:35:58 +0000374}
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000375
376enum regexp_compiled_ops /* opcodes for compiled regexp */
377{
Guido van Rossumfaf49081997-07-15 01:47:08 +0000378 Cend, /* end of pattern reached */
379 Cbol, /* beginning of line */
380 Ceol, /* end of line */
381 Cset, /* character set. Followed by 32 bytes of set. */
382 Cexact, /* followed by a byte to match */
383 Canychar, /* matches any character except newline */
384 Cstart_memory, /* set register start addr (followed by reg number) */
385 Cend_memory, /* set register end addr (followed by reg number) */
386 Cmatch_memory, /* match a duplicate of reg contents (regnum follows)*/
387 Cjump, /* followed by two bytes (lsb,msb) of displacement. */
388 Cstar_jump, /* will change to jump/update_failure_jump at runtime */
389 Cfailure_jump, /* jump to addr on failure */
390 Cupdate_failure_jump, /* update topmost failure point and jump */
391 Cdummy_failure_jump, /* push a dummy failure point and jump */
392 Cbegbuf, /* match at beginning of buffer */
393 Cendbuf, /* match at end of buffer */
394 Cwordbeg, /* match at beginning of word */
395 Cwordend, /* match at end of word */
396 Cwordbound, /* match if at word boundary */
397 Cnotwordbound, /* match if not at word boundary */
398 Csyntaxspec, /* matches syntax code (1 byte follows) */
Guido van Rossum95e80531997-08-13 22:34:14 +0000399 Cnotsyntaxspec, /* matches if syntax code does not match (1 byte follows) */
Guido van Rossumfaf49081997-07-15 01:47:08 +0000400 Crepeat1
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000401};
402
403enum regexp_syntax_op /* syntax codes for plain and quoted characters */
404{
Guido van Rossumfaf49081997-07-15 01:47:08 +0000405 Rend, /* special code for end of regexp */
406 Rnormal, /* normal character */
407 Ranychar, /* any character except newline */
408 Rquote, /* the quote character */
409 Rbol, /* match beginning of line */
410 Reol, /* match end of line */
411 Roptional, /* match preceding expression optionally */
412 Rstar, /* match preceding expr zero or more times */
413 Rplus, /* match preceding expr one or more times */
414 Ror, /* match either of alternatives */
415 Ropenpar, /* opening parenthesis */
416 Rclosepar, /* closing parenthesis */
417 Rmemory, /* match memory register */
418 Rextended_memory, /* \vnn to match registers 10-99 */
419 Ropenset, /* open set. Internal syntax hard-coded below. */
420 /* the following are gnu extensions to "normal" regexp syntax */
421 Rbegbuf, /* beginning of buffer */
422 Rendbuf, /* end of buffer */
423 Rwordchar, /* word character */
424 Rnotwordchar, /* not word character */
425 Rwordbeg, /* beginning of word */
426 Rwordend, /* end of word */
427 Rwordbound, /* word bound */
428 Rnotwordbound, /* not word bound */
429 Rnum_ops
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000430};
431
432static int re_compile_initialized = 0;
433static int regexp_syntax = 0;
Guido van Rossumb6775db1994-08-01 11:34:53 +0000434int re_syntax = 0; /* Exported copy of regexp_syntax */
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000435static unsigned char regexp_plain_ops[256];
436static unsigned char regexp_quoted_ops[256];
437static unsigned char regexp_precedences[Rnum_ops];
438static int regexp_context_indep_ops;
439static int regexp_ansi_sequences;
440
441#define NUM_LEVELS 5 /* number of precedence levels in use */
442#define MAX_NESTING 100 /* max nesting level of operators */
443
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000444#define SYNTAX(ch) re_syntax_table[(unsigned char)(ch)]
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000445
Guido van Rossum95e80531997-08-13 22:34:14 +0000446unsigned char re_syntax_table[256];
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000447
Thomas Woutersf3f33dc2000-07-21 06:00:07 +0000448void re_compile_initialize(void)
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000449{
Guido van Rossumfaf49081997-07-15 01:47:08 +0000450 int a;
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000451
Guido van Rossumfaf49081997-07-15 01:47:08 +0000452 static int syntax_table_inited = 0;
Guido van Rossum74fb3031997-07-17 22:41:38 +0000453
Guido van Rossumfaf49081997-07-15 01:47:08 +0000454 if (!syntax_table_inited)
455 {
456 syntax_table_inited = 1;
457 memset(re_syntax_table, 0, 256);
458 for (a = 'a'; a <= 'z'; a++)
459 re_syntax_table[a] = Sword;
460 for (a = 'A'; a <= 'Z'; a++)
461 re_syntax_table[a] = Sword;
462 for (a = '0'; a <= '9'; a++)
Guido van Rossum52d68321997-08-13 03:21:14 +0000463 re_syntax_table[a] = Sword | Sdigit | Shexdigit;
464 for (a = '0'; a <= '7'; a++)
465 re_syntax_table[a] |= Soctaldigit;
466 for (a = 'A'; a <= 'F'; a++)
467 re_syntax_table[a] |= Shexdigit;
468 for (a = 'a'; a <= 'f'; a++)
469 re_syntax_table[a] |= Shexdigit;
Guido van Rossum74fb3031997-07-17 22:41:38 +0000470 re_syntax_table['_'] = Sword;
471 for (a = 9; a <= 13; a++)
472 re_syntax_table[a] = Swhitespace;
473 re_syntax_table[' '] = Swhitespace;
Guido van Rossumfaf49081997-07-15 01:47:08 +0000474 }
475 re_compile_initialized = 1;
476 for (a = 0; a < 256; a++)
477 {
478 regexp_plain_ops[a] = Rnormal;
479 regexp_quoted_ops[a] = Rnormal;
480 }
481 for (a = '0'; a <= '9'; a++)
482 regexp_quoted_ops[a] = Rmemory;
483 regexp_plain_ops['\134'] = Rquote;
484 if (regexp_syntax & RE_NO_BK_PARENS)
485 {
486 regexp_plain_ops['('] = Ropenpar;
487 regexp_plain_ops[')'] = Rclosepar;
488 }
489 else
490 {
491 regexp_quoted_ops['('] = Ropenpar;
492 regexp_quoted_ops[')'] = Rclosepar;
493 }
494 if (regexp_syntax & RE_NO_BK_VBAR)
495 regexp_plain_ops['\174'] = Ror;
496 else
497 regexp_quoted_ops['\174'] = Ror;
498 regexp_plain_ops['*'] = Rstar;
499 if (regexp_syntax & RE_BK_PLUS_QM)
500 {
501 regexp_quoted_ops['+'] = Rplus;
502 regexp_quoted_ops['?'] = Roptional;
503 }
504 else
505 {
506 regexp_plain_ops['+'] = Rplus;
507 regexp_plain_ops['?'] = Roptional;
508 }
509 if (regexp_syntax & RE_NEWLINE_OR)
510 regexp_plain_ops['\n'] = Ror;
511 regexp_plain_ops['\133'] = Ropenset;
512 regexp_plain_ops['\136'] = Rbol;
513 regexp_plain_ops['$'] = Reol;
514 regexp_plain_ops['.'] = Ranychar;
515 if (!(regexp_syntax & RE_NO_GNU_EXTENSIONS))
516 {
517 regexp_quoted_ops['w'] = Rwordchar;
518 regexp_quoted_ops['W'] = Rnotwordchar;
519 regexp_quoted_ops['<'] = Rwordbeg;
520 regexp_quoted_ops['>'] = Rwordend;
521 regexp_quoted_ops['b'] = Rwordbound;
522 regexp_quoted_ops['B'] = Rnotwordbound;
523 regexp_quoted_ops['`'] = Rbegbuf;
524 regexp_quoted_ops['\''] = Rendbuf;
525 }
526 if (regexp_syntax & RE_ANSI_HEX)
527 regexp_quoted_ops['v'] = Rextended_memory;
528 for (a = 0; a < Rnum_ops; a++)
529 regexp_precedences[a] = 4;
530 if (regexp_syntax & RE_TIGHT_VBAR)
531 {
532 regexp_precedences[Ror] = 3;
533 regexp_precedences[Rbol] = 2;
534 regexp_precedences[Reol] = 2;
535 }
536 else
537 {
538 regexp_precedences[Ror] = 2;
539 regexp_precedences[Rbol] = 3;
540 regexp_precedences[Reol] = 3;
541 }
542 regexp_precedences[Rclosepar] = 1;
543 regexp_precedences[Rend] = 0;
544 regexp_context_indep_ops = (regexp_syntax & RE_CONTEXT_INDEP_OPS) != 0;
545 regexp_ansi_sequences = (regexp_syntax & RE_ANSI_HEX) != 0;
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000546}
547
Peter Schneider-Kamp7d0c71a2000-07-10 13:05:29 +0000548int re_set_syntax(int syntax)
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000549{
Guido van Rossumfaf49081997-07-15 01:47:08 +0000550 int ret;
551
552 ret = regexp_syntax;
553 regexp_syntax = syntax;
554 re_syntax = syntax; /* Exported copy */
555 re_compile_initialize();
556 return ret;
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000557}
558
Peter Schneider-Kamp7d0c71a2000-07-10 13:05:29 +0000559static int hex_char_to_decimal(int ch)
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000560{
Guido van Rossumfaf49081997-07-15 01:47:08 +0000561 if (ch >= '0' && ch <= '9')
562 return ch - '0';
563 if (ch >= 'a' && ch <= 'f')
564 return ch - 'a' + 10;
565 if (ch >= 'A' && ch <= 'F')
566 return ch - 'A' + 10;
567 return 16;
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000568}
569
Peter Schneider-Kamp7d0c71a2000-07-10 13:05:29 +0000570static void re_compile_fastmap_aux(unsigned char *code, int pos,
571 unsigned char *visited,
572 unsigned char *can_be_null,
573 unsigned char *fastmap)
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000574{
Guido van Rossumfaf49081997-07-15 01:47:08 +0000575 int a;
576 int b;
577 int syntaxcode;
578
579 if (visited[pos])
580 return; /* we have already been here */
581 visited[pos] = 1;
582 for (;;)
Guido van Rossum74fb3031997-07-17 22:41:38 +0000583 switch (code[pos++]) {
Guido van Rossumfaf49081997-07-15 01:47:08 +0000584 case Cend:
Guido van Rossum74fb3031997-07-17 22:41:38 +0000585 {
586 *can_be_null = 1;
587 return;
588 }
Guido van Rossumfaf49081997-07-15 01:47:08 +0000589 case Cbol:
590 case Cbegbuf:
591 case Cendbuf:
592 case Cwordbeg:
593 case Cwordend:
594 case Cwordbound:
595 case Cnotwordbound:
596 {
597 for (a = 0; a < 256; a++)
598 fastmap[a] = 1;
599 break;
600 }
601 case Csyntaxspec:
602 {
603 syntaxcode = code[pos++];
604 for (a = 0; a < 256; a++)
Guido van Rossume59d3f81997-12-02 20:39:23 +0000605 if (SYNTAX(a) & syntaxcode)
Guido van Rossumfaf49081997-07-15 01:47:08 +0000606 fastmap[a] = 1;
607 return;
608 }
609 case Cnotsyntaxspec:
610 {
611 syntaxcode = code[pos++];
612 for (a = 0; a < 256; a++)
Guido van Rossume59d3f81997-12-02 20:39:23 +0000613 if (!(SYNTAX(a) & syntaxcode) )
Guido van Rossumfaf49081997-07-15 01:47:08 +0000614 fastmap[a] = 1;
615 return;
616 }
617 case Ceol:
618 {
619 fastmap['\n'] = 1;
620 if (*can_be_null == 0)
621 *can_be_null = 2; /* can match null, but only at end of buffer*/
622 return;
623 }
624 case Cset:
625 {
626 for (a = 0; a < 256/8; a++)
627 if (code[pos + a] != 0)
628 for (b = 0; b < 8; b++)
629 if (code[pos + a] & (1 << b))
630 fastmap[(a << 3) + b] = 1;
631 pos += 256/8;
632 return;
633 }
634 case Cexact:
635 {
636 fastmap[(unsigned char)code[pos]] = 1;
637 return;
638 }
639 case Canychar:
640 {
641 for (a = 0; a < 256; a++)
642 if (a != '\n')
643 fastmap[a] = 1;
644 return;
645 }
646 case Cstart_memory:
647 case Cend_memory:
648 {
649 pos++;
650 break;
651 }
652 case Cmatch_memory:
653 {
654 for (a = 0; a < 256; a++)
655 fastmap[a] = 1;
656 *can_be_null = 1;
657 return;
658 }
659 case Cjump:
660 case Cdummy_failure_jump:
661 case Cupdate_failure_jump:
662 case Cstar_jump:
663 {
664 a = (unsigned char)code[pos++];
665 a |= (unsigned char)code[pos++] << 8;
666 pos += (int)SHORT(a);
667 if (visited[pos])
668 {
669 /* argh... the regexp contains empty loops. This is not
670 good, as this may cause a failure stack overflow when
671 matching. Oh well. */
672 /* this path leads nowhere; pursue other paths. */
673 return;
674 }
675 visited[pos] = 1;
676 break;
677 }
678 case Cfailure_jump:
679 {
680 a = (unsigned char)code[pos++];
681 a |= (unsigned char)code[pos++] << 8;
682 a = pos + (int)SHORT(a);
683 re_compile_fastmap_aux(code, a, visited, can_be_null, fastmap);
684 break;
685 }
686 case Crepeat1:
687 {
688 pos += 2;
689 break;
690 }
691 default:
692 {
Guido van Rossum95e80531997-08-13 22:34:14 +0000693 PyErr_SetString(PyExc_SystemError, "Unknown regex opcode: memory corrupted?");
694 return;
Guido van Rossumfaf49081997-07-15 01:47:08 +0000695 /*NOTREACHED*/
696 }
697 }
Guido van Rossum004c1e11997-05-09 02:35:58 +0000698}
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000699
Peter Schneider-Kamp7d0c71a2000-07-10 13:05:29 +0000700static int re_do_compile_fastmap(unsigned char *buffer, int used, int pos,
701 unsigned char *can_be_null,
702 unsigned char *fastmap)
Guido van Rossum004c1e11997-05-09 02:35:58 +0000703{
Guido van Rossum95e80531997-08-13 22:34:14 +0000704 unsigned char small_visited[512], *visited;
Guido van Rossum004c1e11997-05-09 02:35:58 +0000705
Guido van Rossumfaf49081997-07-15 01:47:08 +0000706 if (used <= sizeof(small_visited))
707 visited = small_visited;
708 else
709 {
710 visited = malloc(used);
711 if (!visited)
712 return 0;
713 }
714 *can_be_null = 0;
715 memset(fastmap, 0, 256);
716 memset(visited, 0, used);
717 re_compile_fastmap_aux(buffer, pos, visited, can_be_null, fastmap);
718 if (visited != small_visited)
719 free(visited);
720 return 1;
Guido van Rossum004c1e11997-05-09 02:35:58 +0000721}
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000722
Peter Schneider-Kamp7d0c71a2000-07-10 13:05:29 +0000723void re_compile_fastmap(regexp_t bufp)
Guido van Rossum004c1e11997-05-09 02:35:58 +0000724{
Guido van Rossumfaf49081997-07-15 01:47:08 +0000725 if (!bufp->fastmap || bufp->fastmap_accurate)
726 return;
727 assert(bufp->used > 0);
728 if (!re_do_compile_fastmap(bufp->buffer,
729 bufp->used,
730 0,
731 &bufp->can_be_null,
732 bufp->fastmap))
733 return;
Guido van Rossum95e80531997-08-13 22:34:14 +0000734 if (PyErr_Occurred()) return;
Guido van Rossumfaf49081997-07-15 01:47:08 +0000735 if (bufp->buffer[0] == Cbol)
736 bufp->anchor = 1; /* begline */
737 else
738 if (bufp->buffer[0] == Cbegbuf)
739 bufp->anchor = 2; /* begbuf */
740 else
741 bufp->anchor = 0; /* none */
742 bufp->fastmap_accurate = 1;
Guido van Rossum004c1e11997-05-09 02:35:58 +0000743}
744
745/*
746 * star is coded as:
747 * 1: failure_jump 2
748 * ... code for operand of star
749 * star_jump 1
750 * 2: ... code after star
751 *
752 * We change the star_jump to update_failure_jump if we can determine
753 * that it is safe to do so; otherwise we change it to an ordinary
754 * jump.
755 *
756 * plus is coded as
757 *
758 * jump 2
759 * 1: failure_jump 3
760 * 2: ... code for operand of plus
761 * star_jump 1
762 * 3: ... code after plus
763 *
764 * For star_jump considerations this is processed identically to star.
765 *
766 */
767
Peter Schneider-Kamp7d0c71a2000-07-10 13:05:29 +0000768static int re_optimize_star_jump(regexp_t bufp, unsigned char *code)
Guido van Rossum004c1e11997-05-09 02:35:58 +0000769{
Guido van Rossum95e80531997-08-13 22:34:14 +0000770 unsigned char map[256];
771 unsigned char can_be_null;
772 unsigned char *p1;
773 unsigned char *p2;
774 unsigned char ch;
Guido van Rossumfaf49081997-07-15 01:47:08 +0000775 int a;
776 int b;
777 int num_instructions = 0;
Guido van Rossum95e80531997-08-13 22:34:14 +0000778
Guido van Rossumfaf49081997-07-15 01:47:08 +0000779 a = (unsigned char)*code++;
780 a |= (unsigned char)*code++ << 8;
781 a = (int)SHORT(a);
782
783 p1 = code + a + 3; /* skip the failure_jump */
Guido van Rossum95e80531997-08-13 22:34:14 +0000784 /* Check that the jump is within the pattern */
785 if (p1<bufp->buffer || bufp->buffer+bufp->used<p1)
786 {
787 PyErr_SetString(PyExc_SystemError, "Regex VM jump out of bounds (failure_jump opt)");
788 return 0;
789 }
790
Guido van Rossumfaf49081997-07-15 01:47:08 +0000791 assert(p1[-3] == Cfailure_jump);
792 p2 = code;
793 /* p1 points inside loop, p2 points to after loop */
794 if (!re_do_compile_fastmap(bufp->buffer, bufp->used,
Guido van Rossumff1ccbf1999-04-10 15:48:23 +0000795 (int)(p2 - bufp->buffer),
796 &can_be_null, map))
Guido van Rossumfaf49081997-07-15 01:47:08 +0000797 goto make_normal_jump;
798
799 /* If we might introduce a new update point inside the
800 * loop, we can't optimize because then update_jump would
801 * update a wrong failure point. Thus we have to be
802 * quite careful here.
803 */
804
805 /* loop until we find something that consumes a character */
Guido van Rossum004c1e11997-05-09 02:35:58 +0000806 loop_p1:
Guido van Rossumfaf49081997-07-15 01:47:08 +0000807 num_instructions++;
808 switch (*p1++)
809 {
810 case Cbol:
811 case Ceol:
812 case Cbegbuf:
813 case Cendbuf:
814 case Cwordbeg:
815 case Cwordend:
816 case Cwordbound:
817 case Cnotwordbound:
818 {
819 goto loop_p1;
820 }
821 case Cstart_memory:
822 case Cend_memory:
823 {
824 p1++;
825 goto loop_p1;
826 }
827 case Cexact:
828 {
829 ch = (unsigned char)*p1++;
830 if (map[(int)ch])
831 goto make_normal_jump;
832 break;
833 }
834 case Canychar:
835 {
836 for (b = 0; b < 256; b++)
837 if (b != '\n' && map[b])
838 goto make_normal_jump;
839 break;
840 }
841 case Cset:
842 {
843 for (b = 0; b < 256; b++)
844 if ((p1[b >> 3] & (1 << (b & 7))) && map[b])
845 goto make_normal_jump;
846 p1 += 256/8;
847 break;
848 }
849 default:
850 {
851 goto make_normal_jump;
852 }
853 }
854 /* now we know that we can't backtrack. */
855 while (p1 != p2 - 3)
856 {
857 num_instructions++;
858 switch (*p1++)
859 {
860 case Cend:
861 {
862 return 0;
863 }
864 case Cbol:
865 case Ceol:
866 case Canychar:
867 case Cbegbuf:
868 case Cendbuf:
869 case Cwordbeg:
870 case Cwordend:
871 case Cwordbound:
872 case Cnotwordbound:
873 {
874 break;
875 }
876 case Cset:
877 {
878 p1 += 256/8;
879 break;
880 }
881 case Cexact:
882 case Cstart_memory:
883 case Cend_memory:
884 case Cmatch_memory:
885 case Csyntaxspec:
886 case Cnotsyntaxspec:
887 {
888 p1++;
889 break;
890 }
891 case Cjump:
892 case Cstar_jump:
893 case Cfailure_jump:
894 case Cupdate_failure_jump:
895 case Cdummy_failure_jump:
896 {
897 goto make_normal_jump;
898 }
899 default:
900 {
901 return 0;
Guido van Rossumfaf49081997-07-15 01:47:08 +0000902 }
903 }
904 }
905
Guido van Rossum95e80531997-08-13 22:34:14 +0000906 /* make_update_jump: */
Guido van Rossumfaf49081997-07-15 01:47:08 +0000907 code -= 3;
908 a += 3; /* jump to after the Cfailure_jump */
909 code[0] = Cupdate_failure_jump;
910 code[1] = a & 0xff;
911 code[2] = a >> 8;
912 if (num_instructions > 1)
913 return 1;
914 assert(num_instructions == 1);
915 /* if the only instruction matches a single character, we can do
916 * better */
917 p1 = code + 3 + a; /* start of sole instruction */
918 if (*p1 == Cset || *p1 == Cexact || *p1 == Canychar ||
919 *p1 == Csyntaxspec || *p1 == Cnotsyntaxspec)
920 code[0] = Crepeat1;
921 return 1;
922
Guido van Rossum004c1e11997-05-09 02:35:58 +0000923 make_normal_jump:
Guido van Rossumfaf49081997-07-15 01:47:08 +0000924 code -= 3;
925 *code = Cjump;
926 return 1;
Guido van Rossum004c1e11997-05-09 02:35:58 +0000927}
928
Peter Schneider-Kamp7d0c71a2000-07-10 13:05:29 +0000929static int re_optimize(regexp_t bufp)
Guido van Rossum004c1e11997-05-09 02:35:58 +0000930{
Guido van Rossum95e80531997-08-13 22:34:14 +0000931 unsigned char *code;
Guido van Rossumfaf49081997-07-15 01:47:08 +0000932
933 code = bufp->buffer;
934
935 while(1)
936 {
937 switch (*code++)
938 {
939 case Cend:
940 {
941 return 1;
942 }
943 case Canychar:
944 case Cbol:
945 case Ceol:
946 case Cbegbuf:
947 case Cendbuf:
948 case Cwordbeg:
949 case Cwordend:
950 case Cwordbound:
951 case Cnotwordbound:
952 {
953 break;
954 }
955 case Cset:
956 {
957 code += 256/8;
958 break;
959 }
960 case Cexact:
961 case Cstart_memory:
962 case Cend_memory:
963 case Cmatch_memory:
964 case Csyntaxspec:
965 case Cnotsyntaxspec:
966 {
967 code++;
968 break;
969 }
970 case Cstar_jump:
971 {
972 if (!re_optimize_star_jump(bufp, code))
973 {
974 return 0;
975 }
976 /* fall through */
977 }
978 case Cupdate_failure_jump:
979 case Cjump:
980 case Cdummy_failure_jump:
981 case Cfailure_jump:
982 case Crepeat1:
983 {
984 code += 2;
985 break;
986 }
987 default:
988 {
989 return 0;
990 }
991 }
992 }
Guido van Rossum004c1e11997-05-09 02:35:58 +0000993}
994
995#define NEXTCHAR(var) \
996{ \
Guido van Rossumfaf49081997-07-15 01:47:08 +0000997 if (pos >= size) \
998 goto ends_prematurely; \
999 (var) = regex[pos]; \
1000 pos++; \
Guido van Rossum004c1e11997-05-09 02:35:58 +00001001}
1002
1003#define ALLOC(amount) \
1004{ \
Guido van Rossumfaf49081997-07-15 01:47:08 +00001005 if (pattern_offset+(amount) > alloc) \
1006 { \
1007 alloc += 256 + (amount); \
1008 pattern = realloc(pattern, alloc); \
1009 if (!pattern) \
1010 goto out_of_memory; \
1011 } \
Guido van Rossum004c1e11997-05-09 02:35:58 +00001012}
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001013
1014#define STORE(ch) pattern[pattern_offset++] = (ch)
1015
1016#define CURRENT_LEVEL_START (starts[starts_base + current_level])
1017
1018#define SET_LEVEL_START starts[starts_base + current_level] = pattern_offset
1019
Guido van Rossum004c1e11997-05-09 02:35:58 +00001020#define PUSH_LEVEL_STARTS \
Guido van Rossumfaf49081997-07-15 01:47:08 +00001021if (starts_base < (MAX_NESTING-1)*NUM_LEVELS) \
1022 starts_base += NUM_LEVELS; \
1023else \
1024 goto too_complex \
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001025
1026#define POP_LEVEL_STARTS starts_base -= NUM_LEVELS
1027
Guido van Rossum004c1e11997-05-09 02:35:58 +00001028#define PUT_ADDR(offset,addr) \
1029{ \
Guido van Rossumfaf49081997-07-15 01:47:08 +00001030 int disp = (addr) - (offset) - 2; \
1031 pattern[(offset)] = disp & 0xff; \
1032 pattern[(offset)+1] = (disp>>8) & 0xff; \
Guido van Rossum004c1e11997-05-09 02:35:58 +00001033}
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001034
Guido van Rossum004c1e11997-05-09 02:35:58 +00001035#define INSERT_JUMP(pos,type,addr) \
1036{ \
Guido van Rossumfaf49081997-07-15 01:47:08 +00001037 int a, p = (pos), t = (type), ad = (addr); \
1038 for (a = pattern_offset - 1; a >= p; a--) \
1039 pattern[a + 3] = pattern[a]; \
1040 pattern[p] = t; \
1041 PUT_ADDR(p+1,ad); \
1042 pattern_offset += 3; \
Guido van Rossum004c1e11997-05-09 02:35:58 +00001043}
Guido van Rossumfaf49081997-07-15 01:47:08 +00001044
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001045#define SETBIT(buf,offset,bit) (buf)[(offset)+(bit)/8] |= (1<<((bit) & 7))
1046
Guido van Rossum004c1e11997-05-09 02:35:58 +00001047#define SET_FIELDS \
1048{ \
Guido van Rossumfaf49081997-07-15 01:47:08 +00001049 bufp->allocated = alloc; \
1050 bufp->buffer = pattern; \
1051 bufp->used = pattern_offset; \
Guido van Rossum004c1e11997-05-09 02:35:58 +00001052}
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001053
Guido van Rossum004c1e11997-05-09 02:35:58 +00001054#define GETHEX(var) \
1055{ \
Guido van Rossum95e80531997-08-13 22:34:14 +00001056 unsigned char gethex_ch, gethex_value; \
Guido van Rossumfaf49081997-07-15 01:47:08 +00001057 NEXTCHAR(gethex_ch); \
1058 gethex_value = hex_char_to_decimal(gethex_ch); \
1059 if (gethex_value == 16) \
1060 goto hex_error; \
1061 NEXTCHAR(gethex_ch); \
1062 gethex_ch = hex_char_to_decimal(gethex_ch); \
1063 if (gethex_ch == 16) \
1064 goto hex_error; \
1065 (var) = gethex_value * 16 + gethex_ch; \
Guido van Rossum004c1e11997-05-09 02:35:58 +00001066}
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001067
Guido van Rossumfaf49081997-07-15 01:47:08 +00001068#define ANSI_TRANSLATE(ch) \
Guido van Rossum004c1e11997-05-09 02:35:58 +00001069{ \
Guido van Rossumfaf49081997-07-15 01:47:08 +00001070 switch (ch) \
1071 { \
1072 case 'a': \
1073 case 'A': \
1074 { \
1075 ch = 7; /* audible bell */ \
1076 break; \
1077 } \
1078 case 'b': \
1079 case 'B': \
1080 { \
1081 ch = 8; /* backspace */ \
1082 break; \
1083 } \
1084 case 'f': \
1085 case 'F': \
1086 { \
1087 ch = 12; /* form feed */ \
1088 break; \
1089 } \
1090 case 'n': \
1091 case 'N': \
1092 { \
1093 ch = 10; /* line feed */ \
1094 break; \
1095 } \
1096 case 'r': \
1097 case 'R': \
1098 { \
1099 ch = 13; /* carriage return */ \
1100 break; \
1101 } \
1102 case 't': \
1103 case 'T': \
1104 { \
1105 ch = 9; /* tab */ \
1106 break; \
1107 } \
1108 case 'v': \
1109 case 'V': \
1110 { \
1111 ch = 11; /* vertical tab */ \
1112 break; \
1113 } \
1114 case 'x': /* hex code */ \
1115 case 'X': \
1116 { \
1117 GETHEX(ch); \
1118 break; \
1119 } \
1120 default: \
1121 { \
1122 /* other characters passed through */ \
1123 if (translate) \
1124 ch = translate[(unsigned char)ch]; \
1125 break; \
1126 } \
1127 } \
Guido van Rossum004c1e11997-05-09 02:35:58 +00001128}
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001129
Peter Schneider-Kamp7d0c71a2000-07-10 13:05:29 +00001130char *re_compile_pattern(unsigned char *regex, int size, regexp_t bufp)
Guido van Rossum004c1e11997-05-09 02:35:58 +00001131{
Guido van Rossumfaf49081997-07-15 01:47:08 +00001132 int a;
1133 int pos;
1134 int op;
1135 int current_level;
1136 int level;
1137 int opcode;
1138 int pattern_offset = 0, alloc;
1139 int starts[NUM_LEVELS * MAX_NESTING];
1140 int starts_base;
1141 int future_jumps[MAX_NESTING];
1142 int num_jumps;
1143 unsigned char ch = '\0';
Guido van Rossum95e80531997-08-13 22:34:14 +00001144 unsigned char *pattern;
1145 unsigned char *translate;
Guido van Rossumfaf49081997-07-15 01:47:08 +00001146 int next_register;
1147 int paren_depth;
1148 int num_open_registers;
1149 int open_registers[RE_NREGS];
1150 int beginning_context;
1151
1152 if (!re_compile_initialized)
1153 re_compile_initialize();
1154 bufp->used = 0;
1155 bufp->fastmap_accurate = 0;
1156 bufp->uses_registers = 1;
1157 bufp->num_registers = 1;
1158 translate = bufp->translate;
1159 pattern = bufp->buffer;
1160 alloc = bufp->allocated;
1161 if (alloc == 0 || pattern == NULL)
1162 {
1163 alloc = 256;
1164 pattern = malloc(alloc);
1165 if (!pattern)
1166 goto out_of_memory;
1167 }
1168 pattern_offset = 0;
1169 starts_base = 0;
1170 num_jumps = 0;
1171 current_level = 0;
1172 SET_LEVEL_START;
1173 num_open_registers = 0;
1174 next_register = 1;
1175 paren_depth = 0;
1176 beginning_context = 1;
1177 op = -1;
1178 /* we use Rend dummy to ensure that pending jumps are updated
1179 (due to low priority of Rend) before exiting the loop. */
1180 pos = 0;
1181 while (op != Rend)
1182 {
1183 if (pos >= size)
1184 op = Rend;
1185 else
1186 {
1187 NEXTCHAR(ch);
1188 if (translate)
1189 ch = translate[(unsigned char)ch];
1190 op = regexp_plain_ops[(unsigned char)ch];
1191 if (op == Rquote)
1192 {
1193 NEXTCHAR(ch);
1194 op = regexp_quoted_ops[(unsigned char)ch];
1195 if (op == Rnormal && regexp_ansi_sequences)
1196 ANSI_TRANSLATE(ch);
1197 }
1198 }
1199 level = regexp_precedences[op];
1200 /* printf("ch='%c' op=%d level=%d current_level=%d
1201 curlevstart=%d\n", ch, op, level, current_level,
1202 CURRENT_LEVEL_START); */
1203 if (level > current_level)
1204 {
1205 for (current_level++; current_level < level; current_level++)
1206 SET_LEVEL_START;
1207 SET_LEVEL_START;
1208 }
1209 else
1210 if (level < current_level)
1211 {
1212 current_level = level;
1213 for (;num_jumps > 0 &&
1214 future_jumps[num_jumps-1] >= CURRENT_LEVEL_START;
1215 num_jumps--)
1216 PUT_ADDR(future_jumps[num_jumps-1], pattern_offset);
1217 }
1218 switch (op)
1219 {
1220 case Rend:
1221 {
1222 break;
1223 }
1224 case Rnormal:
1225 {
1226 normal_char:
1227 opcode = Cexact;
1228 store_opcode_and_arg: /* opcode & ch must be set */
1229 SET_LEVEL_START;
1230 ALLOC(2);
1231 STORE(opcode);
1232 STORE(ch);
1233 break;
1234 }
1235 case Ranychar:
1236 {
1237 opcode = Canychar;
1238 store_opcode:
1239 SET_LEVEL_START;
1240 ALLOC(1);
1241 STORE(opcode);
1242 break;
1243 }
1244 case Rquote:
1245 {
Martin v. Löwis3f19b102002-08-07 16:21:51 +00001246 Py_FatalError("Rquote");
Guido van Rossumfaf49081997-07-15 01:47:08 +00001247 /*NOTREACHED*/
1248 }
1249 case Rbol:
1250 {
Guido van Rossum730806d1998-04-10 22:27:42 +00001251 if (!beginning_context) {
Guido van Rossumfaf49081997-07-15 01:47:08 +00001252 if (regexp_context_indep_ops)
1253 goto op_error;
1254 else
1255 goto normal_char;
Guido van Rossum730806d1998-04-10 22:27:42 +00001256 }
Guido van Rossumfaf49081997-07-15 01:47:08 +00001257 opcode = Cbol;
1258 goto store_opcode;
1259 }
1260 case Reol:
1261 {
1262 if (!((pos >= size) ||
1263 ((regexp_syntax & RE_NO_BK_VBAR) ?
1264 (regex[pos] == '\174') :
1265 (pos+1 < size && regex[pos] == '\134' &&
1266 regex[pos+1] == '\174')) ||
1267 ((regexp_syntax & RE_NO_BK_PARENS)?
1268 (regex[pos] == ')'):
1269 (pos+1 < size && regex[pos] == '\134' &&
Guido van Rossum730806d1998-04-10 22:27:42 +00001270 regex[pos+1] == ')')))) {
Guido van Rossumfaf49081997-07-15 01:47:08 +00001271 if (regexp_context_indep_ops)
1272 goto op_error;
1273 else
1274 goto normal_char;
Guido van Rossum730806d1998-04-10 22:27:42 +00001275 }
Guido van Rossumfaf49081997-07-15 01:47:08 +00001276 opcode = Ceol;
1277 goto store_opcode;
1278 /* NOTREACHED */
1279 break;
1280 }
1281 case Roptional:
1282 {
Guido van Rossum730806d1998-04-10 22:27:42 +00001283 if (beginning_context) {
Guido van Rossumfaf49081997-07-15 01:47:08 +00001284 if (regexp_context_indep_ops)
1285 goto op_error;
1286 else
1287 goto normal_char;
Guido van Rossum730806d1998-04-10 22:27:42 +00001288 }
Guido van Rossumfaf49081997-07-15 01:47:08 +00001289 if (CURRENT_LEVEL_START == pattern_offset)
1290 break; /* ignore empty patterns for ? */
1291 ALLOC(3);
1292 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump,
1293 pattern_offset + 3);
1294 break;
1295 }
1296 case Rstar:
1297 case Rplus:
1298 {
Guido van Rossum730806d1998-04-10 22:27:42 +00001299 if (beginning_context) {
Guido van Rossumfaf49081997-07-15 01:47:08 +00001300 if (regexp_context_indep_ops)
1301 goto op_error;
1302 else
1303 goto normal_char;
Guido van Rossum730806d1998-04-10 22:27:42 +00001304 }
Guido van Rossumfaf49081997-07-15 01:47:08 +00001305 if (CURRENT_LEVEL_START == pattern_offset)
1306 break; /* ignore empty patterns for + and * */
1307 ALLOC(9);
1308 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump,
1309 pattern_offset + 6);
1310 INSERT_JUMP(pattern_offset, Cstar_jump, CURRENT_LEVEL_START);
1311 if (op == Rplus) /* jump over initial failure_jump */
1312 INSERT_JUMP(CURRENT_LEVEL_START, Cdummy_failure_jump,
1313 CURRENT_LEVEL_START + 6);
1314 break;
1315 }
1316 case Ror:
1317 {
1318 ALLOC(6);
1319 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump,
1320 pattern_offset + 6);
1321 if (num_jumps >= MAX_NESTING)
1322 goto too_complex;
1323 STORE(Cjump);
1324 future_jumps[num_jumps++] = pattern_offset;
1325 STORE(0);
1326 STORE(0);
1327 SET_LEVEL_START;
1328 break;
1329 }
1330 case Ropenpar:
1331 {
1332 SET_LEVEL_START;
1333 if (next_register < RE_NREGS)
1334 {
1335 bufp->uses_registers = 1;
1336 ALLOC(2);
1337 STORE(Cstart_memory);
1338 STORE(next_register);
1339 open_registers[num_open_registers++] = next_register;
1340 bufp->num_registers++;
1341 next_register++;
1342 }
1343 paren_depth++;
1344 PUSH_LEVEL_STARTS;
1345 current_level = 0;
1346 SET_LEVEL_START;
1347 break;
1348 }
1349 case Rclosepar:
1350 {
1351 if (paren_depth <= 0)
1352 goto parenthesis_error;
1353 POP_LEVEL_STARTS;
1354 current_level = regexp_precedences[Ropenpar];
1355 paren_depth--;
1356 if (paren_depth < num_open_registers)
1357 {
1358 bufp->uses_registers = 1;
1359 ALLOC(2);
1360 STORE(Cend_memory);
1361 num_open_registers--;
1362 STORE(open_registers[num_open_registers]);
1363 }
1364 break;
1365 }
1366 case Rmemory:
1367 {
1368 if (ch == '0')
1369 goto bad_match_register;
1370 assert(ch >= '0' && ch <= '9');
1371 bufp->uses_registers = 1;
1372 opcode = Cmatch_memory;
1373 ch -= '0';
1374 goto store_opcode_and_arg;
1375 }
1376 case Rextended_memory:
1377 {
1378 NEXTCHAR(ch);
1379 if (ch < '0' || ch > '9')
1380 goto bad_match_register;
1381 NEXTCHAR(a);
1382 if (a < '0' || a > '9')
1383 goto bad_match_register;
1384 ch = 10 * (a - '0') + ch - '0';
Fredrik Lundhee2f18d2001-07-03 19:27:05 +00001385 if (ch == 0 || ch >= RE_NREGS)
Guido van Rossumfaf49081997-07-15 01:47:08 +00001386 goto bad_match_register;
1387 bufp->uses_registers = 1;
1388 opcode = Cmatch_memory;
1389 goto store_opcode_and_arg;
1390 }
1391 case Ropenset:
1392 {
1393 int complement;
1394 int prev;
1395 int offset;
1396 int range;
Peter Schneider-Kamp7d0c71a2000-07-10 13:05:29 +00001397 int firstchar;
1398
Guido van Rossumfaf49081997-07-15 01:47:08 +00001399 SET_LEVEL_START;
1400 ALLOC(1+256/8);
1401 STORE(Cset);
1402 offset = pattern_offset;
1403 for (a = 0; a < 256/8; a++)
1404 STORE(0);
1405 NEXTCHAR(ch);
1406 if (translate)
1407 ch = translate[(unsigned char)ch];
1408 if (ch == '\136')
1409 {
1410 complement = 1;
1411 NEXTCHAR(ch);
1412 if (translate)
1413 ch = translate[(unsigned char)ch];
1414 }
1415 else
1416 complement = 0;
1417 prev = -1;
1418 range = 0;
1419 firstchar = 1;
1420 while (ch != '\135' || firstchar)
1421 {
1422 firstchar = 0;
1423 if (regexp_ansi_sequences && ch == '\134')
1424 {
1425 NEXTCHAR(ch);
1426 ANSI_TRANSLATE(ch);
1427 }
1428 if (range)
1429 {
1430 for (a = prev; a <= (int)ch; a++)
1431 SETBIT(pattern, offset, a);
1432 prev = -1;
1433 range = 0;
1434 }
1435 else
1436 if (prev != -1 && ch == '-')
1437 range = 1;
1438 else
1439 {
1440 SETBIT(pattern, offset, ch);
1441 prev = ch;
1442 }
1443 NEXTCHAR(ch);
1444 if (translate)
1445 ch = translate[(unsigned char)ch];
1446 }
1447 if (range)
1448 SETBIT(pattern, offset, '-');
1449 if (complement)
1450 {
1451 for (a = 0; a < 256/8; a++)
1452 pattern[offset+a] ^= 0xff;
1453 }
1454 break;
1455 }
1456 case Rbegbuf:
1457 {
1458 opcode = Cbegbuf;
1459 goto store_opcode;
1460 }
1461 case Rendbuf:
1462 {
1463 opcode = Cendbuf;
1464 goto store_opcode;
1465 }
1466 case Rwordchar:
1467 {
1468 opcode = Csyntaxspec;
1469 ch = Sword;
1470 goto store_opcode_and_arg;
1471 }
1472 case Rnotwordchar:
1473 {
1474 opcode = Cnotsyntaxspec;
1475 ch = Sword;
1476 goto store_opcode_and_arg;
1477 }
1478 case Rwordbeg:
1479 {
1480 opcode = Cwordbeg;
1481 goto store_opcode;
1482 }
1483 case Rwordend:
1484 {
1485 opcode = Cwordend;
1486 goto store_opcode;
1487 }
1488 case Rwordbound:
1489 {
1490 opcode = Cwordbound;
1491 goto store_opcode;
1492 }
1493 case Rnotwordbound:
1494 {
1495 opcode = Cnotwordbound;
1496 goto store_opcode;
1497 }
1498 default:
1499 {
1500 abort();
1501 }
1502 }
1503 beginning_context = (op == Ropenpar || op == Ror);
1504 }
1505 if (starts_base != 0)
1506 goto parenthesis_error;
1507 assert(num_jumps == 0);
1508 ALLOC(1);
1509 STORE(Cend);
1510 SET_FIELDS;
1511 if(!re_optimize(bufp))
Guido van Rossumd19c04a1997-09-03 00:47:36 +00001512 return "Optimization error";
Guido van Rossumfaf49081997-07-15 01:47:08 +00001513 return NULL;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001514
Guido van Rossum004c1e11997-05-09 02:35:58 +00001515 op_error:
Guido van Rossumfaf49081997-07-15 01:47:08 +00001516 SET_FIELDS;
Guido van Rossumd19c04a1997-09-03 00:47:36 +00001517 return "Badly placed special character";
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001518
Guido van Rossum004c1e11997-05-09 02:35:58 +00001519 bad_match_register:
Guido van Rossumfaf49081997-07-15 01:47:08 +00001520 SET_FIELDS;
Guido van Rossumd19c04a1997-09-03 00:47:36 +00001521 return "Bad match register number";
Guido van Rossum004c1e11997-05-09 02:35:58 +00001522
1523 hex_error:
Guido van Rossumfaf49081997-07-15 01:47:08 +00001524 SET_FIELDS;
Guido van Rossumd19c04a1997-09-03 00:47:36 +00001525 return "Bad hexadecimal number";
Guido van Rossum004c1e11997-05-09 02:35:58 +00001526
1527 parenthesis_error:
Guido van Rossumfaf49081997-07-15 01:47:08 +00001528 SET_FIELDS;
Guido van Rossumd19c04a1997-09-03 00:47:36 +00001529 return "Badly placed parenthesis";
Guido van Rossum004c1e11997-05-09 02:35:58 +00001530
1531 out_of_memory:
Guido van Rossumfaf49081997-07-15 01:47:08 +00001532 SET_FIELDS;
Guido van Rossumd19c04a1997-09-03 00:47:36 +00001533 return "Out of memory";
Guido van Rossum004c1e11997-05-09 02:35:58 +00001534
1535 ends_prematurely:
Guido van Rossumfaf49081997-07-15 01:47:08 +00001536 SET_FIELDS;
Guido van Rossumd19c04a1997-09-03 00:47:36 +00001537 return "Regular expression ends prematurely";
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001538
Guido van Rossum004c1e11997-05-09 02:35:58 +00001539 too_complex:
Guido van Rossumfaf49081997-07-15 01:47:08 +00001540 SET_FIELDS;
Guido van Rossumd19c04a1997-09-03 00:47:36 +00001541 return "Regular expression too complex";
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001542}
Guido van Rossum004c1e11997-05-09 02:35:58 +00001543
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001544#undef CHARAT
1545#undef NEXTCHAR
1546#undef GETHEX
1547#undef ALLOC
1548#undef STORE
1549#undef CURRENT_LEVEL_START
1550#undef SET_LEVEL_START
1551#undef PUSH_LEVEL_STARTS
1552#undef POP_LEVEL_STARTS
1553#undef PUT_ADDR
1554#undef INSERT_JUMP
1555#undef SETBIT
1556#undef SET_FIELDS
1557
Guido van Rossum004c1e11997-05-09 02:35:58 +00001558#define PREFETCH if (text == textend) goto fail
1559
1560#define NEXTCHAR(var) \
1561PREFETCH; \
1562var = (unsigned char)*text++; \
1563if (translate) \
Guido van Rossumfaf49081997-07-15 01:47:08 +00001564 var = translate[var]
Guido van Rossum004c1e11997-05-09 02:35:58 +00001565
Peter Schneider-Kamp7d0c71a2000-07-10 13:05:29 +00001566int re_match(regexp_t bufp, unsigned char *string, int size, int pos,
1567 regexp_registers_t old_regs)
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001568{
Guido van Rossum95e80531997-08-13 22:34:14 +00001569 unsigned char *code;
1570 unsigned char *translate;
1571 unsigned char *text;
1572 unsigned char *textstart;
1573 unsigned char *textend;
Guido van Rossumfaf49081997-07-15 01:47:08 +00001574 int a;
1575 int b;
1576 int ch;
1577 int reg;
1578 int match_end;
Guido van Rossum95e80531997-08-13 22:34:14 +00001579 unsigned char *regstart;
1580 unsigned char *regend;
Guido van Rossumfaf49081997-07-15 01:47:08 +00001581 int regsize;
1582 match_state state;
Guido van Rossum004c1e11997-05-09 02:35:58 +00001583
Guido van Rossumfaf49081997-07-15 01:47:08 +00001584 assert(pos >= 0 && size >= 0);
1585 assert(pos <= size);
Guido van Rossum004c1e11997-05-09 02:35:58 +00001586
Guido van Rossumfaf49081997-07-15 01:47:08 +00001587 text = string + pos;
1588 textstart = string;
1589 textend = string + size;
Guido van Rossum004c1e11997-05-09 02:35:58 +00001590
Guido van Rossumfaf49081997-07-15 01:47:08 +00001591 code = bufp->buffer;
Guido van Rossum004c1e11997-05-09 02:35:58 +00001592
Guido van Rossumfaf49081997-07-15 01:47:08 +00001593 translate = bufp->translate;
Guido van Rossum004c1e11997-05-09 02:35:58 +00001594
Guido van Rossumfaf49081997-07-15 01:47:08 +00001595 NEW_STATE(state, bufp->num_registers);
1596
Guido van Rossum004c1e11997-05-09 02:35:58 +00001597 continue_matching:
Guido van Rossumfaf49081997-07-15 01:47:08 +00001598 switch (*code++)
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001599 {
Guido van Rossumfaf49081997-07-15 01:47:08 +00001600 case Cend:
Guido van Rossum004c1e11997-05-09 02:35:58 +00001601 {
Guido van Rossumfaf49081997-07-15 01:47:08 +00001602 match_end = text - textstart;
1603 if (old_regs)
1604 {
1605 old_regs->start[0] = pos;
1606 old_regs->end[0] = match_end;
1607 if (!bufp->uses_registers)
1608 {
1609 for (a = 1; a < RE_NREGS; a++)
1610 {
1611 old_regs->start[a] = -1;
1612 old_regs->end[a] = -1;
1613 }
1614 }
1615 else
1616 {
1617 for (a = 1; a < bufp->num_registers; a++)
1618 {
1619 if ((GET_REG_START(state, a) == NULL) ||
1620 (GET_REG_END(state, a) == NULL))
1621 {
1622 old_regs->start[a] = -1;
1623 old_regs->end[a] = -1;
1624 continue;
1625 }
1626 old_regs->start[a] = GET_REG_START(state, a) - textstart;
1627 old_regs->end[a] = GET_REG_END(state, a) - textstart;
1628 }
1629 for (; a < RE_NREGS; a++)
1630 {
1631 old_regs->start[a] = -1;
1632 old_regs->end[a] = -1;
1633 }
1634 }
1635 }
1636 FREE_STATE(state);
1637 return match_end - pos;
Guido van Rossum004c1e11997-05-09 02:35:58 +00001638 }
Guido van Rossumfaf49081997-07-15 01:47:08 +00001639 case Cbol:
1640 {
1641 if (text == textstart || text[-1] == '\n')
1642 goto continue_matching;
1643 goto fail;
1644 }
1645 case Ceol:
1646 {
1647 if (text == textend || *text == '\n')
1648 goto continue_matching;
1649 goto fail;
1650 }
1651 case Cset:
1652 {
1653 NEXTCHAR(ch);
1654 if (code[ch/8] & (1<<(ch & 7)))
1655 {
1656 code += 256/8;
1657 goto continue_matching;
1658 }
1659 goto fail;
1660 }
1661 case Cexact:
1662 {
1663 NEXTCHAR(ch);
1664 if (ch != (unsigned char)*code++)
1665 goto fail;
1666 goto continue_matching;
1667 }
1668 case Canychar:
1669 {
1670 NEXTCHAR(ch);
1671 if (ch == '\n')
1672 goto fail;
1673 goto continue_matching;
1674 }
1675 case Cstart_memory:
1676 {
1677 reg = *code++;
1678 SET_REG_START(state, reg, text, goto error);
1679 goto continue_matching;
1680 }
1681 case Cend_memory:
1682 {
1683 reg = *code++;
1684 SET_REG_END(state, reg, text, goto error);
1685 goto continue_matching;
1686 }
1687 case Cmatch_memory:
1688 {
1689 reg = *code++;
1690 regstart = GET_REG_START(state, reg);
1691 regend = GET_REG_END(state, reg);
1692 if ((regstart == NULL) || (regend == NULL))
1693 goto fail; /* or should we just match nothing? */
1694 regsize = regend - regstart;
Guido van Rossum004c1e11997-05-09 02:35:58 +00001695
Guido van Rossumfaf49081997-07-15 01:47:08 +00001696 if (regsize > (textend - text))
1697 goto fail;
1698 if(translate)
1699 {
1700 for (; regstart < regend; regstart++, text++)
1701 if (translate[*regstart] != translate[*text])
1702 goto fail;
1703 }
1704 else
1705 for (; regstart < regend; regstart++, text++)
1706 if (*regstart != *text)
1707 goto fail;
1708 goto continue_matching;
Guido van Rossum004c1e11997-05-09 02:35:58 +00001709 }
Guido van Rossumfaf49081997-07-15 01:47:08 +00001710 case Cupdate_failure_jump:
Guido van Rossumdb25f321997-07-10 14:31:32 +00001711 {
Guido van Rossumfaf49081997-07-15 01:47:08 +00001712 UPDATE_FAILURE(state, text, goto error);
1713 /* fall to next case */
Guido van Rossumdb25f321997-07-10 14:31:32 +00001714 }
Guido van Rossumfaf49081997-07-15 01:47:08 +00001715 /* treat Cstar_jump just like Cjump if it hasn't been optimized */
1716 case Cstar_jump:
1717 case Cjump:
1718 {
1719 a = (unsigned char)*code++;
1720 a |= (unsigned char)*code++ << 8;
1721 code += (int)SHORT(a);
Guido van Rossum95e80531997-08-13 22:34:14 +00001722 if (code<bufp->buffer || bufp->buffer+bufp->used<code) {
1723 PyErr_SetString(PyExc_SystemError, "Regex VM jump out of bounds (Cjump)");
1724 FREE_STATE(state);
1725 return -2;
1726 }
Guido van Rossumfaf49081997-07-15 01:47:08 +00001727 goto continue_matching;
1728 }
1729 case Cdummy_failure_jump:
1730 {
Guido van Rossum95e80531997-08-13 22:34:14 +00001731 unsigned char *failuredest;
1732
Guido van Rossumfaf49081997-07-15 01:47:08 +00001733 a = (unsigned char)*code++;
1734 a |= (unsigned char)*code++ << 8;
1735 a = (int)SHORT(a);
1736 assert(*code == Cfailure_jump);
1737 b = (unsigned char)code[1];
1738 b |= (unsigned char)code[2] << 8;
Guido van Rossum95e80531997-08-13 22:34:14 +00001739 failuredest = code + (int)SHORT(b) + 3;
1740 if (failuredest<bufp->buffer || bufp->buffer+bufp->used < failuredest) {
1741 PyErr_SetString(PyExc_SystemError, "Regex VM jump out of bounds (Cdummy_failure_jump failuredest)");
1742 FREE_STATE(state);
1743 return -2;
1744 }
1745 PUSH_FAILURE(state, failuredest, NULL, goto error);
Guido van Rossumfaf49081997-07-15 01:47:08 +00001746 code += a;
Guido van Rossum95e80531997-08-13 22:34:14 +00001747 if (code<bufp->buffer || bufp->buffer+bufp->used < code) {
1748 PyErr_SetString(PyExc_SystemError, "Regex VM jump out of bounds (Cdummy_failure_jump code)");
1749 FREE_STATE(state);
1750 return -2;
1751 }
Guido van Rossumfaf49081997-07-15 01:47:08 +00001752 goto continue_matching;
1753 }
1754 case Cfailure_jump:
1755 {
1756 a = (unsigned char)*code++;
1757 a |= (unsigned char)*code++ << 8;
1758 a = (int)SHORT(a);
Guido van Rossum95e80531997-08-13 22:34:14 +00001759 if (code+a<bufp->buffer || bufp->buffer+bufp->used < code+a) {
1760 PyErr_SetString(PyExc_SystemError, "Regex VM jump out of bounds (Cfailure_jump)");
1761 FREE_STATE(state);
1762 return -2;
1763 }
Guido van Rossumfaf49081997-07-15 01:47:08 +00001764 PUSH_FAILURE(state, code + a, text, goto error);
1765 goto continue_matching;
1766 }
1767 case Crepeat1:
1768 {
Guido van Rossum95e80531997-08-13 22:34:14 +00001769 unsigned char *pinst;
Guido van Rossumfaf49081997-07-15 01:47:08 +00001770 a = (unsigned char)*code++;
1771 a |= (unsigned char)*code++ << 8;
1772 a = (int)SHORT(a);
1773 pinst = code + a;
Guido van Rossum95e80531997-08-13 22:34:14 +00001774 if (pinst<bufp->buffer || bufp->buffer+bufp->used<pinst) {
1775 PyErr_SetString(PyExc_SystemError, "Regex VM jump out of bounds (Crepeat1)");
1776 FREE_STATE(state);
1777 return -2;
1778 }
Guido van Rossumfaf49081997-07-15 01:47:08 +00001779 /* pinst is sole instruction in loop, and it matches a
1780 * single character. Since Crepeat1 was originally a
1781 * Cupdate_failure_jump, we also know that backtracking
1782 * is useless: so long as the single-character
1783 * expression matches, it must be used. Also, in the
1784 * case of +, we've already matched one character, so +
1785 * can't fail: nothing here can cause a failure. */
1786 switch (*pinst++)
1787 {
1788 case Cset:
Guido van Rossum95e80531997-08-13 22:34:14 +00001789 {
1790 if (translate)
Guido van Rossumfaf49081997-07-15 01:47:08 +00001791 {
1792 while (text < textend)
1793 {
1794 ch = translate[(unsigned char)*text];
1795 if (pinst[ch/8] & (1<<(ch & 7)))
1796 text++;
1797 else
1798 break;
1799 }
1800 }
1801 else
1802 {
1803 while (text < textend)
1804 {
1805 ch = (unsigned char)*text;
1806 if (pinst[ch/8] & (1<<(ch & 7)))
1807 text++;
1808 else
1809 break;
1810 }
1811 }
1812 break;
Guido van Rossum95e80531997-08-13 22:34:14 +00001813 }
Guido van Rossumfaf49081997-07-15 01:47:08 +00001814 case Cexact:
1815 {
1816 ch = (unsigned char)*pinst;
1817 if (translate)
1818 {
1819 while (text < textend &&
1820 translate[(unsigned char)*text] == ch)
1821 text++;
1822 }
1823 else
1824 {
1825 while (text < textend && (unsigned char)*text == ch)
1826 text++;
1827 }
1828 break;
1829 }
1830 case Canychar:
1831 {
1832 while (text < textend && (unsigned char)*text != '\n')
1833 text++;
1834 break;
1835 }
1836 case Csyntaxspec:
1837 {
1838 a = (unsigned char)*pinst;
1839 if (translate)
1840 {
1841 while (text < textend &&
Guido van Rossume59d3f81997-12-02 20:39:23 +00001842 (SYNTAX(translate[*text]) & a) )
Guido van Rossumfaf49081997-07-15 01:47:08 +00001843 text++;
1844 }
1845 else
1846 {
Guido van Rossume59d3f81997-12-02 20:39:23 +00001847 while (text < textend && (SYNTAX(*text) & a) )
Guido van Rossumfaf49081997-07-15 01:47:08 +00001848 text++;
1849 }
1850 break;
1851 }
1852 case Cnotsyntaxspec:
1853 {
1854 a = (unsigned char)*pinst;
1855 if (translate)
1856 {
1857 while (text < textend &&
Guido van Rossume59d3f81997-12-02 20:39:23 +00001858 !(SYNTAX(translate[*text]) & a) )
Guido van Rossumfaf49081997-07-15 01:47:08 +00001859 text++;
1860 }
1861 else
1862 {
Guido van Rossume59d3f81997-12-02 20:39:23 +00001863 while (text < textend && !(SYNTAX(*text) & a) )
Guido van Rossumfaf49081997-07-15 01:47:08 +00001864 text++;
1865 }
1866 break;
1867 }
1868 default:
1869 {
Guido van Rossum95e80531997-08-13 22:34:14 +00001870 FREE_STATE(state);
1871 PyErr_SetString(PyExc_SystemError, "Unknown regex opcode: memory corrupted?");
1872 return -2;
Guido van Rossumfaf49081997-07-15 01:47:08 +00001873 /*NOTREACHED*/
1874 }
1875 }
1876 /* due to the funky way + and * are compiled, the top
1877 * failure- stack entry at this point is actually a
1878 * success entry -- update it & pop it */
1879 UPDATE_FAILURE(state, text, goto error);
1880 goto fail; /* i.e., succeed <wink/sigh> */
1881 }
1882 case Cbegbuf:
1883 {
1884 if (text == textstart)
1885 goto continue_matching;
1886 goto fail;
1887 }
1888 case Cendbuf:
1889 {
1890 if (text == textend)
1891 goto continue_matching;
1892 goto fail;
1893 }
1894 case Cwordbeg:
1895 {
1896 if (text == textend)
1897 goto fail;
Guido van Rossum95e80531997-08-13 22:34:14 +00001898 if (!(SYNTAX(*text) & Sword))
Guido van Rossumfaf49081997-07-15 01:47:08 +00001899 goto fail;
1900 if (text == textstart)
1901 goto continue_matching;
Guido van Rossum74fb3031997-07-17 22:41:38 +00001902 if (!(SYNTAX(text[-1]) & Sword))
Guido van Rossumfaf49081997-07-15 01:47:08 +00001903 goto continue_matching;
1904 goto fail;
1905 }
1906 case Cwordend:
1907 {
1908 if (text == textstart)
1909 goto fail;
Guido van Rossum74fb3031997-07-17 22:41:38 +00001910 if (!(SYNTAX(text[-1]) & Sword))
Guido van Rossumfaf49081997-07-15 01:47:08 +00001911 goto fail;
1912 if (text == textend)
1913 goto continue_matching;
Guido van Rossum95e80531997-08-13 22:34:14 +00001914 if (!(SYNTAX(*text) & Sword))
1915 goto continue_matching;
1916 goto fail;
Guido van Rossumfaf49081997-07-15 01:47:08 +00001917 }
1918 case Cwordbound:
1919 {
1920 /* Note: as in gnu regexp, this also matches at the
1921 * beginning and end of buffer. */
Guido van Rossum004c1e11997-05-09 02:35:58 +00001922
Guido van Rossumfaf49081997-07-15 01:47:08 +00001923 if (text == textstart || text == textend)
1924 goto continue_matching;
Guido van Rossum74fb3031997-07-17 22:41:38 +00001925 if ((SYNTAX(text[-1]) & Sword) ^ (SYNTAX(*text) & Sword))
Guido van Rossumfaf49081997-07-15 01:47:08 +00001926 goto continue_matching;
1927 goto fail;
1928 }
1929 case Cnotwordbound:
1930 {
1931 /* Note: as in gnu regexp, this never matches at the
1932 * beginning and end of buffer. */
1933 if (text == textstart || text == textend)
1934 goto fail;
Guido van Rossum74fb3031997-07-17 22:41:38 +00001935 if (!((SYNTAX(text[-1]) & Sword) ^ (SYNTAX(*text) & Sword)))
Guido van Rossum53665e51997-08-15 15:45:25 +00001936 goto continue_matching;
1937 goto fail;
Guido van Rossumfaf49081997-07-15 01:47:08 +00001938 }
1939 case Csyntaxspec:
1940 {
1941 NEXTCHAR(ch);
Guido van Rossum74fb3031997-07-17 22:41:38 +00001942 if (!(SYNTAX(ch) & (unsigned char)*code++))
Guido van Rossumfaf49081997-07-15 01:47:08 +00001943 goto fail;
1944 goto continue_matching;
1945 }
1946 case Cnotsyntaxspec:
1947 {
1948 NEXTCHAR(ch);
Guido van Rossum74fb3031997-07-17 22:41:38 +00001949 if (SYNTAX(ch) & (unsigned char)*code++)
Guido van Rossum95e80531997-08-13 22:34:14 +00001950 goto fail;
Guido van Rossumfaf49081997-07-15 01:47:08 +00001951 goto continue_matching;
1952 }
1953 default:
1954 {
Guido van Rossum95e80531997-08-13 22:34:14 +00001955 FREE_STATE(state);
1956 PyErr_SetString(PyExc_SystemError, "Unknown regex opcode: memory corrupted?");
1957 return -2;
Guido van Rossumfaf49081997-07-15 01:47:08 +00001958 /*NOTREACHED*/
1959 }
1960 }
Guido van Rossum95e80531997-08-13 22:34:14 +00001961
1962
Guido van Rossum004c1e11997-05-09 02:35:58 +00001963
Guido van Rossum3b1a57a1992-01-27 16:47:46 +00001964#if 0 /* This line is never reached --Guido */
Guido van Rossumfaf49081997-07-15 01:47:08 +00001965 abort();
Guido van Rossum5f21dd11992-01-19 16:49:14 +00001966#endif
Guido van Rossumfaf49081997-07-15 01:47:08 +00001967 /*
1968 *NOTREACHED
1969 */
Guido van Rossum95e80531997-08-13 22:34:14 +00001970
1971 /* Using "break;" in the above switch statement is equivalent to "goto fail;" */
Guido van Rossum004c1e11997-05-09 02:35:58 +00001972 fail:
Guido van Rossumfaf49081997-07-15 01:47:08 +00001973 POP_FAILURE(state, code, text, goto done_matching, goto error);
1974 goto continue_matching;
Guido van Rossum004c1e11997-05-09 02:35:58 +00001975
1976 done_matching:
1977/* if(translated != NULL) */
1978/* free(translated); */
Guido van Rossumfaf49081997-07-15 01:47:08 +00001979 FREE_STATE(state);
1980 return -1;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001981
Guido van Rossum004c1e11997-05-09 02:35:58 +00001982 error:
1983/* if (translated != NULL) */
1984/* free(translated); */
Guido van Rossumfaf49081997-07-15 01:47:08 +00001985 FREE_STATE(state);
1986 return -2;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001987}
Guido van Rossum95e80531997-08-13 22:34:14 +00001988
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001989
1990#undef PREFETCH
1991#undef NEXTCHAR
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001992
Peter Schneider-Kamp7d0c71a2000-07-10 13:05:29 +00001993int re_search(regexp_t bufp, unsigned char *string, int size, int pos,
1994 int range, regexp_registers_t regs)
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001995{
Guido van Rossum95e80531997-08-13 22:34:14 +00001996 unsigned char *fastmap;
1997 unsigned char *translate;
1998 unsigned char *text;
1999 unsigned char *partstart;
2000 unsigned char *partend;
Guido van Rossumfaf49081997-07-15 01:47:08 +00002001 int dir;
2002 int ret;
Guido van Rossum95e80531997-08-13 22:34:14 +00002003 unsigned char anchor;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00002004
Guido van Rossumfaf49081997-07-15 01:47:08 +00002005 assert(size >= 0 && pos >= 0);
2006 assert(pos + range >= 0 && pos + range <= size); /* Bugfix by ylo */
Guido van Rossumb674c3b1992-01-19 16:32:47 +00002007
Guido van Rossumfaf49081997-07-15 01:47:08 +00002008 fastmap = bufp->fastmap;
2009 translate = bufp->translate;
Guido van Rossum95e80531997-08-13 22:34:14 +00002010 if (fastmap && !bufp->fastmap_accurate) {
2011 re_compile_fastmap(bufp);
2012 if (PyErr_Occurred()) return -2;
2013 }
2014
Guido van Rossumfaf49081997-07-15 01:47:08 +00002015 anchor = bufp->anchor;
2016 if (bufp->can_be_null == 1) /* can_be_null == 2: can match null at eob */
2017 fastmap = NULL;
Guido van Rossum004c1e11997-05-09 02:35:58 +00002018
Guido van Rossumfaf49081997-07-15 01:47:08 +00002019 if (range < 0)
2020 {
2021 dir = -1;
2022 range = -range;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00002023 }
Guido van Rossum004c1e11997-05-09 02:35:58 +00002024 else
Guido van Rossumfaf49081997-07-15 01:47:08 +00002025 dir = 1;
2026
Guido van Rossum730806d1998-04-10 22:27:42 +00002027 if (anchor == 2) {
Guido van Rossumfaf49081997-07-15 01:47:08 +00002028 if (pos != 0)
2029 return -1;
2030 else
2031 range = 0;
Guido van Rossum730806d1998-04-10 22:27:42 +00002032 }
Guido van Rossumfaf49081997-07-15 01:47:08 +00002033
2034 for (; range >= 0; range--, pos += dir)
2035 {
2036 if (fastmap)
2037 {
2038 if (dir == 1)
2039 { /* searching forwards */
2040
2041 text = string + pos;
2042 partend = string + size;
2043 partstart = text;
2044 if (translate)
2045 while (text != partend &&
2046 !fastmap[(unsigned char) translate[(unsigned char)*text]])
2047 text++;
2048 else
2049 while (text != partend && !fastmap[(unsigned char)*text])
2050 text++;
2051 pos += text - partstart;
2052 range -= text - partstart;
2053 if (pos == size && bufp->can_be_null == 0)
2054 return -1;
2055 }
2056 else
2057 { /* searching backwards */
2058 text = string + pos;
2059 partstart = string + pos - range;
2060 partend = text;
2061 if (translate)
2062 while (text != partstart &&
2063 !fastmap[(unsigned char)
2064 translate[(unsigned char)*text]])
2065 text--;
2066 else
2067 while (text != partstart &&
2068 !fastmap[(unsigned char)*text])
2069 text--;
2070 pos -= partend - text;
2071 range -= partend - text;
2072 }
2073 }
2074 if (anchor == 1)
2075 { /* anchored to begline */
2076 if (pos > 0 && (string[pos - 1] != '\n'))
2077 continue;
2078 }
2079 assert(pos >= 0 && pos <= size);
2080 ret = re_match(bufp, string, size, pos, regs);
2081 if (ret >= 0)
2082 return pos;
2083 if (ret == -2)
2084 return -2;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00002085 }
Guido van Rossumfaf49081997-07-15 01:47:08 +00002086 return -1;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00002087}
Guido van Rossum74fb3031997-07-17 22:41:38 +00002088
2089/*
2090** Local Variables:
2091** mode: c
2092** c-file-style: "python"
2093** End:
2094*/