blob: a90363a428c68d50c760efa87f288ab74adf8cb1 [file] [log] [blame]
Guido van Rossum004c1e11997-05-09 02:35:58 +00001/* regexpr.c
2 *
3 * Author: Tatu Ylonen <ylo@ngs.fi>
4 *
5 * Copyright (c) 1991 Tatu Ylonen, Espoo, Finland
6 *
7 * Permission to use, copy, modify, distribute, and sell this software
8 * and its documentation for any purpose is hereby granted without
9 * fee, provided that the above copyright notice appear in all copies.
10 * This software is provided "as is" without express or implied
11 * warranty.
12 *
13 * Created: Thu Sep 26 17:14:05 1991 ylo
14 * Last modified: Mon Nov 4 17:06:48 1991 ylo
15 * Ported to Think C: 19 Jan 1992 guido@cwi.nl
16 *
17 * This code draws many ideas from the regular expression packages by
18 * Henry Spencer of the University of Toronto and Richard Stallman of
19 * the Free Software Foundation.
20 *
21 * Emacs-specific code and syntax table code is almost directly borrowed
22 * from GNU regexp.
23 *
24 * Bugs fixed and lots of reorganization by Jeffrey C. Ollie, April
25 * 1997 Thanks for bug reports and ideas from Andrew Kuchling, Tim
26 * Peters, Guido van Rossum, Ka-Ping Yee, Sjoerd Mullender, and
27 * probably one or two others that I'm forgetting.
28 *
29 * $Id$ */
Guido van Rossumb674c3b1992-01-19 16:32:47 +000030
Guido van Rossum339cfa31996-08-08 19:12:37 +000031#include "config.h" /* For Win* specific redefinition of printf c.s. */
Guido van Rossum339cfa31996-08-08 19:12:37 +000032
Guido van Rossum004c1e11997-05-09 02:35:58 +000033#include "myproto.h" /* For PROTO macro --Guido */
Guido van Rossum3b1a57a1992-01-27 16:47:46 +000034
Guido van Rossumb674c3b1992-01-19 16:32:47 +000035#include <stdio.h>
Guido van Rossum95e80531997-08-13 22:34:14 +000036#include "Python.h"
Guido van Rossum004c1e11997-05-09 02:35:58 +000037
38#ifndef NDEBUG
39#define NDEBUG 1
40#endif
41
Guido van Rossumb674c3b1992-01-19 16:32:47 +000042#include <assert.h>
43#include "regexpr.h"
44
Guido van Rossum3b1a57a1992-01-27 16:47:46 +000045#ifdef THINK_C
46/* Think C on the Mac really needs these headers... --Guido */
47#include <stdlib.h>
48#include <string.h>
49#else
Guido van Rossum88661e81996-05-23 22:55:58 +000050#if defined(__STDC__) || defined(_MSC_VER)
Guido van Rossum9abc5391992-03-27 17:24:37 +000051/* Don't mess around, use the standard headers */
52#include <stdlib.h>
53#include <string.h>
54#else
Guido van Rossumb674c3b1992-01-19 16:32:47 +000055char *malloc();
56void free();
57char *realloc();
Guido van Rossum9abc5391992-03-27 17:24:37 +000058#endif /* __STDC__ */
59#endif /* THINK_C */
Guido van Rossumb674c3b1992-01-19 16:32:47 +000060
Guido van Rossumdb25f321997-07-10 14:31:32 +000061/* The original code blithely assumed that sizeof(short) == 2. Not
62 * always true. Original instances of "(short)x" were replaced by
63 * SHORT(x), where SHORT is #defined below. */
64
65#define SHORT(x) ((x) & 0x8000 ? (x) - 0x10000 : (x))
66
Guido van Rossum004c1e11997-05-09 02:35:58 +000067/* The stack implementation is taken from an idea by Andrew Kuchling.
68 * It's a doubly linked list of arrays. The advantages of this over a
69 * simple linked list are that the number of mallocs required are
70 * reduced. It also makes it possible to statically allocate enough
71 * space so that small patterns don't ever need to call malloc.
72 *
73 * The advantages over a single array is that is periodically
74 * realloced when more space is needed is that we avoid ever copying
75 * the stack. */
76
77/* item_t is the basic stack element. Defined as a union of
78 * structures so that both registers, failure points, and counters can
79 * be pushed/popped from the stack. There's nothing built into the
80 * item to keep track of whether a certain stack item is a register, a
81 * failure point, or a counter. */
82
83typedef union item_t
84{
Guido van Rossumdb25f321997-07-10 14:31:32 +000085 struct
86 {
87 int num;
88 int level;
Guido van Rossum95e80531997-08-13 22:34:14 +000089 unsigned char *start;
90 unsigned char *end;
Guido van Rossumdb25f321997-07-10 14:31:32 +000091 } reg;
92 struct
93 {
94 int count;
95 int level;
96 int phantom;
Guido van Rossum95e80531997-08-13 22:34:14 +000097 unsigned char *code;
98 unsigned char *text;
Guido van Rossumdb25f321997-07-10 14:31:32 +000099 } fail;
100 struct
101 {
102 int num;
103 int level;
104 int count;
105 } cntr;
Guido van Rossum004c1e11997-05-09 02:35:58 +0000106} item_t;
107
108#define STACK_PAGE_SIZE 256
109#define NUM_REGISTERS 256
110
111/* A 'page' of stack items. */
112
113typedef struct item_page_t
114{
Guido van Rossumdb25f321997-07-10 14:31:32 +0000115 item_t items[STACK_PAGE_SIZE];
116 struct item_page_t *prev;
117 struct item_page_t *next;
Guido van Rossum004c1e11997-05-09 02:35:58 +0000118} item_page_t;
119
120
121typedef struct match_state
122{
Guido van Rossumdb25f321997-07-10 14:31:32 +0000123 /* The number of registers that have been pushed onto the stack
124 * since the last failure point. */
Guido van Rossum004c1e11997-05-09 02:35:58 +0000125
Guido van Rossumdb25f321997-07-10 14:31:32 +0000126 int count;
127
128 /* Used to control when registers need to be pushed onto the
129 * stack. */
130
131 int level;
132
133 /* The number of failure points on the stack. */
134
135 int point;
136
137 /* Storage for the registers. Each register consists of two
138 * pointers to characters. So register N is represented as
139 * start[N] and end[N]. The pointers must be converted to
140 * offsets from the beginning of the string before returning the
141 * registers to the calling program. */
142
Guido van Rossum95e80531997-08-13 22:34:14 +0000143 unsigned char *start[NUM_REGISTERS];
144 unsigned char *end[NUM_REGISTERS];
Guido van Rossumdb25f321997-07-10 14:31:32 +0000145
146 /* Keeps track of whether a register has changed recently. */
147
148 int changed[NUM_REGISTERS];
149
150 /* Structure to encapsulate the stack. */
151 struct
152 {
153 /* index into the curent page. If index == 0 and you need
154 * to pop an item, move to the previous page and set index
155 * = STACK_PAGE_SIZE - 1. Otherwise decrement index to
156 * push a page. If index == STACK_PAGE_SIZE and you need
157 * to push a page move to the next page and set index =
158 * 0. If there is no new next page, allocate a new page
159 * and link it in. Otherwise, increment index to push a
160 * page. */
161
162 int index;
163 item_page_t *current; /* Pointer to the current page. */
164 item_page_t first; /* First page is statically allocated. */
165 } stack;
Guido van Rossum004c1e11997-05-09 02:35:58 +0000166} match_state;
167
Guido van Rossumdb25f321997-07-10 14:31:32 +0000168/* Initialize a state object */
169
170/* #define NEW_STATE(state) \ */
171/* memset(&state, 0, (void *)(&state.stack) - (void *)(&state)); \ */
172/* state.stack.current = &state.stack.first; \ */
173/* state.stack.first.prev = NULL; \ */
174/* state.stack.first.next = NULL; \ */
175/* state.stack.index = 0; \ */
176/* state.level = 1 */
177
178#define NEW_STATE(state, nregs) \
179{ \
180 int i; \
181 for (i = 0; i < nregs; i++) \
182 { \
183 state.start[i] = NULL; \
184 state.end[i] = NULL; \
185 state.changed[i] = 0; \
186 } \
187 state.stack.current = &state.stack.first; \
188 state.stack.first.prev = NULL; \
189 state.stack.first.next = NULL; \
190 state.stack.index = 0; \
191 state.level = 1; \
192 state.count = 0; \
193 state.level = 0; \
194 state.point = 0; \
195}
196
197/* Free any memory that might have been malloc'd */
198
199#define FREE_STATE(state) \
200while(state.stack.first.next != NULL) \
201{ \
202 state.stack.current = state.stack.first.next; \
203 state.stack.first.next = state.stack.current->next; \
204 free(state.stack.current); \
205}
206
Guido van Rossum004c1e11997-05-09 02:35:58 +0000207/* Discard the top 'count' stack items. */
208
209#define STACK_DISCARD(stack, count, on_error) \
210stack.index -= count; \
211while (stack.index < 0) \
212{ \
Guido van Rossumfaf49081997-07-15 01:47:08 +0000213 if (stack.current->prev == NULL) \
214 on_error; \
215 stack.current = stack.current->prev; \
216 stack.index += STACK_PAGE_SIZE; \
Guido van Rossum004c1e11997-05-09 02:35:58 +0000217}
218
219/* Store a pointer to the previous item on the stack. Used to pop an
220 * item off of the stack. */
221
222#define STACK_PREV(stack, top, on_error) \
223if (stack.index == 0) \
224{ \
Guido van Rossumfaf49081997-07-15 01:47:08 +0000225 if (stack.current->prev == NULL) \
226 on_error; \
227 stack.current = stack.current->prev; \
228 stack.index = STACK_PAGE_SIZE - 1; \
Guido van Rossum004c1e11997-05-09 02:35:58 +0000229} \
230else \
Guido van Rossumfaf49081997-07-15 01:47:08 +0000231{ \
232 stack.index--; \
233} \
Guido van Rossum004c1e11997-05-09 02:35:58 +0000234top = &(stack.current->items[stack.index])
235
236/* Store a pointer to the next item on the stack. Used to push an item
237 * on to the stack. */
238
239#define STACK_NEXT(stack, top, on_error) \
240if (stack.index == STACK_PAGE_SIZE) \
241{ \
Guido van Rossumfaf49081997-07-15 01:47:08 +0000242 if (stack.current->next == NULL) \
243 { \
244 stack.current->next = (item_page_t *)malloc(sizeof(item_page_t)); \
245 if (stack.current->next == NULL) \
246 on_error; \
247 stack.current->next->prev = stack.current; \
248 stack.current->next->next = NULL; \
249 } \
250 stack.current = stack.current->next; \
251 stack.index = 0; \
Guido van Rossum004c1e11997-05-09 02:35:58 +0000252} \
253top = &(stack.current->items[stack.index++])
254
255/* Store a pointer to the item that is 'count' items back in the
256 * stack. STACK_BACK(stack, top, 1, on_error) is equivalent to
257 * STACK_TOP(stack, top, on_error). */
258
259#define STACK_BACK(stack, top, count, on_error) \
260{ \
Guido van Rossumfaf49081997-07-15 01:47:08 +0000261 int index; \
262 item_page_t *current; \
263 current = stack.current; \
264 index = stack.index - (count); \
265 while (index < 0) \
266 { \
267 if (current->prev == NULL) \
268 on_error; \
269 current = current->prev; \
270 index += STACK_PAGE_SIZE; \
271 } \
272 top = &(current->items[index]); \
Guido van Rossum004c1e11997-05-09 02:35:58 +0000273}
274
275/* Store a pointer to the top item on the stack. Execute the
276 * 'on_error' code if there are no items on the stack. */
277
278#define STACK_TOP(stack, top, on_error) \
279if (stack.index == 0) \
280{ \
Guido van Rossumfaf49081997-07-15 01:47:08 +0000281 if (stack.current->prev == NULL) \
282 on_error; \
283 top = &(stack.current->prev->items[STACK_PAGE_SIZE - 1]); \
Guido van Rossum004c1e11997-05-09 02:35:58 +0000284} \
285else \
Guido van Rossumfaf49081997-07-15 01:47:08 +0000286{ \
287 top = &(stack.current->items[stack.index - 1]); \
288}
Guido van Rossum004c1e11997-05-09 02:35:58 +0000289
290/* Test to see if the stack is empty */
291
292#define STACK_EMPTY(stack) ((stack.index == 0) && \
293 (stack.current->prev == NULL))
294
Guido van Rossum004c1e11997-05-09 02:35:58 +0000295/* Return the start of register 'reg' */
296
297#define GET_REG_START(state, reg) (state.start[reg])
298
299/* Return the end of register 'reg' */
300
301#define GET_REG_END(state, reg) (state.end[reg])
302
303/* Set the start of register 'reg'. If the state of the register needs
304 * saving, push it on the stack. */
305
306#define SET_REG_START(state, reg, text, on_error) \
307if(state.changed[reg] < state.level) \
308{ \
Guido van Rossumfaf49081997-07-15 01:47:08 +0000309 item_t *item; \
310 STACK_NEXT(state.stack, item, on_error); \
311 item->reg.num = reg; \
312 item->reg.start = state.start[reg]; \
313 item->reg.end = state.end[reg]; \
314 item->reg.level = state.changed[reg]; \
315 state.changed[reg] = state.level; \
316 state.count++; \
Guido van Rossum004c1e11997-05-09 02:35:58 +0000317} \
318state.start[reg] = text
319
320/* Set the end of register 'reg'. If the state of the register needs
321 * saving, push it on the stack. */
322
323#define SET_REG_END(state, reg, text, on_error) \
324if(state.changed[reg] < state.level) \
325{ \
Guido van Rossumfaf49081997-07-15 01:47:08 +0000326 item_t *item; \
327 STACK_NEXT(state.stack, item, on_error); \
328 item->reg.num = reg; \
329 item->reg.start = state.start[reg]; \
330 item->reg.end = state.end[reg]; \
331 item->reg.level = state.changed[reg]; \
332 state.changed[reg] = state.level; \
333 state.count++; \
Guido van Rossum004c1e11997-05-09 02:35:58 +0000334} \
335state.end[reg] = text
336
337#define PUSH_FAILURE(state, xcode, xtext, on_error) \
338{ \
Guido van Rossumfaf49081997-07-15 01:47:08 +0000339 item_t *item; \
340 STACK_NEXT(state.stack, item, on_error); \
341 item->fail.code = xcode; \
342 item->fail.text = xtext; \
343 item->fail.count = state.count; \
344 item->fail.level = state.level; \
345 item->fail.phantom = 0; \
346 state.count = 0; \
347 state.level++; \
348 state.point++; \
Guido van Rossum004c1e11997-05-09 02:35:58 +0000349}
350
351/* Update the last failure point with a new position in the text. */
352
Guido van Rossum004c1e11997-05-09 02:35:58 +0000353#define UPDATE_FAILURE(state, xtext, on_error) \
354{ \
Guido van Rossumfaf49081997-07-15 01:47:08 +0000355 item_t *item; \
356 STACK_BACK(state.stack, item, state.count + 1, on_error); \
357 if (!item->fail.phantom) \
358 { \
359 item_t *item2; \
360 STACK_NEXT(state.stack, item2, on_error); \
361 item2->fail.code = item->fail.code; \
362 item2->fail.text = xtext; \
363 item2->fail.count = state.count; \
364 item2->fail.level = state.level; \
365 item2->fail.phantom = 1; \
366 state.count = 0; \
367 state.level++; \
368 state.point++; \
369 } \
370 else \
371 { \
372 STACK_DISCARD(state.stack, state.count, on_error); \
373 STACK_TOP(state.stack, item, on_error); \
374 item->fail.text = xtext; \
375 state.count = 0; \
376 state.level++; \
377 } \
Guido van Rossum004c1e11997-05-09 02:35:58 +0000378}
379
380#define POP_FAILURE(state, xcode, xtext, on_empty, on_error) \
381{ \
Guido van Rossumfaf49081997-07-15 01:47:08 +0000382 item_t *item; \
383 do \
384 { \
385 while(state.count > 0) \
386 { \
387 STACK_PREV(state.stack, item, on_error); \
388 state.start[item->reg.num] = item->reg.start; \
389 state.end[item->reg.num] = item->reg.end; \
390 state.changed[item->reg.num] = item->reg.level; \
391 state.count--; \
392 } \
393 STACK_PREV(state.stack, item, on_empty); \
394 xcode = item->fail.code; \
395 xtext = item->fail.text; \
396 state.count = item->fail.count; \
397 state.level = item->fail.level; \
398 state.point--; \
399 } \
400 while (item->fail.text == NULL); \
Guido van Rossum004c1e11997-05-09 02:35:58 +0000401}
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000402
403enum regexp_compiled_ops /* opcodes for compiled regexp */
404{
Guido van Rossumfaf49081997-07-15 01:47:08 +0000405 Cend, /* end of pattern reached */
406 Cbol, /* beginning of line */
407 Ceol, /* end of line */
408 Cset, /* character set. Followed by 32 bytes of set. */
409 Cexact, /* followed by a byte to match */
410 Canychar, /* matches any character except newline */
411 Cstart_memory, /* set register start addr (followed by reg number) */
412 Cend_memory, /* set register end addr (followed by reg number) */
413 Cmatch_memory, /* match a duplicate of reg contents (regnum follows)*/
414 Cjump, /* followed by two bytes (lsb,msb) of displacement. */
415 Cstar_jump, /* will change to jump/update_failure_jump at runtime */
416 Cfailure_jump, /* jump to addr on failure */
417 Cupdate_failure_jump, /* update topmost failure point and jump */
418 Cdummy_failure_jump, /* push a dummy failure point and jump */
419 Cbegbuf, /* match at beginning of buffer */
420 Cendbuf, /* match at end of buffer */
421 Cwordbeg, /* match at beginning of word */
422 Cwordend, /* match at end of word */
423 Cwordbound, /* match if at word boundary */
424 Cnotwordbound, /* match if not at word boundary */
425 Csyntaxspec, /* matches syntax code (1 byte follows) */
Guido van Rossum95e80531997-08-13 22:34:14 +0000426 Cnotsyntaxspec, /* matches if syntax code does not match (1 byte follows) */
Guido van Rossumfaf49081997-07-15 01:47:08 +0000427 Crepeat1
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000428};
429
430enum regexp_syntax_op /* syntax codes for plain and quoted characters */
431{
Guido van Rossumfaf49081997-07-15 01:47:08 +0000432 Rend, /* special code for end of regexp */
433 Rnormal, /* normal character */
434 Ranychar, /* any character except newline */
435 Rquote, /* the quote character */
436 Rbol, /* match beginning of line */
437 Reol, /* match end of line */
438 Roptional, /* match preceding expression optionally */
439 Rstar, /* match preceding expr zero or more times */
440 Rplus, /* match preceding expr one or more times */
441 Ror, /* match either of alternatives */
442 Ropenpar, /* opening parenthesis */
443 Rclosepar, /* closing parenthesis */
444 Rmemory, /* match memory register */
445 Rextended_memory, /* \vnn to match registers 10-99 */
446 Ropenset, /* open set. Internal syntax hard-coded below. */
447 /* the following are gnu extensions to "normal" regexp syntax */
448 Rbegbuf, /* beginning of buffer */
449 Rendbuf, /* end of buffer */
450 Rwordchar, /* word character */
451 Rnotwordchar, /* not word character */
452 Rwordbeg, /* beginning of word */
453 Rwordend, /* end of word */
454 Rwordbound, /* word bound */
455 Rnotwordbound, /* not word bound */
456 Rnum_ops
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000457};
458
459static int re_compile_initialized = 0;
460static int regexp_syntax = 0;
Guido van Rossumb6775db1994-08-01 11:34:53 +0000461int re_syntax = 0; /* Exported copy of regexp_syntax */
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000462static unsigned char regexp_plain_ops[256];
463static unsigned char regexp_quoted_ops[256];
464static unsigned char regexp_precedences[Rnum_ops];
465static int regexp_context_indep_ops;
466static int regexp_ansi_sequences;
467
468#define NUM_LEVELS 5 /* number of precedence levels in use */
469#define MAX_NESTING 100 /* max nesting level of operators */
470
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000471#define SYNTAX(ch) re_syntax_table[(unsigned char)(ch)]
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000472
Guido van Rossum95e80531997-08-13 22:34:14 +0000473unsigned char re_syntax_table[256];
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000474
Guido van Rossum74fb3031997-07-17 22:41:38 +0000475void re_compile_initialize(void)
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000476{
Guido van Rossumfaf49081997-07-15 01:47:08 +0000477 int a;
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000478
Guido van Rossumfaf49081997-07-15 01:47:08 +0000479 static int syntax_table_inited = 0;
Guido van Rossum74fb3031997-07-17 22:41:38 +0000480
Guido van Rossumfaf49081997-07-15 01:47:08 +0000481 if (!syntax_table_inited)
482 {
483 syntax_table_inited = 1;
484 memset(re_syntax_table, 0, 256);
485 for (a = 'a'; a <= 'z'; a++)
486 re_syntax_table[a] = Sword;
487 for (a = 'A'; a <= 'Z'; a++)
488 re_syntax_table[a] = Sword;
489 for (a = '0'; a <= '9'; a++)
Guido van Rossum52d68321997-08-13 03:21:14 +0000490 re_syntax_table[a] = Sword | Sdigit | Shexdigit;
491 for (a = '0'; a <= '7'; a++)
492 re_syntax_table[a] |= Soctaldigit;
493 for (a = 'A'; a <= 'F'; a++)
494 re_syntax_table[a] |= Shexdigit;
495 for (a = 'a'; a <= 'f'; a++)
496 re_syntax_table[a] |= Shexdigit;
Guido van Rossum74fb3031997-07-17 22:41:38 +0000497 re_syntax_table['_'] = Sword;
498 for (a = 9; a <= 13; a++)
499 re_syntax_table[a] = Swhitespace;
500 re_syntax_table[' '] = Swhitespace;
Guido van Rossumfaf49081997-07-15 01:47:08 +0000501 }
502 re_compile_initialized = 1;
503 for (a = 0; a < 256; a++)
504 {
505 regexp_plain_ops[a] = Rnormal;
506 regexp_quoted_ops[a] = Rnormal;
507 }
508 for (a = '0'; a <= '9'; a++)
509 regexp_quoted_ops[a] = Rmemory;
510 regexp_plain_ops['\134'] = Rquote;
511 if (regexp_syntax & RE_NO_BK_PARENS)
512 {
513 regexp_plain_ops['('] = Ropenpar;
514 regexp_plain_ops[')'] = Rclosepar;
515 }
516 else
517 {
518 regexp_quoted_ops['('] = Ropenpar;
519 regexp_quoted_ops[')'] = Rclosepar;
520 }
521 if (regexp_syntax & RE_NO_BK_VBAR)
522 regexp_plain_ops['\174'] = Ror;
523 else
524 regexp_quoted_ops['\174'] = Ror;
525 regexp_plain_ops['*'] = Rstar;
526 if (regexp_syntax & RE_BK_PLUS_QM)
527 {
528 regexp_quoted_ops['+'] = Rplus;
529 regexp_quoted_ops['?'] = Roptional;
530 }
531 else
532 {
533 regexp_plain_ops['+'] = Rplus;
534 regexp_plain_ops['?'] = Roptional;
535 }
536 if (regexp_syntax & RE_NEWLINE_OR)
537 regexp_plain_ops['\n'] = Ror;
538 regexp_plain_ops['\133'] = Ropenset;
539 regexp_plain_ops['\136'] = Rbol;
540 regexp_plain_ops['$'] = Reol;
541 regexp_plain_ops['.'] = Ranychar;
542 if (!(regexp_syntax & RE_NO_GNU_EXTENSIONS))
543 {
544 regexp_quoted_ops['w'] = Rwordchar;
545 regexp_quoted_ops['W'] = Rnotwordchar;
546 regexp_quoted_ops['<'] = Rwordbeg;
547 regexp_quoted_ops['>'] = Rwordend;
548 regexp_quoted_ops['b'] = Rwordbound;
549 regexp_quoted_ops['B'] = Rnotwordbound;
550 regexp_quoted_ops['`'] = Rbegbuf;
551 regexp_quoted_ops['\''] = Rendbuf;
552 }
553 if (regexp_syntax & RE_ANSI_HEX)
554 regexp_quoted_ops['v'] = Rextended_memory;
555 for (a = 0; a < Rnum_ops; a++)
556 regexp_precedences[a] = 4;
557 if (regexp_syntax & RE_TIGHT_VBAR)
558 {
559 regexp_precedences[Ror] = 3;
560 regexp_precedences[Rbol] = 2;
561 regexp_precedences[Reol] = 2;
562 }
563 else
564 {
565 regexp_precedences[Ror] = 2;
566 regexp_precedences[Rbol] = 3;
567 regexp_precedences[Reol] = 3;
568 }
569 regexp_precedences[Rclosepar] = 1;
570 regexp_precedences[Rend] = 0;
571 regexp_context_indep_ops = (regexp_syntax & RE_CONTEXT_INDEP_OPS) != 0;
572 regexp_ansi_sequences = (regexp_syntax & RE_ANSI_HEX) != 0;
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000573}
574
Guido van Rossum004c1e11997-05-09 02:35:58 +0000575int re_set_syntax(int syntax)
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000576{
Guido van Rossumfaf49081997-07-15 01:47:08 +0000577 int ret;
578
579 ret = regexp_syntax;
580 regexp_syntax = syntax;
581 re_syntax = syntax; /* Exported copy */
582 re_compile_initialize();
583 return ret;
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000584}
585
Guido van Rossum004c1e11997-05-09 02:35:58 +0000586static int hex_char_to_decimal(int ch)
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000587{
Guido van Rossumfaf49081997-07-15 01:47:08 +0000588 if (ch >= '0' && ch <= '9')
589 return ch - '0';
590 if (ch >= 'a' && ch <= 'f')
591 return ch - 'a' + 10;
592 if (ch >= 'A' && ch <= 'F')
593 return ch - 'A' + 10;
594 return 16;
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000595}
596
Guido van Rossum95e80531997-08-13 22:34:14 +0000597static void re_compile_fastmap_aux(unsigned char *code,
Guido van Rossum004c1e11997-05-09 02:35:58 +0000598 int pos,
Guido van Rossum95e80531997-08-13 22:34:14 +0000599 unsigned char *visited,
600 unsigned char *can_be_null,
601 unsigned char *fastmap)
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000602{
Guido van Rossumfaf49081997-07-15 01:47:08 +0000603 int a;
604 int b;
605 int syntaxcode;
606
607 if (visited[pos])
608 return; /* we have already been here */
609 visited[pos] = 1;
610 for (;;)
Guido van Rossum74fb3031997-07-17 22:41:38 +0000611 switch (code[pos++]) {
Guido van Rossumfaf49081997-07-15 01:47:08 +0000612 case Cend:
Guido van Rossum74fb3031997-07-17 22:41:38 +0000613 {
614 *can_be_null = 1;
615 return;
616 }
Guido van Rossumfaf49081997-07-15 01:47:08 +0000617 case Cbol:
618 case Cbegbuf:
619 case Cendbuf:
620 case Cwordbeg:
621 case Cwordend:
622 case Cwordbound:
623 case Cnotwordbound:
624 {
625 for (a = 0; a < 256; a++)
626 fastmap[a] = 1;
627 break;
628 }
629 case Csyntaxspec:
630 {
631 syntaxcode = code[pos++];
632 for (a = 0; a < 256; a++)
633 if (SYNTAX(a) == syntaxcode)
634 fastmap[a] = 1;
635 return;
636 }
637 case Cnotsyntaxspec:
638 {
639 syntaxcode = code[pos++];
640 for (a = 0; a < 256; a++)
641 if (SYNTAX(a) != syntaxcode)
642 fastmap[a] = 1;
643 return;
644 }
645 case Ceol:
646 {
647 fastmap['\n'] = 1;
648 if (*can_be_null == 0)
649 *can_be_null = 2; /* can match null, but only at end of buffer*/
650 return;
651 }
652 case Cset:
653 {
654 for (a = 0; a < 256/8; a++)
655 if (code[pos + a] != 0)
656 for (b = 0; b < 8; b++)
657 if (code[pos + a] & (1 << b))
658 fastmap[(a << 3) + b] = 1;
659 pos += 256/8;
660 return;
661 }
662 case Cexact:
663 {
664 fastmap[(unsigned char)code[pos]] = 1;
665 return;
666 }
667 case Canychar:
668 {
669 for (a = 0; a < 256; a++)
670 if (a != '\n')
671 fastmap[a] = 1;
672 return;
673 }
674 case Cstart_memory:
675 case Cend_memory:
676 {
677 pos++;
678 break;
679 }
680 case Cmatch_memory:
681 {
682 for (a = 0; a < 256; a++)
683 fastmap[a] = 1;
684 *can_be_null = 1;
685 return;
686 }
687 case Cjump:
688 case Cdummy_failure_jump:
689 case Cupdate_failure_jump:
690 case Cstar_jump:
691 {
692 a = (unsigned char)code[pos++];
693 a |= (unsigned char)code[pos++] << 8;
694 pos += (int)SHORT(a);
695 if (visited[pos])
696 {
697 /* argh... the regexp contains empty loops. This is not
698 good, as this may cause a failure stack overflow when
699 matching. Oh well. */
700 /* this path leads nowhere; pursue other paths. */
701 return;
702 }
703 visited[pos] = 1;
704 break;
705 }
706 case Cfailure_jump:
707 {
708 a = (unsigned char)code[pos++];
709 a |= (unsigned char)code[pos++] << 8;
710 a = pos + (int)SHORT(a);
711 re_compile_fastmap_aux(code, a, visited, can_be_null, fastmap);
712 break;
713 }
714 case Crepeat1:
715 {
716 pos += 2;
717 break;
718 }
719 default:
720 {
Guido van Rossum95e80531997-08-13 22:34:14 +0000721 PyErr_SetString(PyExc_SystemError, "Unknown regex opcode: memory corrupted?");
722 return;
Guido van Rossumfaf49081997-07-15 01:47:08 +0000723 /*NOTREACHED*/
724 }
725 }
Guido van Rossum004c1e11997-05-09 02:35:58 +0000726}
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000727
Guido van Rossum95e80531997-08-13 22:34:14 +0000728static int re_do_compile_fastmap(unsigned char *buffer,
Guido van Rossum004c1e11997-05-09 02:35:58 +0000729 int used,
730 int pos,
Guido van Rossum95e80531997-08-13 22:34:14 +0000731 unsigned char *can_be_null,
732 unsigned char *fastmap)
Guido van Rossum004c1e11997-05-09 02:35:58 +0000733{
Guido van Rossum95e80531997-08-13 22:34:14 +0000734 unsigned char small_visited[512], *visited;
Guido van Rossum004c1e11997-05-09 02:35:58 +0000735
Guido van Rossumfaf49081997-07-15 01:47:08 +0000736 if (used <= sizeof(small_visited))
737 visited = small_visited;
738 else
739 {
740 visited = malloc(used);
741 if (!visited)
742 return 0;
743 }
744 *can_be_null = 0;
745 memset(fastmap, 0, 256);
746 memset(visited, 0, used);
747 re_compile_fastmap_aux(buffer, pos, visited, can_be_null, fastmap);
748 if (visited != small_visited)
749 free(visited);
750 return 1;
Guido van Rossum004c1e11997-05-09 02:35:58 +0000751}
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000752
Guido van Rossum004c1e11997-05-09 02:35:58 +0000753void re_compile_fastmap(regexp_t bufp)
754{
Guido van Rossumfaf49081997-07-15 01:47:08 +0000755 if (!bufp->fastmap || bufp->fastmap_accurate)
756 return;
757 assert(bufp->used > 0);
758 if (!re_do_compile_fastmap(bufp->buffer,
759 bufp->used,
760 0,
761 &bufp->can_be_null,
762 bufp->fastmap))
763 return;
Guido van Rossum95e80531997-08-13 22:34:14 +0000764 if (PyErr_Occurred()) return;
Guido van Rossumfaf49081997-07-15 01:47:08 +0000765 if (bufp->buffer[0] == Cbol)
766 bufp->anchor = 1; /* begline */
767 else
768 if (bufp->buffer[0] == Cbegbuf)
769 bufp->anchor = 2; /* begbuf */
770 else
771 bufp->anchor = 0; /* none */
772 bufp->fastmap_accurate = 1;
Guido van Rossum004c1e11997-05-09 02:35:58 +0000773}
774
775/*
776 * star is coded as:
777 * 1: failure_jump 2
778 * ... code for operand of star
779 * star_jump 1
780 * 2: ... code after star
781 *
782 * We change the star_jump to update_failure_jump if we can determine
783 * that it is safe to do so; otherwise we change it to an ordinary
784 * jump.
785 *
786 * plus is coded as
787 *
788 * jump 2
789 * 1: failure_jump 3
790 * 2: ... code for operand of plus
791 * star_jump 1
792 * 3: ... code after plus
793 *
794 * For star_jump considerations this is processed identically to star.
795 *
796 */
797
Guido van Rossum95e80531997-08-13 22:34:14 +0000798static int re_optimize_star_jump(regexp_t bufp, unsigned char *code)
Guido van Rossum004c1e11997-05-09 02:35:58 +0000799{
Guido van Rossum95e80531997-08-13 22:34:14 +0000800 unsigned char map[256];
801 unsigned char can_be_null;
802 unsigned char *p1;
803 unsigned char *p2;
804 unsigned char ch;
Guido van Rossumfaf49081997-07-15 01:47:08 +0000805 int a;
806 int b;
807 int num_instructions = 0;
Guido van Rossum95e80531997-08-13 22:34:14 +0000808
Guido van Rossumfaf49081997-07-15 01:47:08 +0000809 a = (unsigned char)*code++;
810 a |= (unsigned char)*code++ << 8;
811 a = (int)SHORT(a);
812
813 p1 = code + a + 3; /* skip the failure_jump */
Guido van Rossum95e80531997-08-13 22:34:14 +0000814 /* Check that the jump is within the pattern */
815 if (p1<bufp->buffer || bufp->buffer+bufp->used<p1)
816 {
817 PyErr_SetString(PyExc_SystemError, "Regex VM jump out of bounds (failure_jump opt)");
818 return 0;
819 }
820
Guido van Rossumfaf49081997-07-15 01:47:08 +0000821 assert(p1[-3] == Cfailure_jump);
822 p2 = code;
823 /* p1 points inside loop, p2 points to after loop */
824 if (!re_do_compile_fastmap(bufp->buffer, bufp->used,
825 p2 - bufp->buffer, &can_be_null, map))
826 goto make_normal_jump;
827
828 /* If we might introduce a new update point inside the
829 * loop, we can't optimize because then update_jump would
830 * update a wrong failure point. Thus we have to be
831 * quite careful here.
832 */
833
834 /* loop until we find something that consumes a character */
Guido van Rossum004c1e11997-05-09 02:35:58 +0000835 loop_p1:
Guido van Rossumfaf49081997-07-15 01:47:08 +0000836 num_instructions++;
837 switch (*p1++)
838 {
839 case Cbol:
840 case Ceol:
841 case Cbegbuf:
842 case Cendbuf:
843 case Cwordbeg:
844 case Cwordend:
845 case Cwordbound:
846 case Cnotwordbound:
847 {
848 goto loop_p1;
849 }
850 case Cstart_memory:
851 case Cend_memory:
852 {
853 p1++;
854 goto loop_p1;
855 }
856 case Cexact:
857 {
858 ch = (unsigned char)*p1++;
859 if (map[(int)ch])
860 goto make_normal_jump;
861 break;
862 }
863 case Canychar:
864 {
865 for (b = 0; b < 256; b++)
866 if (b != '\n' && map[b])
867 goto make_normal_jump;
868 break;
869 }
870 case Cset:
871 {
872 for (b = 0; b < 256; b++)
873 if ((p1[b >> 3] & (1 << (b & 7))) && map[b])
874 goto make_normal_jump;
875 p1 += 256/8;
876 break;
877 }
878 default:
879 {
880 goto make_normal_jump;
881 }
882 }
883 /* now we know that we can't backtrack. */
884 while (p1 != p2 - 3)
885 {
886 num_instructions++;
887 switch (*p1++)
888 {
889 case Cend:
890 {
891 return 0;
892 }
893 case Cbol:
894 case Ceol:
895 case Canychar:
896 case Cbegbuf:
897 case Cendbuf:
898 case Cwordbeg:
899 case Cwordend:
900 case Cwordbound:
901 case Cnotwordbound:
902 {
903 break;
904 }
905 case Cset:
906 {
907 p1 += 256/8;
908 break;
909 }
910 case Cexact:
911 case Cstart_memory:
912 case Cend_memory:
913 case Cmatch_memory:
914 case Csyntaxspec:
915 case Cnotsyntaxspec:
916 {
917 p1++;
918 break;
919 }
920 case Cjump:
921 case Cstar_jump:
922 case Cfailure_jump:
923 case Cupdate_failure_jump:
924 case Cdummy_failure_jump:
925 {
926 goto make_normal_jump;
927 }
928 default:
929 {
930 return 0;
931 break;
932 }
933 }
934 }
935
Guido van Rossum95e80531997-08-13 22:34:14 +0000936 /* make_update_jump: */
Guido van Rossumfaf49081997-07-15 01:47:08 +0000937 code -= 3;
938 a += 3; /* jump to after the Cfailure_jump */
939 code[0] = Cupdate_failure_jump;
940 code[1] = a & 0xff;
941 code[2] = a >> 8;
942 if (num_instructions > 1)
943 return 1;
944 assert(num_instructions == 1);
945 /* if the only instruction matches a single character, we can do
946 * better */
947 p1 = code + 3 + a; /* start of sole instruction */
948 if (*p1 == Cset || *p1 == Cexact || *p1 == Canychar ||
949 *p1 == Csyntaxspec || *p1 == Cnotsyntaxspec)
950 code[0] = Crepeat1;
951 return 1;
952
Guido van Rossum004c1e11997-05-09 02:35:58 +0000953 make_normal_jump:
Guido van Rossumfaf49081997-07-15 01:47:08 +0000954 code -= 3;
955 *code = Cjump;
956 return 1;
Guido van Rossum004c1e11997-05-09 02:35:58 +0000957}
958
959static int re_optimize(regexp_t bufp)
960{
Guido van Rossum95e80531997-08-13 22:34:14 +0000961 unsigned char *code;
Guido van Rossumfaf49081997-07-15 01:47:08 +0000962
963 code = bufp->buffer;
964
965 while(1)
966 {
967 switch (*code++)
968 {
969 case Cend:
970 {
971 return 1;
972 }
973 case Canychar:
974 case Cbol:
975 case Ceol:
976 case Cbegbuf:
977 case Cendbuf:
978 case Cwordbeg:
979 case Cwordend:
980 case Cwordbound:
981 case Cnotwordbound:
982 {
983 break;
984 }
985 case Cset:
986 {
987 code += 256/8;
988 break;
989 }
990 case Cexact:
991 case Cstart_memory:
992 case Cend_memory:
993 case Cmatch_memory:
994 case Csyntaxspec:
995 case Cnotsyntaxspec:
996 {
997 code++;
998 break;
999 }
1000 case Cstar_jump:
1001 {
1002 if (!re_optimize_star_jump(bufp, code))
1003 {
1004 return 0;
1005 }
1006 /* fall through */
1007 }
1008 case Cupdate_failure_jump:
1009 case Cjump:
1010 case Cdummy_failure_jump:
1011 case Cfailure_jump:
1012 case Crepeat1:
1013 {
1014 code += 2;
1015 break;
1016 }
1017 default:
1018 {
1019 return 0;
1020 }
1021 }
1022 }
Guido van Rossum004c1e11997-05-09 02:35:58 +00001023}
1024
1025#define NEXTCHAR(var) \
1026{ \
Guido van Rossumfaf49081997-07-15 01:47:08 +00001027 if (pos >= size) \
1028 goto ends_prematurely; \
1029 (var) = regex[pos]; \
1030 pos++; \
Guido van Rossum004c1e11997-05-09 02:35:58 +00001031}
1032
1033#define ALLOC(amount) \
1034{ \
Guido van Rossumfaf49081997-07-15 01:47:08 +00001035 if (pattern_offset+(amount) > alloc) \
1036 { \
1037 alloc += 256 + (amount); \
1038 pattern = realloc(pattern, alloc); \
1039 if (!pattern) \
1040 goto out_of_memory; \
1041 } \
Guido van Rossum004c1e11997-05-09 02:35:58 +00001042}
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001043
1044#define STORE(ch) pattern[pattern_offset++] = (ch)
1045
1046#define CURRENT_LEVEL_START (starts[starts_base + current_level])
1047
1048#define SET_LEVEL_START starts[starts_base + current_level] = pattern_offset
1049
Guido van Rossum004c1e11997-05-09 02:35:58 +00001050#define PUSH_LEVEL_STARTS \
Guido van Rossumfaf49081997-07-15 01:47:08 +00001051if (starts_base < (MAX_NESTING-1)*NUM_LEVELS) \
1052 starts_base += NUM_LEVELS; \
1053else \
1054 goto too_complex \
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001055
1056#define POP_LEVEL_STARTS starts_base -= NUM_LEVELS
1057
Guido van Rossum004c1e11997-05-09 02:35:58 +00001058#define PUT_ADDR(offset,addr) \
1059{ \
Guido van Rossumfaf49081997-07-15 01:47:08 +00001060 int disp = (addr) - (offset) - 2; \
1061 pattern[(offset)] = disp & 0xff; \
1062 pattern[(offset)+1] = (disp>>8) & 0xff; \
Guido van Rossum004c1e11997-05-09 02:35:58 +00001063}
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001064
Guido van Rossum004c1e11997-05-09 02:35:58 +00001065#define INSERT_JUMP(pos,type,addr) \
1066{ \
Guido van Rossumfaf49081997-07-15 01:47:08 +00001067 int a, p = (pos), t = (type), ad = (addr); \
1068 for (a = pattern_offset - 1; a >= p; a--) \
1069 pattern[a + 3] = pattern[a]; \
1070 pattern[p] = t; \
1071 PUT_ADDR(p+1,ad); \
1072 pattern_offset += 3; \
Guido van Rossum004c1e11997-05-09 02:35:58 +00001073}
Guido van Rossumfaf49081997-07-15 01:47:08 +00001074
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001075#define SETBIT(buf,offset,bit) (buf)[(offset)+(bit)/8] |= (1<<((bit) & 7))
1076
Guido van Rossum004c1e11997-05-09 02:35:58 +00001077#define SET_FIELDS \
1078{ \
Guido van Rossumfaf49081997-07-15 01:47:08 +00001079 bufp->allocated = alloc; \
1080 bufp->buffer = pattern; \
1081 bufp->used = pattern_offset; \
Guido van Rossum004c1e11997-05-09 02:35:58 +00001082}
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001083
Guido van Rossum004c1e11997-05-09 02:35:58 +00001084#define GETHEX(var) \
1085{ \
Guido van Rossum95e80531997-08-13 22:34:14 +00001086 unsigned char gethex_ch, gethex_value; \
Guido van Rossumfaf49081997-07-15 01:47:08 +00001087 NEXTCHAR(gethex_ch); \
1088 gethex_value = hex_char_to_decimal(gethex_ch); \
1089 if (gethex_value == 16) \
1090 goto hex_error; \
1091 NEXTCHAR(gethex_ch); \
1092 gethex_ch = hex_char_to_decimal(gethex_ch); \
1093 if (gethex_ch == 16) \
1094 goto hex_error; \
1095 (var) = gethex_value * 16 + gethex_ch; \
Guido van Rossum004c1e11997-05-09 02:35:58 +00001096}
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001097
Guido van Rossumfaf49081997-07-15 01:47:08 +00001098#define ANSI_TRANSLATE(ch) \
Guido van Rossum004c1e11997-05-09 02:35:58 +00001099{ \
Guido van Rossumfaf49081997-07-15 01:47:08 +00001100 switch (ch) \
1101 { \
1102 case 'a': \
1103 case 'A': \
1104 { \
1105 ch = 7; /* audible bell */ \
1106 break; \
1107 } \
1108 case 'b': \
1109 case 'B': \
1110 { \
1111 ch = 8; /* backspace */ \
1112 break; \
1113 } \
1114 case 'f': \
1115 case 'F': \
1116 { \
1117 ch = 12; /* form feed */ \
1118 break; \
1119 } \
1120 case 'n': \
1121 case 'N': \
1122 { \
1123 ch = 10; /* line feed */ \
1124 break; \
1125 } \
1126 case 'r': \
1127 case 'R': \
1128 { \
1129 ch = 13; /* carriage return */ \
1130 break; \
1131 } \
1132 case 't': \
1133 case 'T': \
1134 { \
1135 ch = 9; /* tab */ \
1136 break; \
1137 } \
1138 case 'v': \
1139 case 'V': \
1140 { \
1141 ch = 11; /* vertical tab */ \
1142 break; \
1143 } \
1144 case 'x': /* hex code */ \
1145 case 'X': \
1146 { \
1147 GETHEX(ch); \
1148 break; \
1149 } \
1150 default: \
1151 { \
1152 /* other characters passed through */ \
1153 if (translate) \
1154 ch = translate[(unsigned char)ch]; \
1155 break; \
1156 } \
1157 } \
Guido van Rossum004c1e11997-05-09 02:35:58 +00001158}
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001159
Guido van Rossum95e80531997-08-13 22:34:14 +00001160unsigned char *re_compile_pattern(unsigned char *regex, int size, regexp_t bufp)
Guido van Rossum004c1e11997-05-09 02:35:58 +00001161{
Guido van Rossumfaf49081997-07-15 01:47:08 +00001162 int a;
1163 int pos;
1164 int op;
1165 int current_level;
1166 int level;
1167 int opcode;
1168 int pattern_offset = 0, alloc;
1169 int starts[NUM_LEVELS * MAX_NESTING];
1170 int starts_base;
1171 int future_jumps[MAX_NESTING];
1172 int num_jumps;
1173 unsigned char ch = '\0';
Guido van Rossum95e80531997-08-13 22:34:14 +00001174 unsigned char *pattern;
1175 unsigned char *translate;
Guido van Rossumfaf49081997-07-15 01:47:08 +00001176 int next_register;
1177 int paren_depth;
1178 int num_open_registers;
1179 int open_registers[RE_NREGS];
1180 int beginning_context;
1181
1182 if (!re_compile_initialized)
1183 re_compile_initialize();
1184 bufp->used = 0;
1185 bufp->fastmap_accurate = 0;
1186 bufp->uses_registers = 1;
1187 bufp->num_registers = 1;
1188 translate = bufp->translate;
1189 pattern = bufp->buffer;
1190 alloc = bufp->allocated;
1191 if (alloc == 0 || pattern == NULL)
1192 {
1193 alloc = 256;
1194 pattern = malloc(alloc);
1195 if (!pattern)
1196 goto out_of_memory;
1197 }
1198 pattern_offset = 0;
1199 starts_base = 0;
1200 num_jumps = 0;
1201 current_level = 0;
1202 SET_LEVEL_START;
1203 num_open_registers = 0;
1204 next_register = 1;
1205 paren_depth = 0;
1206 beginning_context = 1;
1207 op = -1;
1208 /* we use Rend dummy to ensure that pending jumps are updated
1209 (due to low priority of Rend) before exiting the loop. */
1210 pos = 0;
1211 while (op != Rend)
1212 {
1213 if (pos >= size)
1214 op = Rend;
1215 else
1216 {
1217 NEXTCHAR(ch);
1218 if (translate)
1219 ch = translate[(unsigned char)ch];
1220 op = regexp_plain_ops[(unsigned char)ch];
1221 if (op == Rquote)
1222 {
1223 NEXTCHAR(ch);
1224 op = regexp_quoted_ops[(unsigned char)ch];
1225 if (op == Rnormal && regexp_ansi_sequences)
1226 ANSI_TRANSLATE(ch);
1227 }
1228 }
1229 level = regexp_precedences[op];
1230 /* printf("ch='%c' op=%d level=%d current_level=%d
1231 curlevstart=%d\n", ch, op, level, current_level,
1232 CURRENT_LEVEL_START); */
1233 if (level > current_level)
1234 {
1235 for (current_level++; current_level < level; current_level++)
1236 SET_LEVEL_START;
1237 SET_LEVEL_START;
1238 }
1239 else
1240 if (level < current_level)
1241 {
1242 current_level = level;
1243 for (;num_jumps > 0 &&
1244 future_jumps[num_jumps-1] >= CURRENT_LEVEL_START;
1245 num_jumps--)
1246 PUT_ADDR(future_jumps[num_jumps-1], pattern_offset);
1247 }
1248 switch (op)
1249 {
1250 case Rend:
1251 {
1252 break;
1253 }
1254 case Rnormal:
1255 {
1256 normal_char:
1257 opcode = Cexact;
1258 store_opcode_and_arg: /* opcode & ch must be set */
1259 SET_LEVEL_START;
1260 ALLOC(2);
1261 STORE(opcode);
1262 STORE(ch);
1263 break;
1264 }
1265 case Ranychar:
1266 {
1267 opcode = Canychar;
1268 store_opcode:
1269 SET_LEVEL_START;
1270 ALLOC(1);
1271 STORE(opcode);
1272 break;
1273 }
1274 case Rquote:
1275 {
1276 abort();
1277 /*NOTREACHED*/
1278 }
1279 case Rbol:
1280 {
1281 if (!beginning_context)
1282 if (regexp_context_indep_ops)
1283 goto op_error;
1284 else
1285 goto normal_char;
1286 opcode = Cbol;
1287 goto store_opcode;
1288 }
1289 case Reol:
1290 {
1291 if (!((pos >= size) ||
1292 ((regexp_syntax & RE_NO_BK_VBAR) ?
1293 (regex[pos] == '\174') :
1294 (pos+1 < size && regex[pos] == '\134' &&
1295 regex[pos+1] == '\174')) ||
1296 ((regexp_syntax & RE_NO_BK_PARENS)?
1297 (regex[pos] == ')'):
1298 (pos+1 < size && regex[pos] == '\134' &&
1299 regex[pos+1] == ')'))))
1300 if (regexp_context_indep_ops)
1301 goto op_error;
1302 else
1303 goto normal_char;
1304 opcode = Ceol;
1305 goto store_opcode;
1306 /* NOTREACHED */
1307 break;
1308 }
1309 case Roptional:
1310 {
1311 if (beginning_context)
1312 if (regexp_context_indep_ops)
1313 goto op_error;
1314 else
1315 goto normal_char;
1316 if (CURRENT_LEVEL_START == pattern_offset)
1317 break; /* ignore empty patterns for ? */
1318 ALLOC(3);
1319 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump,
1320 pattern_offset + 3);
1321 break;
1322 }
1323 case Rstar:
1324 case Rplus:
1325 {
1326 if (beginning_context)
1327 if (regexp_context_indep_ops)
1328 goto op_error;
1329 else
1330 goto normal_char;
1331 if (CURRENT_LEVEL_START == pattern_offset)
1332 break; /* ignore empty patterns for + and * */
1333 ALLOC(9);
1334 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump,
1335 pattern_offset + 6);
1336 INSERT_JUMP(pattern_offset, Cstar_jump, CURRENT_LEVEL_START);
1337 if (op == Rplus) /* jump over initial failure_jump */
1338 INSERT_JUMP(CURRENT_LEVEL_START, Cdummy_failure_jump,
1339 CURRENT_LEVEL_START + 6);
1340 break;
1341 }
1342 case Ror:
1343 {
1344 ALLOC(6);
1345 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump,
1346 pattern_offset + 6);
1347 if (num_jumps >= MAX_NESTING)
1348 goto too_complex;
1349 STORE(Cjump);
1350 future_jumps[num_jumps++] = pattern_offset;
1351 STORE(0);
1352 STORE(0);
1353 SET_LEVEL_START;
1354 break;
1355 }
1356 case Ropenpar:
1357 {
1358 SET_LEVEL_START;
1359 if (next_register < RE_NREGS)
1360 {
1361 bufp->uses_registers = 1;
1362 ALLOC(2);
1363 STORE(Cstart_memory);
1364 STORE(next_register);
1365 open_registers[num_open_registers++] = next_register;
1366 bufp->num_registers++;
1367 next_register++;
1368 }
1369 paren_depth++;
1370 PUSH_LEVEL_STARTS;
1371 current_level = 0;
1372 SET_LEVEL_START;
1373 break;
1374 }
1375 case Rclosepar:
1376 {
1377 if (paren_depth <= 0)
1378 goto parenthesis_error;
1379 POP_LEVEL_STARTS;
1380 current_level = regexp_precedences[Ropenpar];
1381 paren_depth--;
1382 if (paren_depth < num_open_registers)
1383 {
1384 bufp->uses_registers = 1;
1385 ALLOC(2);
1386 STORE(Cend_memory);
1387 num_open_registers--;
1388 STORE(open_registers[num_open_registers]);
1389 }
1390 break;
1391 }
1392 case Rmemory:
1393 {
1394 if (ch == '0')
1395 goto bad_match_register;
1396 assert(ch >= '0' && ch <= '9');
1397 bufp->uses_registers = 1;
1398 opcode = Cmatch_memory;
1399 ch -= '0';
1400 goto store_opcode_and_arg;
1401 }
1402 case Rextended_memory:
1403 {
1404 NEXTCHAR(ch);
1405 if (ch < '0' || ch > '9')
1406 goto bad_match_register;
1407 NEXTCHAR(a);
1408 if (a < '0' || a > '9')
1409 goto bad_match_register;
1410 ch = 10 * (a - '0') + ch - '0';
1411 if (ch <= 0 || ch >= RE_NREGS)
1412 goto bad_match_register;
1413 bufp->uses_registers = 1;
1414 opcode = Cmatch_memory;
1415 goto store_opcode_and_arg;
1416 }
1417 case Ropenset:
1418 {
1419 int complement;
1420 int prev;
1421 int offset;
1422 int range;
1423 int firstchar;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001424
Guido van Rossumfaf49081997-07-15 01:47:08 +00001425 SET_LEVEL_START;
1426 ALLOC(1+256/8);
1427 STORE(Cset);
1428 offset = pattern_offset;
1429 for (a = 0; a < 256/8; a++)
1430 STORE(0);
1431 NEXTCHAR(ch);
1432 if (translate)
1433 ch = translate[(unsigned char)ch];
1434 if (ch == '\136')
1435 {
1436 complement = 1;
1437 NEXTCHAR(ch);
1438 if (translate)
1439 ch = translate[(unsigned char)ch];
1440 }
1441 else
1442 complement = 0;
1443 prev = -1;
1444 range = 0;
1445 firstchar = 1;
1446 while (ch != '\135' || firstchar)
1447 {
1448 firstchar = 0;
1449 if (regexp_ansi_sequences && ch == '\134')
1450 {
1451 NEXTCHAR(ch);
1452 ANSI_TRANSLATE(ch);
1453 }
1454 if (range)
1455 {
1456 for (a = prev; a <= (int)ch; a++)
1457 SETBIT(pattern, offset, a);
1458 prev = -1;
1459 range = 0;
1460 }
1461 else
1462 if (prev != -1 && ch == '-')
1463 range = 1;
1464 else
1465 {
1466 SETBIT(pattern, offset, ch);
1467 prev = ch;
1468 }
1469 NEXTCHAR(ch);
1470 if (translate)
1471 ch = translate[(unsigned char)ch];
1472 }
1473 if (range)
1474 SETBIT(pattern, offset, '-');
1475 if (complement)
1476 {
1477 for (a = 0; a < 256/8; a++)
1478 pattern[offset+a] ^= 0xff;
1479 }
1480 break;
1481 }
1482 case Rbegbuf:
1483 {
1484 opcode = Cbegbuf;
1485 goto store_opcode;
1486 }
1487 case Rendbuf:
1488 {
1489 opcode = Cendbuf;
1490 goto store_opcode;
1491 }
1492 case Rwordchar:
1493 {
1494 opcode = Csyntaxspec;
1495 ch = Sword;
1496 goto store_opcode_and_arg;
1497 }
1498 case Rnotwordchar:
1499 {
1500 opcode = Cnotsyntaxspec;
1501 ch = Sword;
1502 goto store_opcode_and_arg;
1503 }
1504 case Rwordbeg:
1505 {
1506 opcode = Cwordbeg;
1507 goto store_opcode;
1508 }
1509 case Rwordend:
1510 {
1511 opcode = Cwordend;
1512 goto store_opcode;
1513 }
1514 case Rwordbound:
1515 {
1516 opcode = Cwordbound;
1517 goto store_opcode;
1518 }
1519 case Rnotwordbound:
1520 {
1521 opcode = Cnotwordbound;
1522 goto store_opcode;
1523 }
1524 default:
1525 {
1526 abort();
1527 }
1528 }
1529 beginning_context = (op == Ropenpar || op == Ror);
1530 }
1531 if (starts_base != 0)
1532 goto parenthesis_error;
1533 assert(num_jumps == 0);
1534 ALLOC(1);
1535 STORE(Cend);
1536 SET_FIELDS;
1537 if(!re_optimize(bufp))
Guido van Rossumed2554a1997-08-18 15:31:24 +00001538 return (unsigned char *)"Optimization error";
Guido van Rossumfaf49081997-07-15 01:47:08 +00001539 return NULL;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001540
Guido van Rossum004c1e11997-05-09 02:35:58 +00001541 op_error:
Guido van Rossumfaf49081997-07-15 01:47:08 +00001542 SET_FIELDS;
Guido van Rossumed2554a1997-08-18 15:31:24 +00001543 return (unsigned char *)"Badly placed special character";
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001544
Guido van Rossum004c1e11997-05-09 02:35:58 +00001545 bad_match_register:
Guido van Rossumfaf49081997-07-15 01:47:08 +00001546 SET_FIELDS;
Guido van Rossumed2554a1997-08-18 15:31:24 +00001547 return (unsigned char *)"Bad match register number";
Guido van Rossum004c1e11997-05-09 02:35:58 +00001548
1549 hex_error:
Guido van Rossumfaf49081997-07-15 01:47:08 +00001550 SET_FIELDS;
Guido van Rossumed2554a1997-08-18 15:31:24 +00001551 return (unsigned char *)"Bad hexadecimal number";
Guido van Rossum004c1e11997-05-09 02:35:58 +00001552
1553 parenthesis_error:
Guido van Rossumfaf49081997-07-15 01:47:08 +00001554 SET_FIELDS;
Guido van Rossumed2554a1997-08-18 15:31:24 +00001555 return (unsigned char *)"Badly placed parenthesis";
Guido van Rossum004c1e11997-05-09 02:35:58 +00001556
1557 out_of_memory:
Guido van Rossumfaf49081997-07-15 01:47:08 +00001558 SET_FIELDS;
Guido van Rossumed2554a1997-08-18 15:31:24 +00001559 return (unsigned char *)"Out of memory";
Guido van Rossum004c1e11997-05-09 02:35:58 +00001560
1561 ends_prematurely:
Guido van Rossumfaf49081997-07-15 01:47:08 +00001562 SET_FIELDS;
Guido van Rossumed2554a1997-08-18 15:31:24 +00001563 return (unsigned char *)"Regular expression ends prematurely";
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001564
Guido van Rossum004c1e11997-05-09 02:35:58 +00001565 too_complex:
Guido van Rossumfaf49081997-07-15 01:47:08 +00001566 SET_FIELDS;
Guido van Rossumed2554a1997-08-18 15:31:24 +00001567 return (unsigned char *)"Regular expression too complex";
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001568}
Guido van Rossum004c1e11997-05-09 02:35:58 +00001569
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001570#undef CHARAT
1571#undef NEXTCHAR
1572#undef GETHEX
1573#undef ALLOC
1574#undef STORE
1575#undef CURRENT_LEVEL_START
1576#undef SET_LEVEL_START
1577#undef PUSH_LEVEL_STARTS
1578#undef POP_LEVEL_STARTS
1579#undef PUT_ADDR
1580#undef INSERT_JUMP
1581#undef SETBIT
1582#undef SET_FIELDS
1583
Guido van Rossum004c1e11997-05-09 02:35:58 +00001584#define PREFETCH if (text == textend) goto fail
1585
1586#define NEXTCHAR(var) \
1587PREFETCH; \
1588var = (unsigned char)*text++; \
1589if (translate) \
Guido van Rossumfaf49081997-07-15 01:47:08 +00001590 var = translate[var]
Guido van Rossum004c1e11997-05-09 02:35:58 +00001591
1592int re_match(regexp_t bufp,
Guido van Rossum95e80531997-08-13 22:34:14 +00001593 unsigned char *string,
Guido van Rossum004c1e11997-05-09 02:35:58 +00001594 int size,
1595 int pos,
1596 regexp_registers_t old_regs)
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001597{
Guido van Rossum95e80531997-08-13 22:34:14 +00001598 unsigned char *code;
1599 unsigned char *translate;
1600 unsigned char *text;
1601 unsigned char *textstart;
1602 unsigned char *textend;
Guido van Rossumfaf49081997-07-15 01:47:08 +00001603 int a;
1604 int b;
1605 int ch;
1606 int reg;
1607 int match_end;
Guido van Rossum95e80531997-08-13 22:34:14 +00001608 unsigned char *regstart;
1609 unsigned char *regend;
Guido van Rossumfaf49081997-07-15 01:47:08 +00001610 int regsize;
1611 match_state state;
Guido van Rossum004c1e11997-05-09 02:35:58 +00001612
Guido van Rossumfaf49081997-07-15 01:47:08 +00001613 assert(pos >= 0 && size >= 0);
1614 assert(pos <= size);
Guido van Rossum004c1e11997-05-09 02:35:58 +00001615
Guido van Rossumfaf49081997-07-15 01:47:08 +00001616 text = string + pos;
1617 textstart = string;
1618 textend = string + size;
Guido van Rossum004c1e11997-05-09 02:35:58 +00001619
Guido van Rossumfaf49081997-07-15 01:47:08 +00001620 code = bufp->buffer;
Guido van Rossum004c1e11997-05-09 02:35:58 +00001621
Guido van Rossumfaf49081997-07-15 01:47:08 +00001622 translate = bufp->translate;
Guido van Rossum004c1e11997-05-09 02:35:58 +00001623
Guido van Rossumfaf49081997-07-15 01:47:08 +00001624 NEW_STATE(state, bufp->num_registers);
1625
Guido van Rossum004c1e11997-05-09 02:35:58 +00001626 continue_matching:
Guido van Rossumfaf49081997-07-15 01:47:08 +00001627 switch (*code++)
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001628 {
Guido van Rossumfaf49081997-07-15 01:47:08 +00001629 case Cend:
Guido van Rossum004c1e11997-05-09 02:35:58 +00001630 {
Guido van Rossumfaf49081997-07-15 01:47:08 +00001631 match_end = text - textstart;
1632 if (old_regs)
1633 {
1634 old_regs->start[0] = pos;
1635 old_regs->end[0] = match_end;
1636 if (!bufp->uses_registers)
1637 {
1638 for (a = 1; a < RE_NREGS; a++)
1639 {
1640 old_regs->start[a] = -1;
1641 old_regs->end[a] = -1;
1642 }
1643 }
1644 else
1645 {
1646 for (a = 1; a < bufp->num_registers; a++)
1647 {
1648 if ((GET_REG_START(state, a) == NULL) ||
1649 (GET_REG_END(state, a) == NULL))
1650 {
1651 old_regs->start[a] = -1;
1652 old_regs->end[a] = -1;
1653 continue;
1654 }
1655 old_regs->start[a] = GET_REG_START(state, a) - textstart;
1656 old_regs->end[a] = GET_REG_END(state, a) - textstart;
1657 }
1658 for (; a < RE_NREGS; a++)
1659 {
1660 old_regs->start[a] = -1;
1661 old_regs->end[a] = -1;
1662 }
1663 }
1664 }
1665 FREE_STATE(state);
1666 return match_end - pos;
Guido van Rossum004c1e11997-05-09 02:35:58 +00001667 }
Guido van Rossumfaf49081997-07-15 01:47:08 +00001668 case Cbol:
1669 {
1670 if (text == textstart || text[-1] == '\n')
1671 goto continue_matching;
1672 goto fail;
1673 }
1674 case Ceol:
1675 {
1676 if (text == textend || *text == '\n')
1677 goto continue_matching;
1678 goto fail;
1679 }
1680 case Cset:
1681 {
1682 NEXTCHAR(ch);
1683 if (code[ch/8] & (1<<(ch & 7)))
1684 {
1685 code += 256/8;
1686 goto continue_matching;
1687 }
1688 goto fail;
1689 }
1690 case Cexact:
1691 {
1692 NEXTCHAR(ch);
1693 if (ch != (unsigned char)*code++)
1694 goto fail;
1695 goto continue_matching;
1696 }
1697 case Canychar:
1698 {
1699 NEXTCHAR(ch);
1700 if (ch == '\n')
1701 goto fail;
1702 goto continue_matching;
1703 }
1704 case Cstart_memory:
1705 {
1706 reg = *code++;
1707 SET_REG_START(state, reg, text, goto error);
1708 goto continue_matching;
1709 }
1710 case Cend_memory:
1711 {
1712 reg = *code++;
1713 SET_REG_END(state, reg, text, goto error);
1714 goto continue_matching;
1715 }
1716 case Cmatch_memory:
1717 {
1718 reg = *code++;
1719 regstart = GET_REG_START(state, reg);
1720 regend = GET_REG_END(state, reg);
1721 if ((regstart == NULL) || (regend == NULL))
1722 goto fail; /* or should we just match nothing? */
1723 regsize = regend - regstart;
Guido van Rossum004c1e11997-05-09 02:35:58 +00001724
Guido van Rossumfaf49081997-07-15 01:47:08 +00001725 if (regsize > (textend - text))
1726 goto fail;
1727 if(translate)
1728 {
1729 for (; regstart < regend; regstart++, text++)
1730 if (translate[*regstart] != translate[*text])
1731 goto fail;
1732 }
1733 else
1734 for (; regstart < regend; regstart++, text++)
1735 if (*regstart != *text)
1736 goto fail;
1737 goto continue_matching;
Guido van Rossum004c1e11997-05-09 02:35:58 +00001738 }
Guido van Rossumfaf49081997-07-15 01:47:08 +00001739 case Cupdate_failure_jump:
Guido van Rossumdb25f321997-07-10 14:31:32 +00001740 {
Guido van Rossumfaf49081997-07-15 01:47:08 +00001741 UPDATE_FAILURE(state, text, goto error);
1742 /* fall to next case */
Guido van Rossumdb25f321997-07-10 14:31:32 +00001743 }
Guido van Rossumfaf49081997-07-15 01:47:08 +00001744 /* treat Cstar_jump just like Cjump if it hasn't been optimized */
1745 case Cstar_jump:
1746 case Cjump:
1747 {
1748 a = (unsigned char)*code++;
1749 a |= (unsigned char)*code++ << 8;
1750 code += (int)SHORT(a);
Guido van Rossum95e80531997-08-13 22:34:14 +00001751 if (code<bufp->buffer || bufp->buffer+bufp->used<code) {
1752 PyErr_SetString(PyExc_SystemError, "Regex VM jump out of bounds (Cjump)");
1753 FREE_STATE(state);
1754 return -2;
1755 }
Guido van Rossumfaf49081997-07-15 01:47:08 +00001756 goto continue_matching;
1757 }
1758 case Cdummy_failure_jump:
1759 {
Guido van Rossum95e80531997-08-13 22:34:14 +00001760 unsigned char *failuredest;
1761
Guido van Rossumfaf49081997-07-15 01:47:08 +00001762 a = (unsigned char)*code++;
1763 a |= (unsigned char)*code++ << 8;
1764 a = (int)SHORT(a);
1765 assert(*code == Cfailure_jump);
1766 b = (unsigned char)code[1];
1767 b |= (unsigned char)code[2] << 8;
Guido van Rossum95e80531997-08-13 22:34:14 +00001768 failuredest = code + (int)SHORT(b) + 3;
1769 if (failuredest<bufp->buffer || bufp->buffer+bufp->used < failuredest) {
1770 PyErr_SetString(PyExc_SystemError, "Regex VM jump out of bounds (Cdummy_failure_jump failuredest)");
1771 FREE_STATE(state);
1772 return -2;
1773 }
1774 PUSH_FAILURE(state, failuredest, NULL, goto error);
Guido van Rossumfaf49081997-07-15 01:47:08 +00001775 code += a;
Guido van Rossum95e80531997-08-13 22:34:14 +00001776 if (code<bufp->buffer || bufp->buffer+bufp->used < code) {
1777 PyErr_SetString(PyExc_SystemError, "Regex VM jump out of bounds (Cdummy_failure_jump code)");
1778 FREE_STATE(state);
1779 return -2;
1780 }
Guido van Rossumfaf49081997-07-15 01:47:08 +00001781 goto continue_matching;
1782 }
1783 case Cfailure_jump:
1784 {
1785 a = (unsigned char)*code++;
1786 a |= (unsigned char)*code++ << 8;
1787 a = (int)SHORT(a);
Guido van Rossum95e80531997-08-13 22:34:14 +00001788 if (code+a<bufp->buffer || bufp->buffer+bufp->used < code+a) {
1789 PyErr_SetString(PyExc_SystemError, "Regex VM jump out of bounds (Cfailure_jump)");
1790 FREE_STATE(state);
1791 return -2;
1792 }
Guido van Rossumfaf49081997-07-15 01:47:08 +00001793 PUSH_FAILURE(state, code + a, text, goto error);
1794 goto continue_matching;
1795 }
1796 case Crepeat1:
1797 {
Guido van Rossum95e80531997-08-13 22:34:14 +00001798 unsigned char *pinst;
Guido van Rossumfaf49081997-07-15 01:47:08 +00001799 a = (unsigned char)*code++;
1800 a |= (unsigned char)*code++ << 8;
1801 a = (int)SHORT(a);
1802 pinst = code + a;
Guido van Rossum95e80531997-08-13 22:34:14 +00001803 if (pinst<bufp->buffer || bufp->buffer+bufp->used<pinst) {
1804 PyErr_SetString(PyExc_SystemError, "Regex VM jump out of bounds (Crepeat1)");
1805 FREE_STATE(state);
1806 return -2;
1807 }
Guido van Rossumfaf49081997-07-15 01:47:08 +00001808 /* pinst is sole instruction in loop, and it matches a
1809 * single character. Since Crepeat1 was originally a
1810 * Cupdate_failure_jump, we also know that backtracking
1811 * is useless: so long as the single-character
1812 * expression matches, it must be used. Also, in the
1813 * case of +, we've already matched one character, so +
1814 * can't fail: nothing here can cause a failure. */
1815 switch (*pinst++)
1816 {
1817 case Cset:
Guido van Rossum95e80531997-08-13 22:34:14 +00001818 {
1819 if (translate)
Guido van Rossumfaf49081997-07-15 01:47:08 +00001820 {
1821 while (text < textend)
1822 {
1823 ch = translate[(unsigned char)*text];
1824 if (pinst[ch/8] & (1<<(ch & 7)))
1825 text++;
1826 else
1827 break;
1828 }
1829 }
1830 else
1831 {
1832 while (text < textend)
1833 {
1834 ch = (unsigned char)*text;
1835 if (pinst[ch/8] & (1<<(ch & 7)))
1836 text++;
1837 else
1838 break;
1839 }
1840 }
1841 break;
Guido van Rossum95e80531997-08-13 22:34:14 +00001842 }
Guido van Rossumfaf49081997-07-15 01:47:08 +00001843 case Cexact:
1844 {
1845 ch = (unsigned char)*pinst;
1846 if (translate)
1847 {
1848 while (text < textend &&
1849 translate[(unsigned char)*text] == ch)
1850 text++;
1851 }
1852 else
1853 {
1854 while (text < textend && (unsigned char)*text == ch)
1855 text++;
1856 }
1857 break;
1858 }
1859 case Canychar:
1860 {
1861 while (text < textend && (unsigned char)*text != '\n')
1862 text++;
1863 break;
1864 }
1865 case Csyntaxspec:
1866 {
1867 a = (unsigned char)*pinst;
1868 if (translate)
1869 {
1870 while (text < textend &&
1871 translate[SYNTAX(*text)] == a)
1872 text++;
1873 }
1874 else
1875 {
1876 while (text < textend && SYNTAX(*text) == a)
1877 text++;
1878 }
1879 break;
1880 }
1881 case Cnotsyntaxspec:
1882 {
1883 a = (unsigned char)*pinst;
1884 if (translate)
1885 {
1886 while (text < textend &&
1887 translate[SYNTAX(*text)] != a)
1888 text++;
1889 }
1890 else
1891 {
1892 while (text < textend && SYNTAX(*text) != a)
1893 text++;
1894 }
1895 break;
1896 }
1897 default:
1898 {
Guido van Rossum95e80531997-08-13 22:34:14 +00001899 FREE_STATE(state);
1900 PyErr_SetString(PyExc_SystemError, "Unknown regex opcode: memory corrupted?");
1901 return -2;
Guido van Rossumfaf49081997-07-15 01:47:08 +00001902 /*NOTREACHED*/
1903 }
1904 }
1905 /* due to the funky way + and * are compiled, the top
1906 * failure- stack entry at this point is actually a
1907 * success entry -- update it & pop it */
1908 UPDATE_FAILURE(state, text, goto error);
1909 goto fail; /* i.e., succeed <wink/sigh> */
1910 }
1911 case Cbegbuf:
1912 {
1913 if (text == textstart)
1914 goto continue_matching;
1915 goto fail;
1916 }
1917 case Cendbuf:
1918 {
1919 if (text == textend)
1920 goto continue_matching;
1921 goto fail;
1922 }
1923 case Cwordbeg:
1924 {
1925 if (text == textend)
1926 goto fail;
Guido van Rossum95e80531997-08-13 22:34:14 +00001927 if (!(SYNTAX(*text) & Sword))
Guido van Rossumfaf49081997-07-15 01:47:08 +00001928 goto fail;
1929 if (text == textstart)
1930 goto continue_matching;
Guido van Rossum74fb3031997-07-17 22:41:38 +00001931 if (!(SYNTAX(text[-1]) & Sword))
Guido van Rossumfaf49081997-07-15 01:47:08 +00001932 goto continue_matching;
1933 goto fail;
1934 }
1935 case Cwordend:
1936 {
1937 if (text == textstart)
1938 goto fail;
Guido van Rossum74fb3031997-07-17 22:41:38 +00001939 if (!(SYNTAX(text[-1]) & Sword))
Guido van Rossumfaf49081997-07-15 01:47:08 +00001940 goto fail;
1941 if (text == textend)
1942 goto continue_matching;
Guido van Rossum95e80531997-08-13 22:34:14 +00001943 if (!(SYNTAX(*text) & Sword))
1944 goto continue_matching;
1945 goto fail;
Guido van Rossumfaf49081997-07-15 01:47:08 +00001946 }
1947 case Cwordbound:
1948 {
1949 /* Note: as in gnu regexp, this also matches at the
1950 * beginning and end of buffer. */
Guido van Rossum004c1e11997-05-09 02:35:58 +00001951
Guido van Rossumfaf49081997-07-15 01:47:08 +00001952 if (text == textstart || text == textend)
1953 goto continue_matching;
Guido van Rossum74fb3031997-07-17 22:41:38 +00001954 if ((SYNTAX(text[-1]) & Sword) ^ (SYNTAX(*text) & Sword))
Guido van Rossumfaf49081997-07-15 01:47:08 +00001955 goto continue_matching;
1956 goto fail;
1957 }
1958 case Cnotwordbound:
1959 {
1960 /* Note: as in gnu regexp, this never matches at the
1961 * beginning and end of buffer. */
1962 if (text == textstart || text == textend)
1963 goto fail;
Guido van Rossum74fb3031997-07-17 22:41:38 +00001964 if (!((SYNTAX(text[-1]) & Sword) ^ (SYNTAX(*text) & Sword)))
Guido van Rossum53665e51997-08-15 15:45:25 +00001965 goto continue_matching;
1966 goto fail;
Guido van Rossumfaf49081997-07-15 01:47:08 +00001967 }
1968 case Csyntaxspec:
1969 {
1970 NEXTCHAR(ch);
Guido van Rossum74fb3031997-07-17 22:41:38 +00001971 if (!(SYNTAX(ch) & (unsigned char)*code++))
Guido van Rossumfaf49081997-07-15 01:47:08 +00001972 goto fail;
1973 goto continue_matching;
1974 }
1975 case Cnotsyntaxspec:
1976 {
1977 NEXTCHAR(ch);
Guido van Rossum74fb3031997-07-17 22:41:38 +00001978 if (SYNTAX(ch) & (unsigned char)*code++)
Guido van Rossum95e80531997-08-13 22:34:14 +00001979 goto fail;
Guido van Rossumfaf49081997-07-15 01:47:08 +00001980 goto continue_matching;
1981 }
1982 default:
1983 {
Guido van Rossum95e80531997-08-13 22:34:14 +00001984 FREE_STATE(state);
1985 PyErr_SetString(PyExc_SystemError, "Unknown regex opcode: memory corrupted?");
1986 return -2;
Guido van Rossumfaf49081997-07-15 01:47:08 +00001987 /*NOTREACHED*/
1988 }
1989 }
Guido van Rossum95e80531997-08-13 22:34:14 +00001990
1991
Guido van Rossum004c1e11997-05-09 02:35:58 +00001992
Guido van Rossum3b1a57a1992-01-27 16:47:46 +00001993#if 0 /* This line is never reached --Guido */
Guido van Rossumfaf49081997-07-15 01:47:08 +00001994 abort();
Guido van Rossum5f21dd11992-01-19 16:49:14 +00001995#endif
Guido van Rossumfaf49081997-07-15 01:47:08 +00001996 /*
1997 *NOTREACHED
1998 */
Guido van Rossum95e80531997-08-13 22:34:14 +00001999
2000 /* Using "break;" in the above switch statement is equivalent to "goto fail;" */
Guido van Rossum004c1e11997-05-09 02:35:58 +00002001 fail:
Guido van Rossumfaf49081997-07-15 01:47:08 +00002002 POP_FAILURE(state, code, text, goto done_matching, goto error);
2003 goto continue_matching;
Guido van Rossum004c1e11997-05-09 02:35:58 +00002004
2005 done_matching:
2006/* if(translated != NULL) */
2007/* free(translated); */
Guido van Rossumfaf49081997-07-15 01:47:08 +00002008 FREE_STATE(state);
2009 return -1;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00002010
Guido van Rossum004c1e11997-05-09 02:35:58 +00002011 error:
2012/* if (translated != NULL) */
2013/* free(translated); */
Guido van Rossumfaf49081997-07-15 01:47:08 +00002014 FREE_STATE(state);
2015 return -2;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00002016}
Guido van Rossum95e80531997-08-13 22:34:14 +00002017
Guido van Rossumb674c3b1992-01-19 16:32:47 +00002018
2019#undef PREFETCH
2020#undef NEXTCHAR
Guido van Rossumb674c3b1992-01-19 16:32:47 +00002021
Guido van Rossum004c1e11997-05-09 02:35:58 +00002022int re_search(regexp_t bufp,
Guido van Rossum95e80531997-08-13 22:34:14 +00002023 unsigned char *string,
Guido van Rossum004c1e11997-05-09 02:35:58 +00002024 int size,
2025 int pos,
2026 int range,
2027 regexp_registers_t regs)
Guido van Rossumb674c3b1992-01-19 16:32:47 +00002028{
Guido van Rossum95e80531997-08-13 22:34:14 +00002029 unsigned char *fastmap;
2030 unsigned char *translate;
2031 unsigned char *text;
2032 unsigned char *partstart;
2033 unsigned char *partend;
Guido van Rossumfaf49081997-07-15 01:47:08 +00002034 int dir;
2035 int ret;
Guido van Rossum95e80531997-08-13 22:34:14 +00002036 unsigned char anchor;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00002037
Guido van Rossumfaf49081997-07-15 01:47:08 +00002038 assert(size >= 0 && pos >= 0);
2039 assert(pos + range >= 0 && pos + range <= size); /* Bugfix by ylo */
Guido van Rossumb674c3b1992-01-19 16:32:47 +00002040
Guido van Rossumfaf49081997-07-15 01:47:08 +00002041 fastmap = bufp->fastmap;
2042 translate = bufp->translate;
Guido van Rossum95e80531997-08-13 22:34:14 +00002043 if (fastmap && !bufp->fastmap_accurate) {
2044 re_compile_fastmap(bufp);
2045 if (PyErr_Occurred()) return -2;
2046 }
2047
Guido van Rossumfaf49081997-07-15 01:47:08 +00002048 anchor = bufp->anchor;
2049 if (bufp->can_be_null == 1) /* can_be_null == 2: can match null at eob */
2050 fastmap = NULL;
Guido van Rossum004c1e11997-05-09 02:35:58 +00002051
Guido van Rossumfaf49081997-07-15 01:47:08 +00002052 if (range < 0)
2053 {
2054 dir = -1;
2055 range = -range;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00002056 }
Guido van Rossum004c1e11997-05-09 02:35:58 +00002057 else
Guido van Rossumfaf49081997-07-15 01:47:08 +00002058 dir = 1;
2059
2060 if (anchor == 2)
2061 if (pos != 0)
2062 return -1;
2063 else
2064 range = 0;
2065
2066 for (; range >= 0; range--, pos += dir)
2067 {
2068 if (fastmap)
2069 {
2070 if (dir == 1)
2071 { /* searching forwards */
2072
2073 text = string + pos;
2074 partend = string + size;
2075 partstart = text;
2076 if (translate)
2077 while (text != partend &&
2078 !fastmap[(unsigned char) translate[(unsigned char)*text]])
2079 text++;
2080 else
2081 while (text != partend && !fastmap[(unsigned char)*text])
2082 text++;
2083 pos += text - partstart;
2084 range -= text - partstart;
2085 if (pos == size && bufp->can_be_null == 0)
2086 return -1;
2087 }
2088 else
2089 { /* searching backwards */
2090 text = string + pos;
2091 partstart = string + pos - range;
2092 partend = text;
2093 if (translate)
2094 while (text != partstart &&
2095 !fastmap[(unsigned char)
2096 translate[(unsigned char)*text]])
2097 text--;
2098 else
2099 while (text != partstart &&
2100 !fastmap[(unsigned char)*text])
2101 text--;
2102 pos -= partend - text;
2103 range -= partend - text;
2104 }
2105 }
2106 if (anchor == 1)
2107 { /* anchored to begline */
2108 if (pos > 0 && (string[pos - 1] != '\n'))
2109 continue;
2110 }
2111 assert(pos >= 0 && pos <= size);
2112 ret = re_match(bufp, string, size, pos, regs);
2113 if (ret >= 0)
2114 return pos;
2115 if (ret == -2)
2116 return -2;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00002117 }
Guido van Rossumfaf49081997-07-15 01:47:08 +00002118 return -1;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00002119}
Guido van Rossum74fb3031997-07-17 22:41:38 +00002120
2121/*
2122** Local Variables:
2123** mode: c
2124** c-file-style: "python"
2125** End:
2126*/