blob: 0b6ec8b8de6733efd24c46f41804d97ccf042bfa [file] [log] [blame]
Guido van Rossumdb25f321997-07-10 14:31:32 +00001/*
2 * -*- mode: c-mode; c-file-style: python -*-
3 */
4
Guido van Rossum004c1e11997-05-09 02:35:58 +00005/* regexpr.c
6 *
7 * Author: Tatu Ylonen <ylo@ngs.fi>
8 *
9 * Copyright (c) 1991 Tatu Ylonen, Espoo, Finland
10 *
11 * Permission to use, copy, modify, distribute, and sell this software
12 * and its documentation for any purpose is hereby granted without
13 * fee, provided that the above copyright notice appear in all copies.
14 * This software is provided "as is" without express or implied
15 * warranty.
16 *
17 * Created: Thu Sep 26 17:14:05 1991 ylo
18 * Last modified: Mon Nov 4 17:06:48 1991 ylo
19 * Ported to Think C: 19 Jan 1992 guido@cwi.nl
20 *
21 * This code draws many ideas from the regular expression packages by
22 * Henry Spencer of the University of Toronto and Richard Stallman of
23 * the Free Software Foundation.
24 *
25 * Emacs-specific code and syntax table code is almost directly borrowed
26 * from GNU regexp.
27 *
28 * Bugs fixed and lots of reorganization by Jeffrey C. Ollie, April
29 * 1997 Thanks for bug reports and ideas from Andrew Kuchling, Tim
30 * Peters, Guido van Rossum, Ka-Ping Yee, Sjoerd Mullender, and
31 * probably one or two others that I'm forgetting.
32 *
33 * $Id$ */
Guido van Rossumb674c3b1992-01-19 16:32:47 +000034
Guido van Rossum339cfa31996-08-08 19:12:37 +000035#include "config.h" /* For Win* specific redefinition of printf c.s. */
Guido van Rossum339cfa31996-08-08 19:12:37 +000036
Guido van Rossum004c1e11997-05-09 02:35:58 +000037#include "myproto.h" /* For PROTO macro --Guido */
Guido van Rossum3b1a57a1992-01-27 16:47:46 +000038
Guido van Rossumb674c3b1992-01-19 16:32:47 +000039#include <stdio.h>
Guido van Rossum004c1e11997-05-09 02:35:58 +000040
41#ifndef NDEBUG
42#define NDEBUG 1
43#endif
44
Guido van Rossumb674c3b1992-01-19 16:32:47 +000045#include <assert.h>
46#include "regexpr.h"
47
Guido van Rossum3b1a57a1992-01-27 16:47:46 +000048#ifdef THINK_C
49/* Think C on the Mac really needs these headers... --Guido */
50#include <stdlib.h>
51#include <string.h>
52#else
Guido van Rossum88661e81996-05-23 22:55:58 +000053#if defined(__STDC__) || defined(_MSC_VER)
Guido van Rossum9abc5391992-03-27 17:24:37 +000054/* Don't mess around, use the standard headers */
55#include <stdlib.h>
56#include <string.h>
57#else
Guido van Rossumb674c3b1992-01-19 16:32:47 +000058char *malloc();
59void free();
60char *realloc();
Guido van Rossum9abc5391992-03-27 17:24:37 +000061#endif /* __STDC__ */
62#endif /* THINK_C */
Guido van Rossumb674c3b1992-01-19 16:32:47 +000063
Guido van Rossumdb25f321997-07-10 14:31:32 +000064/* The original code blithely assumed that sizeof(short) == 2. Not
65 * always true. Original instances of "(short)x" were replaced by
66 * SHORT(x), where SHORT is #defined below. */
67
68#define SHORT(x) ((x) & 0x8000 ? (x) - 0x10000 : (x))
69
Guido van Rossum004c1e11997-05-09 02:35:58 +000070/* The stack implementation is taken from an idea by Andrew Kuchling.
71 * It's a doubly linked list of arrays. The advantages of this over a
72 * simple linked list are that the number of mallocs required are
73 * reduced. It also makes it possible to statically allocate enough
74 * space so that small patterns don't ever need to call malloc.
75 *
76 * The advantages over a single array is that is periodically
77 * realloced when more space is needed is that we avoid ever copying
78 * the stack. */
79
80/* item_t is the basic stack element. Defined as a union of
81 * structures so that both registers, failure points, and counters can
82 * be pushed/popped from the stack. There's nothing built into the
83 * item to keep track of whether a certain stack item is a register, a
84 * failure point, or a counter. */
85
86typedef union item_t
87{
Guido van Rossumdb25f321997-07-10 14:31:32 +000088 struct
89 {
90 int num;
91 int level;
92 char *start;
93 char *end;
94 } reg;
95 struct
96 {
97 int count;
98 int level;
99 int phantom;
100 char *code;
101 char *text;
102 } fail;
103 struct
104 {
105 int num;
106 int level;
107 int count;
108 } cntr;
Guido van Rossum004c1e11997-05-09 02:35:58 +0000109} item_t;
110
111#define STACK_PAGE_SIZE 256
112#define NUM_REGISTERS 256
113
114/* A 'page' of stack items. */
115
116typedef struct item_page_t
117{
Guido van Rossumdb25f321997-07-10 14:31:32 +0000118 item_t items[STACK_PAGE_SIZE];
119 struct item_page_t *prev;
120 struct item_page_t *next;
Guido van Rossum004c1e11997-05-09 02:35:58 +0000121} item_page_t;
122
123
124typedef struct match_state
125{
Guido van Rossumdb25f321997-07-10 14:31:32 +0000126 /* The number of registers that have been pushed onto the stack
127 * since the last failure point. */
Guido van Rossum004c1e11997-05-09 02:35:58 +0000128
Guido van Rossumdb25f321997-07-10 14:31:32 +0000129 int count;
130
131 /* Used to control when registers need to be pushed onto the
132 * stack. */
133
134 int level;
135
136 /* The number of failure points on the stack. */
137
138 int point;
139
140 /* Storage for the registers. Each register consists of two
141 * pointers to characters. So register N is represented as
142 * start[N] and end[N]. The pointers must be converted to
143 * offsets from the beginning of the string before returning the
144 * registers to the calling program. */
145
146 char *start[NUM_REGISTERS];
147 char *end[NUM_REGISTERS];
148
149 /* Keeps track of whether a register has changed recently. */
150
151 int changed[NUM_REGISTERS];
152
153 /* Structure to encapsulate the stack. */
154 struct
155 {
156 /* index into the curent page. If index == 0 and you need
157 * to pop an item, move to the previous page and set index
158 * = STACK_PAGE_SIZE - 1. Otherwise decrement index to
159 * push a page. If index == STACK_PAGE_SIZE and you need
160 * to push a page move to the next page and set index =
161 * 0. If there is no new next page, allocate a new page
162 * and link it in. Otherwise, increment index to push a
163 * page. */
164
165 int index;
166 item_page_t *current; /* Pointer to the current page. */
167 item_page_t first; /* First page is statically allocated. */
168 } stack;
Guido van Rossum004c1e11997-05-09 02:35:58 +0000169} match_state;
170
Guido van Rossumdb25f321997-07-10 14:31:32 +0000171/* Initialize a state object */
172
173/* #define NEW_STATE(state) \ */
174/* memset(&state, 0, (void *)(&state.stack) - (void *)(&state)); \ */
175/* state.stack.current = &state.stack.first; \ */
176/* state.stack.first.prev = NULL; \ */
177/* state.stack.first.next = NULL; \ */
178/* state.stack.index = 0; \ */
179/* state.level = 1 */
180
181#define NEW_STATE(state, nregs) \
182{ \
183 int i; \
184 for (i = 0; i < nregs; i++) \
185 { \
186 state.start[i] = NULL; \
187 state.end[i] = NULL; \
188 state.changed[i] = 0; \
189 } \
190 state.stack.current = &state.stack.first; \
191 state.stack.first.prev = NULL; \
192 state.stack.first.next = NULL; \
193 state.stack.index = 0; \
194 state.level = 1; \
195 state.count = 0; \
196 state.level = 0; \
197 state.point = 0; \
198}
199
200/* Free any memory that might have been malloc'd */
201
202#define FREE_STATE(state) \
203while(state.stack.first.next != NULL) \
204{ \
205 state.stack.current = state.stack.first.next; \
206 state.stack.first.next = state.stack.current->next; \
207 free(state.stack.current); \
208}
209
Guido van Rossum004c1e11997-05-09 02:35:58 +0000210/* Discard the top 'count' stack items. */
211
212#define STACK_DISCARD(stack, count, on_error) \
213stack.index -= count; \
214while (stack.index < 0) \
215{ \
216 if (stack.current->prev == NULL) \
217 on_error; \
218 stack.current = stack.current->prev; \
219 stack.index += STACK_PAGE_SIZE; \
220}
221
222/* Store a pointer to the previous item on the stack. Used to pop an
223 * item off of the stack. */
224
225#define STACK_PREV(stack, top, on_error) \
226if (stack.index == 0) \
227{ \
228 if (stack.current->prev == NULL) \
229 on_error; \
230 stack.current = stack.current->prev; \
231 stack.index = STACK_PAGE_SIZE - 1; \
232} \
233else \
234 stack.index--; \
235top = &(stack.current->items[stack.index])
236
237/* Store a pointer to the next item on the stack. Used to push an item
238 * on to the stack. */
239
240#define STACK_NEXT(stack, top, on_error) \
241if (stack.index == STACK_PAGE_SIZE) \
242{ \
243 if (stack.current->next == NULL) \
244 { \
Guido van Rossum445efa91997-05-14 15:30:32 +0000245 stack.current->next = (item_page_t *)malloc(sizeof(item_page_t)); \
Guido van Rossum004c1e11997-05-09 02:35:58 +0000246 if (stack.current->next == NULL) \
247 on_error; \
248 stack.current->next->prev = stack.current; \
249 stack.current->next->next = NULL; \
250 } \
251 stack.current = stack.current->next; \
252 stack.index = 0; \
253} \
254top = &(stack.current->items[stack.index++])
255
256/* Store a pointer to the item that is 'count' items back in the
257 * stack. STACK_BACK(stack, top, 1, on_error) is equivalent to
258 * STACK_TOP(stack, top, on_error). */
259
260#define STACK_BACK(stack, top, count, on_error) \
261{ \
262 int index; \
263 item_page_t *current; \
264 current = stack.current; \
265 index = stack.index - (count); \
266 while (index < 0) \
267 { \
268 if (current->prev == NULL) \
269 on_error; \
270 current = current->prev; \
271 index += STACK_PAGE_SIZE; \
272 } \
273 top = &(current->items[index]); \
274}
275
276/* Store a pointer to the top item on the stack. Execute the
277 * 'on_error' code if there are no items on the stack. */
278
279#define STACK_TOP(stack, top, on_error) \
280if (stack.index == 0) \
281{ \
282 if (stack.current->prev == NULL) \
283 on_error; \
284 top = &(stack.current->prev->items[STACK_PAGE_SIZE - 1]); \
285} \
286else \
287 top = &(stack.current->items[stack.index - 1])
288
289/* Test to see if the stack is empty */
290
291#define STACK_EMPTY(stack) ((stack.index == 0) && \
292 (stack.current->prev == NULL))
293
Guido van Rossum004c1e11997-05-09 02:35:58 +0000294/* Return the start of register 'reg' */
295
296#define GET_REG_START(state, reg) (state.start[reg])
297
298/* Return the end of register 'reg' */
299
300#define GET_REG_END(state, reg) (state.end[reg])
301
302/* Set the start of register 'reg'. If the state of the register needs
303 * saving, push it on the stack. */
304
305#define SET_REG_START(state, reg, text, on_error) \
306if(state.changed[reg] < state.level) \
307{ \
308 item_t *item; \
309 STACK_NEXT(state.stack, item, on_error); \
310 item->reg.num = reg; \
311 item->reg.start = state.start[reg]; \
312 item->reg.end = state.end[reg]; \
313 item->reg.level = state.changed[reg]; \
314 state.changed[reg] = state.level; \
315 state.count++; \
316} \
317state.start[reg] = text
318
319/* Set the end of register 'reg'. If the state of the register needs
320 * saving, push it on the stack. */
321
322#define SET_REG_END(state, reg, text, on_error) \
323if(state.changed[reg] < state.level) \
324{ \
325 item_t *item; \
326 STACK_NEXT(state.stack, item, on_error); \
327 item->reg.num = reg; \
328 item->reg.start = state.start[reg]; \
329 item->reg.end = state.end[reg]; \
330 item->reg.level = state.changed[reg]; \
331 state.changed[reg] = state.level; \
332 state.count++; \
333} \
334state.end[reg] = text
335
336#define PUSH_FAILURE(state, xcode, xtext, on_error) \
337{ \
338 item_t *item; \
339 STACK_NEXT(state.stack, item, on_error); \
340 item->fail.code = xcode; \
341 item->fail.text = xtext; \
342 item->fail.count = state.count; \
343 item->fail.level = state.level; \
344 item->fail.phantom = 0; \
345 state.count = 0; \
346 state.level++; \
347 state.point++; \
348}
349
350/* Update the last failure point with a new position in the text. */
351
Guido van Rossum004c1e11997-05-09 02:35:58 +0000352#define UPDATE_FAILURE(state, xtext, on_error) \
353{ \
354 item_t *item; \
355 STACK_BACK(state.stack, item, state.count + 1, on_error); \
356 if (!item->fail.phantom) \
357 { \
358 item_t *item2; \
359 STACK_NEXT(state.stack, item2, on_error); \
360 item2->fail.code = item->fail.code; \
361 item2->fail.text = xtext; \
362 item2->fail.count = state.count; \
363 item2->fail.level = state.level; \
364 item2->fail.phantom = 1; \
365 state.count = 0; \
366 state.level++; \
367 state.point++; \
368 } \
369 else \
370 { \
371 STACK_DISCARD(state.stack, state.count, on_error); \
372 STACK_TOP(state.stack, item, on_error); \
373 item->fail.text = xtext; \
374 state.count = 0; \
375 state.level++; \
376 } \
377}
378
379#define POP_FAILURE(state, xcode, xtext, on_empty, on_error) \
380{ \
381 item_t *item; \
382 do \
383 { \
384 while(state.count > 0) \
385 { \
386 STACK_PREV(state.stack, item, on_error); \
387 state.start[item->reg.num] = item->reg.start; \
388 state.end[item->reg.num] = item->reg.end; \
389 state.changed[item->reg.num] = item->reg.level; \
390 state.count--; \
391 } \
392 STACK_PREV(state.stack, item, on_empty); \
393 xcode = item->fail.code; \
394 xtext = item->fail.text; \
395 state.count = item->fail.count; \
396 state.level = item->fail.level; \
397 state.point--; \
398 } \
399 while (item->fail.text == NULL); \
400}
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000401
402enum regexp_compiled_ops /* opcodes for compiled regexp */
403{
404 Cend, /* end of pattern reached */
405 Cbol, /* beginning of line */
406 Ceol, /* end of line */
407 Cset, /* character set. Followed by 32 bytes of set. */
408 Cexact, /* followed by a byte to match */
409 Canychar, /* matches any character except newline */
410 Cstart_memory, /* set register start addr (followed by reg number) */
411 Cend_memory, /* set register end addr (followed by reg number) */
412 Cmatch_memory, /* match a duplicate of reg contents (regnum follows)*/
413 Cjump, /* followed by two bytes (lsb,msb) of displacement. */
414 Cstar_jump, /* will change to jump/update_failure_jump at runtime */
415 Cfailure_jump, /* jump to addr on failure */
416 Cupdate_failure_jump, /* update topmost failure point and jump */
417 Cdummy_failure_jump, /* push a dummy failure point and jump */
418 Cbegbuf, /* match at beginning of buffer */
419 Cendbuf, /* match at end of buffer */
420 Cwordbeg, /* match at beginning of word */
421 Cwordend, /* match at end of word */
422 Cwordbound, /* match if at word boundary */
423 Cnotwordbound, /* match if not at word boundary */
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000424 Csyntaxspec, /* matches syntax code (1 byte follows) */
Guido van Rossumdb25f321997-07-10 14:31:32 +0000425 Cnotsyntaxspec, /* matches if syntax code does not match (1 byte foll)*/
426 Crepeat1
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000427};
428
429enum regexp_syntax_op /* syntax codes for plain and quoted characters */
430{
431 Rend, /* special code for end of regexp */
432 Rnormal, /* normal character */
433 Ranychar, /* any character except newline */
434 Rquote, /* the quote character */
435 Rbol, /* match beginning of line */
436 Reol, /* match end of line */
437 Roptional, /* match preceding expression optionally */
438 Rstar, /* match preceding expr zero or more times */
439 Rplus, /* match preceding expr one or more times */
440 Ror, /* match either of alternatives */
441 Ropenpar, /* opening parenthesis */
442 Rclosepar, /* closing parenthesis */
443 Rmemory, /* match memory register */
444 Rextended_memory, /* \vnn to match registers 10-99 */
445 Ropenset, /* open set. Internal syntax hard-coded below. */
446 /* the following are gnu extensions to "normal" regexp syntax */
447 Rbegbuf, /* beginning of buffer */
448 Rendbuf, /* end of buffer */
449 Rwordchar, /* word character */
450 Rnotwordchar, /* not word character */
451 Rwordbeg, /* beginning of word */
452 Rwordend, /* end of word */
453 Rwordbound, /* word bound */
454 Rnotwordbound, /* not word bound */
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000455 Rnum_ops
456};
457
458static int re_compile_initialized = 0;
459static int regexp_syntax = 0;
Guido van Rossumb6775db1994-08-01 11:34:53 +0000460int re_syntax = 0; /* Exported copy of regexp_syntax */
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000461static unsigned char regexp_plain_ops[256];
462static unsigned char regexp_quoted_ops[256];
463static unsigned char regexp_precedences[Rnum_ops];
464static int regexp_context_indep_ops;
465static int regexp_ansi_sequences;
466
467#define NUM_LEVELS 5 /* number of precedence levels in use */
468#define MAX_NESTING 100 /* max nesting level of operators */
469
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000470#define SYNTAX(ch) re_syntax_table[(unsigned char)(ch)]
471#define Sword 1
472
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000473static char re_syntax_table[256];
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000474
Guido van Rossum004c1e11997-05-09 02:35:58 +0000475static void re_compile_initialize(void)
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000476{
Guido van Rossum004c1e11997-05-09 02:35:58 +0000477 int a;
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000478
Guido van Rossum004c1e11997-05-09 02:35:58 +0000479 static int syntax_table_inited = 0;
480
481 if (!syntax_table_inited)
482 {
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000483 syntax_table_inited = 1;
484 memset(re_syntax_table, 0, 256);
485 for (a = 'a'; a <= 'z'; a++)
Guido van Rossum004c1e11997-05-09 02:35:58 +0000486 re_syntax_table[a] = Sword;
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000487 for (a = 'A'; a <= 'Z'; a++)
Guido van Rossum004c1e11997-05-09 02:35:58 +0000488 re_syntax_table[a] = Sword;
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000489 for (a = '0'; a <= '9'; a++)
Guido van Rossum004c1e11997-05-09 02:35:58 +0000490 re_syntax_table[a] = Sword;
491 }
492 re_compile_initialized = 1;
493 for (a = 0; a < 256; a++)
494 {
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000495 regexp_plain_ops[a] = Rnormal;
496 regexp_quoted_ops[a] = Rnormal;
Guido van Rossum004c1e11997-05-09 02:35:58 +0000497 }
498 for (a = '0'; a <= '9'; a++)
499 regexp_quoted_ops[a] = Rmemory;
500 regexp_plain_ops['\134'] = Rquote;
501 if (regexp_syntax & RE_NO_BK_PARENS)
502 {
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000503 regexp_plain_ops['('] = Ropenpar;
504 regexp_plain_ops[')'] = Rclosepar;
Guido van Rossum004c1e11997-05-09 02:35:58 +0000505 }
506 else
507 {
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000508 regexp_quoted_ops['('] = Ropenpar;
509 regexp_quoted_ops[')'] = Rclosepar;
Guido van Rossum004c1e11997-05-09 02:35:58 +0000510 }
511 if (regexp_syntax & RE_NO_BK_VBAR)
512 regexp_plain_ops['\174'] = Ror;
513 else
514 regexp_quoted_ops['\174'] = Ror;
515 regexp_plain_ops['*'] = Rstar;
516 if (regexp_syntax & RE_BK_PLUS_QM)
517 {
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000518 regexp_quoted_ops['+'] = Rplus;
519 regexp_quoted_ops['?'] = Roptional;
Guido van Rossum004c1e11997-05-09 02:35:58 +0000520 }
521 else
522 {
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000523 regexp_plain_ops['+'] = Rplus;
524 regexp_plain_ops['?'] = Roptional;
Guido van Rossum004c1e11997-05-09 02:35:58 +0000525 }
526 if (regexp_syntax & RE_NEWLINE_OR)
527 regexp_plain_ops['\n'] = Ror;
528 regexp_plain_ops['\133'] = Ropenset;
529 regexp_plain_ops['\136'] = Rbol;
530 regexp_plain_ops['$'] = Reol;
531 regexp_plain_ops['.'] = Ranychar;
532 if (!(regexp_syntax & RE_NO_GNU_EXTENSIONS))
533 {
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000534 regexp_quoted_ops['w'] = Rwordchar;
535 regexp_quoted_ops['W'] = Rnotwordchar;
536 regexp_quoted_ops['<'] = Rwordbeg;
537 regexp_quoted_ops['>'] = Rwordend;
538 regexp_quoted_ops['b'] = Rwordbound;
539 regexp_quoted_ops['B'] = Rnotwordbound;
540 regexp_quoted_ops['`'] = Rbegbuf;
541 regexp_quoted_ops['\''] = Rendbuf;
Guido van Rossum004c1e11997-05-09 02:35:58 +0000542 }
543 if (regexp_syntax & RE_ANSI_HEX)
544 regexp_quoted_ops['v'] = Rextended_memory;
545 for (a = 0; a < Rnum_ops; a++)
546 regexp_precedences[a] = 4;
547 if (regexp_syntax & RE_TIGHT_VBAR)
548 {
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000549 regexp_precedences[Ror] = 3;
550 regexp_precedences[Rbol] = 2;
551 regexp_precedences[Reol] = 2;
Guido van Rossum004c1e11997-05-09 02:35:58 +0000552 }
553 else
554 {
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000555 regexp_precedences[Ror] = 2;
556 regexp_precedences[Rbol] = 3;
557 regexp_precedences[Reol] = 3;
Guido van Rossum004c1e11997-05-09 02:35:58 +0000558 }
559 regexp_precedences[Rclosepar] = 1;
560 regexp_precedences[Rend] = 0;
561 regexp_context_indep_ops = (regexp_syntax & RE_CONTEXT_INDEP_OPS) != 0;
562 regexp_ansi_sequences = (regexp_syntax & RE_ANSI_HEX) != 0;
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000563}
564
Guido van Rossum004c1e11997-05-09 02:35:58 +0000565int re_set_syntax(int syntax)
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000566{
Guido van Rossum004c1e11997-05-09 02:35:58 +0000567 int ret;
568
569 ret = regexp_syntax;
570 regexp_syntax = syntax;
571 re_syntax = syntax; /* Exported copy */
572 re_compile_initialize();
573 return ret;
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000574}
575
Guido van Rossum004c1e11997-05-09 02:35:58 +0000576static int hex_char_to_decimal(int ch)
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000577{
Guido van Rossum004c1e11997-05-09 02:35:58 +0000578 if (ch >= '0' && ch <= '9')
579 return ch - '0';
580 if (ch >= 'a' && ch <= 'f')
581 return ch - 'a' + 10;
582 if (ch >= 'A' && ch <= 'F')
583 return ch - 'A' + 10;
584 return 16;
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000585}
586
Guido van Rossum004c1e11997-05-09 02:35:58 +0000587static void re_compile_fastmap_aux(char *code,
588 int pos,
589 char *visited,
590 char *can_be_null,
591 char *fastmap)
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000592{
Guido van Rossum004c1e11997-05-09 02:35:58 +0000593 int a;
594 int b;
595 int syntaxcode;
596
597 if (visited[pos])
598 return; /* we have already been here */
599 visited[pos] = 1;
600 for (;;)
601 switch (code[pos++])
602 {
603 case Cend:
604 {
605 *can_be_null = 1;
606 return;
607 }
608 case Cbol:
609 case Cbegbuf:
610 case Cendbuf:
611 case Cwordbeg:
612 case Cwordend:
613 case Cwordbound:
614 case Cnotwordbound:
615 {
Guido van Rossumdb25f321997-07-10 14:31:32 +0000616 for (a = 0; a < 256; a++)
617 fastmap[a] = 1;
Guido van Rossum004c1e11997-05-09 02:35:58 +0000618 break;
619 }
620 case Csyntaxspec:
621 {
622 syntaxcode = code[pos++];
623 for (a = 0; a < 256; a++)
624 if (SYNTAX(a) == syntaxcode)
625 fastmap[a] = 1;
626 return;
627 }
628 case Cnotsyntaxspec:
629 {
630 syntaxcode = code[pos++];
631 for (a = 0; a < 256; a++)
632 if (SYNTAX(a) != syntaxcode)
633 fastmap[a] = 1;
634 return;
635 }
636 case Ceol:
637 {
638 fastmap['\n'] = 1;
639 if (*can_be_null == 0)
640 *can_be_null = 2; /* can match null, but only at end of buffer*/
641 return;
642 }
643 case Cset:
644 {
645 for (a = 0; a < 256/8; a++)
646 if (code[pos + a] != 0)
647 for (b = 0; b < 8; b++)
648 if (code[pos + a] & (1 << b))
649 fastmap[(a << 3) + b] = 1;
650 pos += 256/8;
651 return;
652 }
653 case Cexact:
654 {
655 fastmap[(unsigned char)code[pos]] = 1;
656 return;
657 }
658 case Canychar:
659 {
660 for (a = 0; a < 256; a++)
661 if (a != '\n')
662 fastmap[a] = 1;
663 return;
664 }
665 case Cstart_memory:
666 case Cend_memory:
667 {
668 pos++;
669 break;
670 }
671 case Cmatch_memory:
672 {
673 for (a = 0; a < 256; a++)
674 fastmap[a] = 1;
675 *can_be_null = 1;
676 return;
677 }
678 case Cjump:
679 case Cdummy_failure_jump:
680 case Cupdate_failure_jump:
681 case Cstar_jump:
682 {
683 a = (unsigned char)code[pos++];
684 a |= (unsigned char)code[pos++] << 8;
Guido van Rossumdb25f321997-07-10 14:31:32 +0000685 pos += (int)SHORT(a);
Guido van Rossum004c1e11997-05-09 02:35:58 +0000686 if (visited[pos])
687 {
688 /* argh... the regexp contains empty loops. This is not
689 good, as this may cause a failure stack overflow when
690 matching. Oh well. */
691 /* this path leads nowhere; pursue other paths. */
692 return;
693 }
694 visited[pos] = 1;
695 break;
696 }
697 case Cfailure_jump:
698 {
699 a = (unsigned char)code[pos++];
700 a |= (unsigned char)code[pos++] << 8;
Guido van Rossumdb25f321997-07-10 14:31:32 +0000701 a = pos + (int)SHORT(a);
Guido van Rossum004c1e11997-05-09 02:35:58 +0000702 re_compile_fastmap_aux(code, a, visited, can_be_null, fastmap);
703 break;
704 }
Guido van Rossumdb25f321997-07-10 14:31:32 +0000705 case Crepeat1:
706 {
707 pos += 2;
708 break;
709 }
Guido van Rossum004c1e11997-05-09 02:35:58 +0000710 default:
711 {
712 abort(); /* probably some opcode is missing from this switch */
713 /*NOTREACHED*/
714 }
715 }
716}
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000717
Guido van Rossum004c1e11997-05-09 02:35:58 +0000718static int re_do_compile_fastmap(char *buffer,
719 int used,
720 int pos,
721 char *can_be_null,
722 char *fastmap)
723{
724 char small_visited[512], *visited;
725
726 if (used <= sizeof(small_visited))
727 visited = small_visited;
728 else
729 {
730 visited = malloc(used);
731 if (!visited)
732 return 0;
733 }
734 *can_be_null = 0;
735 memset(fastmap, 0, 256);
736 memset(visited, 0, used);
737 re_compile_fastmap_aux(buffer, pos, visited, can_be_null, fastmap);
738 if (visited != small_visited)
739 free(visited);
740 return 1;
741}
Guido van Rossumb674c3b1992-01-19 16:32:47 +0000742
Guido van Rossum004c1e11997-05-09 02:35:58 +0000743void re_compile_fastmap(regexp_t bufp)
744{
745 if (!bufp->fastmap || bufp->fastmap_accurate)
746 return;
747 assert(bufp->used > 0);
748 if (!re_do_compile_fastmap(bufp->buffer,
749 bufp->used,
750 0,
751 &bufp->can_be_null,
752 bufp->fastmap))
753 return;
754 if (bufp->buffer[0] == Cbol)
755 bufp->anchor = 1; /* begline */
756 else
757 if (bufp->buffer[0] == Cbegbuf)
758 bufp->anchor = 2; /* begbuf */
759 else
760 bufp->anchor = 0; /* none */
761 bufp->fastmap_accurate = 1;
762}
763
764/*
765 * star is coded as:
766 * 1: failure_jump 2
767 * ... code for operand of star
768 * star_jump 1
769 * 2: ... code after star
770 *
771 * We change the star_jump to update_failure_jump if we can determine
772 * that it is safe to do so; otherwise we change it to an ordinary
773 * jump.
774 *
775 * plus is coded as
776 *
777 * jump 2
778 * 1: failure_jump 3
779 * 2: ... code for operand of plus
780 * star_jump 1
781 * 3: ... code after plus
782 *
783 * For star_jump considerations this is processed identically to star.
784 *
785 */
786
787static int re_optimize_star_jump(regexp_t bufp, char *code)
788{
789 char map[256];
790 char can_be_null;
791 char *p1;
792 char *p2;
793 char ch;
794 int a;
795 int b;
Guido van Rossumdb25f321997-07-10 14:31:32 +0000796 int num_instructions = 0;
Guido van Rossum004c1e11997-05-09 02:35:58 +0000797
798 a = (unsigned char)*code++;
799 a |= (unsigned char)*code++ << 8;
Guido van Rossumdb25f321997-07-10 14:31:32 +0000800 a = (int)SHORT(a);
Guido van Rossum004c1e11997-05-09 02:35:58 +0000801
802 p1 = code + a + 3; /* skip the failure_jump */
803 assert(p1[-3] == Cfailure_jump);
804 p2 = code;
805 /* p1 points inside loop, p2 points to after loop */
806 if (!re_do_compile_fastmap(bufp->buffer, bufp->used,
807 p2 - bufp->buffer, &can_be_null, map))
808 goto make_normal_jump;
809
810 /* If we might introduce a new update point inside the
811 * loop, we can't optimize because then update_jump would
812 * update a wrong failure point. Thus we have to be
813 * quite careful here.
814 */
815
816 /* loop until we find something that consumes a character */
817 loop_p1:
Guido van Rossumdb25f321997-07-10 14:31:32 +0000818 num_instructions++;
Guido van Rossum004c1e11997-05-09 02:35:58 +0000819 switch (*p1++)
820 {
821 case Cbol:
822 case Ceol:
823 case Cbegbuf:
824 case Cendbuf:
825 case Cwordbeg:
826 case Cwordend:
827 case Cwordbound:
828 case Cnotwordbound:
829 {
830 goto loop_p1;
831 }
832 case Cstart_memory:
833 case Cend_memory:
834 {
835 p1++;
836 goto loop_p1;
837 }
838 case Cexact:
839 {
840 ch = (unsigned char)*p1++;
Guido van Rossum4917d931997-05-13 17:53:34 +0000841 if (map[(int)ch])
Guido van Rossum004c1e11997-05-09 02:35:58 +0000842 goto make_normal_jump;
843 break;
844 }
845 case Canychar:
846 {
847 for (b = 0; b < 256; b++)
848 if (b != '\n' && map[b])
849 goto make_normal_jump;
850 break;
851 }
852 case Cset:
853 {
854 for (b = 0; b < 256; b++)
855 if ((p1[b >> 3] & (1 << (b & 7))) && map[b])
856 goto make_normal_jump;
857 p1 += 256/8;
858 break;
859 }
860 default:
861 {
862 goto make_normal_jump;
863 }
864 }
865 /* now we know that we can't backtrack. */
866 while (p1 != p2 - 3)
867 {
Guido van Rossumdb25f321997-07-10 14:31:32 +0000868 num_instructions++;
Guido van Rossum004c1e11997-05-09 02:35:58 +0000869 switch (*p1++)
870 {
871 case Cend:
872 {
873 return 0;
874 }
875 case Cbol:
876 case Ceol:
877 case Canychar:
878 case Cbegbuf:
879 case Cendbuf:
880 case Cwordbeg:
881 case Cwordend:
882 case Cwordbound:
883 case Cnotwordbound:
884 {
885 break;
886 }
887 case Cset:
888 {
889 p1 += 256/8;
890 break;
891 }
892 case Cexact:
893 case Cstart_memory:
894 case Cend_memory:
895 case Cmatch_memory:
896 case Csyntaxspec:
897 case Cnotsyntaxspec:
898 {
899 p1++;
900 break;
901 }
902 case Cjump:
903 case Cstar_jump:
904 case Cfailure_jump:
905 case Cupdate_failure_jump:
906 case Cdummy_failure_jump:
907 {
908 goto make_normal_jump;
909 }
910 default:
911 {
912 return 0;
913 break;
914 }
915 }
916 }
917
Guido van Rossumdb25f321997-07-10 14:31:32 +0000918 make_update_jump:
Guido van Rossum004c1e11997-05-09 02:35:58 +0000919 code -= 3;
920 a += 3; /* jump to after the Cfailure_jump */
921 code[0] = Cupdate_failure_jump;
922 code[1] = a & 0xff;
923 code[2] = a >> 8;
Guido van Rossumdb25f321997-07-10 14:31:32 +0000924 if (num_instructions > 1)
925 return 1;
926 assert(num_instructions == 1);
927 /* if the only instruction matches a single character, we can do
928 * better
929 */
930 p1 = code + 3 + a; /* start of sole instruction */
931 if (*p1 == Cset || *p1 == Cexact || *p1 == Canychar ||
932 *p1 == Csyntaxspec || *p1 == Cnotsyntaxspec)
933 code[0] = Crepeat1;
Guido van Rossum004c1e11997-05-09 02:35:58 +0000934 return 1;
935
936 make_normal_jump:
937 code -= 3;
938 *code = Cjump;
939 return 1;
940}
941
942static int re_optimize(regexp_t bufp)
943{
944 char *code;
945
946 code = bufp->buffer;
947
948 while(1)
949 {
950 switch (*code++)
951 {
952 case Cend:
953 {
954 return 1;
955 }
956 case Canychar:
957 case Cbol:
958 case Ceol:
959 case Cbegbuf:
960 case Cendbuf:
961 case Cwordbeg:
962 case Cwordend:
963 case Cwordbound:
964 case Cnotwordbound:
965 {
966 break;
967 }
968 case Cset:
969 {
970 code += 256/8;
971 break;
972 }
973 case Cexact:
974 case Cstart_memory:
975 case Cend_memory:
976 case Cmatch_memory:
977 case Csyntaxspec:
978 case Cnotsyntaxspec:
979 {
980 code++;
981 break;
982 }
983 case Cstar_jump:
984 {
985 if (!re_optimize_star_jump(bufp, code))
986 {
987 return 0;
988 }
989 /* fall through */
990 }
991 case Cupdate_failure_jump:
992 case Cjump:
993 case Cdummy_failure_jump:
994 case Cfailure_jump:
Guido van Rossumdb25f321997-07-10 14:31:32 +0000995 case Crepeat1:
Guido van Rossum004c1e11997-05-09 02:35:58 +0000996 {
997 code += 2;
998 break;
999 }
1000 default:
1001 {
1002 return 0;
1003 }
1004 }
1005 }
1006}
1007
1008#define NEXTCHAR(var) \
1009{ \
1010 if (pos >= size) \
1011 goto ends_prematurely; \
1012 (var) = regex[pos]; \
1013 pos++; \
1014}
1015
1016#define ALLOC(amount) \
1017{ \
1018 if (pattern_offset+(amount) > alloc) \
1019 { \
1020 alloc += 256 + (amount); \
1021 pattern = realloc(pattern, alloc); \
1022 if (!pattern) \
1023 goto out_of_memory; \
1024 } \
1025}
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001026
1027#define STORE(ch) pattern[pattern_offset++] = (ch)
1028
1029#define CURRENT_LEVEL_START (starts[starts_base + current_level])
1030
1031#define SET_LEVEL_START starts[starts_base + current_level] = pattern_offset
1032
Guido van Rossum004c1e11997-05-09 02:35:58 +00001033#define PUSH_LEVEL_STARTS \
1034 if (starts_base < (MAX_NESTING-1)*NUM_LEVELS) \
1035 starts_base += NUM_LEVELS; \
1036 else \
1037 goto too_complex
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001038
1039#define POP_LEVEL_STARTS starts_base -= NUM_LEVELS
1040
Guido van Rossum004c1e11997-05-09 02:35:58 +00001041#define PUT_ADDR(offset,addr) \
1042{ \
1043 int disp = (addr) - (offset) - 2; \
1044 pattern[(offset)] = disp & 0xff; \
1045 pattern[(offset)+1] = (disp>>8) & 0xff; \
1046}
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001047
Guido van Rossum004c1e11997-05-09 02:35:58 +00001048#define INSERT_JUMP(pos,type,addr) \
1049{ \
1050 int a, p = (pos), t = (type), ad = (addr); \
1051 for (a = pattern_offset - 1; a >= p; a--) \
1052 pattern[a + 3] = pattern[a]; \
1053 pattern[p] = t; \
1054 PUT_ADDR(p+1,ad); \
1055 pattern_offset += 3; \
1056}
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001057#define SETBIT(buf,offset,bit) (buf)[(offset)+(bit)/8] |= (1<<((bit) & 7))
1058
Guido van Rossum004c1e11997-05-09 02:35:58 +00001059#define SET_FIELDS \
1060{ \
1061 bufp->allocated = alloc; \
1062 bufp->buffer = pattern; \
1063 bufp->used = pattern_offset; \
1064}
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001065
Guido van Rossum004c1e11997-05-09 02:35:58 +00001066#define GETHEX(var) \
1067{ \
1068 char gethex_ch, gethex_value; \
1069 NEXTCHAR(gethex_ch); \
1070 gethex_value = hex_char_to_decimal(gethex_ch); \
1071 if (gethex_value == 16) \
1072 goto hex_error; \
1073 NEXTCHAR(gethex_ch); \
1074 gethex_ch = hex_char_to_decimal(gethex_ch); \
1075 if (gethex_ch == 16) \
1076 goto hex_error; \
1077 (var) = gethex_value * 16 + gethex_ch; \
1078}
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001079
Guido van Rossum004c1e11997-05-09 02:35:58 +00001080#define ANSI_TRANSLATE(ch) \
1081{ \
1082 switch (ch) \
1083 { \
1084 case 'a': \
1085 case 'A': \
1086 { \
1087 ch = 7; /* audible bell */ \
1088 break; \
1089 } \
1090 case 'b': \
1091 case 'B': \
1092 { \
1093 ch = 8; /* backspace */ \
1094 break; \
1095 } \
1096 case 'f': \
1097 case 'F': \
1098 { \
1099 ch = 12; /* form feed */ \
1100 break; \
1101 } \
1102 case 'n': \
1103 case 'N': \
1104 { \
1105 ch = 10; /* line feed */ \
1106 break; \
1107 } \
1108 case 'r': \
1109 case 'R': \
1110 { \
1111 ch = 13; /* carriage return */ \
1112 break; \
1113 } \
1114 case 't': \
1115 case 'T': \
1116 { \
1117 ch = 9; /* tab */ \
1118 break; \
1119 } \
1120 case 'v': \
1121 case 'V': \
1122 { \
1123 ch = 11; /* vertical tab */ \
1124 break; \
1125 } \
1126 case 'x': /* hex code */ \
1127 case 'X': \
1128 { \
1129 GETHEX(ch); \
1130 break; \
1131 } \
1132 default: \
1133 { \
1134 /* other characters passed through */ \
1135 if (translate) \
1136 ch = translate[(unsigned char)ch]; \
1137 break; \
1138 } \
1139 } \
1140}
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001141
Guido van Rossum004c1e11997-05-09 02:35:58 +00001142char *re_compile_pattern(char *regex, int size, regexp_t bufp)
1143{
1144 int a;
1145 int pos;
1146 int op;
1147 int current_level;
1148 int level;
1149 int opcode;
Guido van Rossum4917d931997-05-13 17:53:34 +00001150 int pattern_offset = 0, alloc;
Guido van Rossum004c1e11997-05-09 02:35:58 +00001151 int starts[NUM_LEVELS * MAX_NESTING];
1152 int starts_base;
1153 int future_jumps[MAX_NESTING];
1154 int num_jumps;
Guido van Rossum4917d931997-05-13 17:53:34 +00001155 unsigned char ch = '\0';
Guido van Rossum004c1e11997-05-09 02:35:58 +00001156 char *pattern;
1157 char *translate;
1158 int next_register;
1159 int paren_depth;
1160 int num_open_registers;
1161 int open_registers[RE_NREGS];
1162 int beginning_context;
1163
1164 if (!re_compile_initialized)
1165 re_compile_initialize();
1166 bufp->used = 0;
1167 bufp->fastmap_accurate = 0;
Guido van Rossumdb25f321997-07-10 14:31:32 +00001168 bufp->uses_registers = 1;
1169 bufp->num_registers = 1;
Guido van Rossum004c1e11997-05-09 02:35:58 +00001170 translate = bufp->translate;
1171 pattern = bufp->buffer;
1172 alloc = bufp->allocated;
1173 if (alloc == 0 || pattern == NULL)
1174 {
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001175 alloc = 256;
1176 pattern = malloc(alloc);
1177 if (!pattern)
Guido van Rossum004c1e11997-05-09 02:35:58 +00001178 goto out_of_memory;
1179 }
1180 pattern_offset = 0;
1181 starts_base = 0;
1182 num_jumps = 0;
1183 current_level = 0;
1184 SET_LEVEL_START;
1185 num_open_registers = 0;
1186 next_register = 1;
1187 paren_depth = 0;
1188 beginning_context = 1;
1189 op = -1;
1190 /* we use Rend dummy to ensure that pending jumps are updated (due to
1191 low priority of Rend) before exiting the loop. */
1192 pos = 0;
1193 while (op != Rend)
1194 {
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001195 if (pos >= size)
Guido van Rossum004c1e11997-05-09 02:35:58 +00001196 op = Rend;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001197 else
Guido van Rossum004c1e11997-05-09 02:35:58 +00001198 {
1199 NEXTCHAR(ch);
1200 if (translate)
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001201 ch = translate[(unsigned char)ch];
Guido van Rossum004c1e11997-05-09 02:35:58 +00001202 op = regexp_plain_ops[(unsigned char)ch];
1203 if (op == Rquote)
1204 {
1205 NEXTCHAR(ch);
1206 op = regexp_quoted_ops[(unsigned char)ch];
1207 if (op == Rnormal && regexp_ansi_sequences)
1208 ANSI_TRANSLATE(ch);
1209 }
1210 }
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001211 level = regexp_precedences[op];
1212 /* printf("ch='%c' op=%d level=%d current_level=%d curlevstart=%d\n",
Guido van Rossum004c1e11997-05-09 02:35:58 +00001213 ch, op, level, current_level, CURRENT_LEVEL_START); */
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001214 if (level > current_level)
Guido van Rossum004c1e11997-05-09 02:35:58 +00001215 {
1216 for (current_level++; current_level < level; current_level++)
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001217 SET_LEVEL_START;
Guido van Rossum004c1e11997-05-09 02:35:58 +00001218 SET_LEVEL_START;
1219 }
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001220 else
Guido van Rossum004c1e11997-05-09 02:35:58 +00001221 if (level < current_level)
1222 {
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001223 current_level = level;
1224 for (;num_jumps > 0 &&
Guido van Rossum004c1e11997-05-09 02:35:58 +00001225 future_jumps[num_jumps-1] >= CURRENT_LEVEL_START;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001226 num_jumps--)
Guido van Rossum004c1e11997-05-09 02:35:58 +00001227 PUT_ADDR(future_jumps[num_jumps-1], pattern_offset);
1228 }
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001229 switch (op)
Guido van Rossum004c1e11997-05-09 02:35:58 +00001230 {
1231 case Rend:
1232 {
1233 break;
1234 }
1235 case Rnormal:
1236 {
1237 normal_char:
1238 opcode = Cexact;
1239 store_opcode_and_arg: /* opcode & ch must be set */
1240 SET_LEVEL_START;
1241 ALLOC(2);
1242 STORE(opcode);
1243 STORE(ch);
1244 break;
1245 }
1246 case Ranychar:
1247 {
1248 opcode = Canychar;
1249 store_opcode:
1250 SET_LEVEL_START;
1251 ALLOC(1);
1252 STORE(opcode);
1253 break;
1254 }
1255 case Rquote:
1256 {
1257 abort();
1258 /*NOTREACHED*/
1259 }
1260 case Rbol:
1261 {
1262 if (!beginning_context)
1263 if (regexp_context_indep_ops)
1264 goto op_error;
1265 else
1266 goto normal_char;
1267 opcode = Cbol;
1268 goto store_opcode;
1269 }
1270 case Reol:
1271 {
1272 if (!((pos >= size) ||
1273 ((regexp_syntax & RE_NO_BK_VBAR) ?
1274 (regex[pos] == '\174') :
1275 (pos+1 < size && regex[pos] == '\134' &&
1276 regex[pos+1] == '\174')) ||
1277 ((regexp_syntax & RE_NO_BK_PARENS)?
1278 (regex[pos] == ')'):
1279 (pos+1 < size && regex[pos] == '\134' &&
1280 regex[pos+1] == ')'))))
1281 if (regexp_context_indep_ops)
1282 goto op_error;
1283 else
1284 goto normal_char;
1285 opcode = Ceol;
1286 goto store_opcode;
1287 /* NOTREACHED */
1288 break;
1289 }
1290 case Roptional:
1291 {
1292 if (beginning_context)
1293 if (regexp_context_indep_ops)
1294 goto op_error;
1295 else
1296 goto normal_char;
1297 if (CURRENT_LEVEL_START == pattern_offset)
1298 break; /* ignore empty patterns for ? */
1299 ALLOC(3);
1300 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump,
1301 pattern_offset + 3);
1302 break;
1303 }
1304 case Rstar:
1305 case Rplus:
1306 {
1307 if (beginning_context)
1308 if (regexp_context_indep_ops)
1309 goto op_error;
1310 else
1311 goto normal_char;
1312 if (CURRENT_LEVEL_START == pattern_offset)
1313 break; /* ignore empty patterns for + and * */
1314 ALLOC(9);
1315 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump,
1316 pattern_offset + 6);
1317 INSERT_JUMP(pattern_offset, Cstar_jump, CURRENT_LEVEL_START);
1318 if (op == Rplus) /* jump over initial failure_jump */
1319 INSERT_JUMP(CURRENT_LEVEL_START, Cdummy_failure_jump,
1320 CURRENT_LEVEL_START + 6);
1321 break;
1322 }
1323 case Ror:
1324 {
1325 ALLOC(6);
1326 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump,
1327 pattern_offset + 6);
1328 if (num_jumps >= MAX_NESTING)
1329 goto too_complex;
1330 STORE(Cjump);
1331 future_jumps[num_jumps++] = pattern_offset;
1332 STORE(0);
1333 STORE(0);
1334 SET_LEVEL_START;
1335 break;
1336 }
1337 case Ropenpar:
1338 {
1339 SET_LEVEL_START;
1340 if (next_register < RE_NREGS)
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001341 {
Guido van Rossum004c1e11997-05-09 02:35:58 +00001342 bufp->uses_registers = 1;
1343 ALLOC(2);
1344 STORE(Cstart_memory);
1345 STORE(next_register);
1346 open_registers[num_open_registers++] = next_register;
Guido van Rossumdb25f321997-07-10 14:31:32 +00001347 bufp->num_registers++;
Guido van Rossum004c1e11997-05-09 02:35:58 +00001348 next_register++;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001349 }
Guido van Rossum004c1e11997-05-09 02:35:58 +00001350 paren_depth++;
1351 PUSH_LEVEL_STARTS;
1352 current_level = 0;
1353 SET_LEVEL_START;
1354 break;
1355 }
1356 case Rclosepar:
1357 {
1358 if (paren_depth <= 0)
1359 goto parenthesis_error;
1360 POP_LEVEL_STARTS;
1361 current_level = regexp_precedences[Ropenpar];
1362 paren_depth--;
1363 if (paren_depth < num_open_registers)
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001364 {
Guido van Rossum004c1e11997-05-09 02:35:58 +00001365 bufp->uses_registers = 1;
1366 ALLOC(2);
1367 STORE(Cend_memory);
1368 num_open_registers--;
1369 STORE(open_registers[num_open_registers]);
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001370 }
Guido van Rossum004c1e11997-05-09 02:35:58 +00001371 break;
1372 }
1373 case Rmemory:
1374 {
1375 if (ch == '0')
1376 goto bad_match_register;
1377 assert(ch >= '0' && ch <= '9');
1378 bufp->uses_registers = 1;
1379 opcode = Cmatch_memory;
1380 ch -= '0';
1381 goto store_opcode_and_arg;
1382 }
1383 case Rextended_memory:
1384 {
1385 NEXTCHAR(ch);
1386 if (ch < '0' || ch > '9')
1387 goto bad_match_register;
1388 NEXTCHAR(a);
1389 if (a < '0' || a > '9')
1390 goto bad_match_register;
1391 ch = 10 * (a - '0') + ch - '0';
1392 if (ch <= 0 || ch >= RE_NREGS)
1393 goto bad_match_register;
1394 bufp->uses_registers = 1;
1395 opcode = Cmatch_memory;
1396 goto store_opcode_and_arg;
1397 }
1398 case Ropenset:
1399 {
1400 int complement;
1401 int prev;
1402 int offset;
1403 int range;
1404 int firstchar;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001405
1406 SET_LEVEL_START;
1407 ALLOC(1+256/8);
1408 STORE(Cset);
1409 offset = pattern_offset;
1410 for (a = 0; a < 256/8; a++)
Guido van Rossum004c1e11997-05-09 02:35:58 +00001411 STORE(0);
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001412 NEXTCHAR(ch);
1413 if (translate)
Guido van Rossum004c1e11997-05-09 02:35:58 +00001414 ch = translate[(unsigned char)ch];
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001415 if (ch == '\136')
Guido van Rossum004c1e11997-05-09 02:35:58 +00001416 {
1417 complement = 1;
1418 NEXTCHAR(ch);
1419 if (translate)
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001420 ch = translate[(unsigned char)ch];
Guido van Rossum004c1e11997-05-09 02:35:58 +00001421 }
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001422 else
Guido van Rossum004c1e11997-05-09 02:35:58 +00001423 complement = 0;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001424 prev = -1;
1425 range = 0;
1426 firstchar = 1;
1427 while (ch != '\135' || firstchar)
Guido van Rossum004c1e11997-05-09 02:35:58 +00001428 {
1429 firstchar = 0;
1430 if (regexp_ansi_sequences && ch == '\134')
1431 {
1432 NEXTCHAR(ch);
1433 ANSI_TRANSLATE(ch);
1434 }
1435 if (range)
1436 {
1437 for (a = prev; a <= (int)ch; a++)
1438 SETBIT(pattern, offset, a);
1439 prev = -1;
1440 range = 0;
1441 }
1442 else
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001443 if (prev != -1 && ch == '-')
Guido van Rossum004c1e11997-05-09 02:35:58 +00001444 range = 1;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001445 else
Guido van Rossum004c1e11997-05-09 02:35:58 +00001446 {
1447 SETBIT(pattern, offset, ch);
1448 prev = ch;
1449 }
1450 NEXTCHAR(ch);
1451 if (translate)
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001452 ch = translate[(unsigned char)ch];
Guido van Rossum004c1e11997-05-09 02:35:58 +00001453 }
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001454 if (range)
Guido van Rossum004c1e11997-05-09 02:35:58 +00001455 SETBIT(pattern, offset, '-');
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001456 if (complement)
Guido van Rossum004c1e11997-05-09 02:35:58 +00001457 {
1458 for (a = 0; a < 256/8; a++)
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001459 pattern[offset+a] ^= 0xff;
Guido van Rossum004c1e11997-05-09 02:35:58 +00001460 }
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001461 break;
Guido van Rossum004c1e11997-05-09 02:35:58 +00001462 }
1463 case Rbegbuf:
1464 {
1465 opcode = Cbegbuf;
1466 goto store_opcode;
1467 }
1468 case Rendbuf:
1469 {
1470 opcode = Cendbuf;
1471 goto store_opcode;
1472 }
1473 case Rwordchar:
1474 {
1475 opcode = Csyntaxspec;
1476 ch = Sword;
1477 goto store_opcode_and_arg;
1478 }
1479 case Rnotwordchar:
1480 {
1481 opcode = Cnotsyntaxspec;
1482 ch = Sword;
1483 goto store_opcode_and_arg;
1484 }
1485 case Rwordbeg:
1486 {
1487 opcode = Cwordbeg;
1488 goto store_opcode;
1489 }
1490 case Rwordend:
1491 {
1492 opcode = Cwordend;
1493 goto store_opcode;
1494 }
1495 case Rwordbound:
1496 {
1497 opcode = Cwordbound;
1498 goto store_opcode;
1499 }
1500 case Rnotwordbound:
1501 {
1502 opcode = Cnotwordbound;
1503 goto store_opcode;
1504 }
1505 default:
1506 {
1507 abort();
1508 }
1509 }
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001510 beginning_context = (op == Ropenpar || op == Ror);
Guido van Rossum004c1e11997-05-09 02:35:58 +00001511 }
1512 if (starts_base != 0)
1513 goto parenthesis_error;
1514 assert(num_jumps == 0);
1515 ALLOC(1);
1516 STORE(Cend);
1517 SET_FIELDS;
1518 if(!re_optimize(bufp))
1519 return "Optimization error";
1520 return NULL;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001521
Guido van Rossum004c1e11997-05-09 02:35:58 +00001522 op_error:
1523 SET_FIELDS;
1524 return "Badly placed special character";
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001525
Guido van Rossum004c1e11997-05-09 02:35:58 +00001526 bad_match_register:
1527 SET_FIELDS;
1528 return "Bad match register number";
1529
1530 hex_error:
1531 SET_FIELDS;
1532 return "Bad hexadecimal number";
1533
1534 parenthesis_error:
1535 SET_FIELDS;
1536 return "Badly placed parenthesis";
1537
1538 out_of_memory:
1539 SET_FIELDS;
1540 return "Out of memory";
1541
1542 ends_prematurely:
1543 SET_FIELDS;
1544 return "Regular expression ends prematurely";
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001545
Guido van Rossum004c1e11997-05-09 02:35:58 +00001546 too_complex:
1547 SET_FIELDS;
1548 return "Regular expression too complex";
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001549}
Guido van Rossum004c1e11997-05-09 02:35:58 +00001550
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001551#undef CHARAT
1552#undef NEXTCHAR
1553#undef GETHEX
1554#undef ALLOC
1555#undef STORE
1556#undef CURRENT_LEVEL_START
1557#undef SET_LEVEL_START
1558#undef PUSH_LEVEL_STARTS
1559#undef POP_LEVEL_STARTS
1560#undef PUT_ADDR
1561#undef INSERT_JUMP
1562#undef SETBIT
1563#undef SET_FIELDS
1564
Guido van Rossum004c1e11997-05-09 02:35:58 +00001565#define PREFETCH if (text == textend) goto fail
1566
1567#define NEXTCHAR(var) \
1568PREFETCH; \
1569var = (unsigned char)*text++; \
1570if (translate) \
1571 var = translate[var]
1572
1573int re_match(regexp_t bufp,
1574 char *string,
1575 int size,
1576 int pos,
1577 regexp_registers_t old_regs)
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001578{
Guido van Rossum004c1e11997-05-09 02:35:58 +00001579 char *code;
1580 char *translate;
1581 char *text;
1582 char *textstart;
1583 char *textend;
1584 int a;
1585 int b;
1586 int ch;
1587 int reg;
1588 int match_end;
1589 char *regstart;
1590 char *regend;
1591 int regsize;
1592 match_state state;
1593
1594 assert(pos >= 0 && size >= 0);
1595 assert(pos <= size);
1596
1597 text = string + pos;
1598 textstart = string;
1599 textend = string + size;
1600
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001601 code = bufp->buffer;
Guido van Rossum004c1e11997-05-09 02:35:58 +00001602
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001603 translate = bufp->translate;
Guido van Rossum004c1e11997-05-09 02:35:58 +00001604
Guido van Rossumdb25f321997-07-10 14:31:32 +00001605 NEW_STATE(state, bufp->num_registers);
Guido van Rossum004c1e11997-05-09 02:35:58 +00001606
1607 continue_matching:
1608 switch (*code++)
1609 {
1610 case Cend:
1611 {
1612 match_end = text - textstart;
1613 if (old_regs)
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001614 {
Guido van Rossum004c1e11997-05-09 02:35:58 +00001615 old_regs->start[0] = pos;
1616 old_regs->end[0] = match_end;
1617 if (!bufp->uses_registers)
1618 {
1619 for (a = 1; a < RE_NREGS; a++)
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001620 {
Guido van Rossum004c1e11997-05-09 02:35:58 +00001621 old_regs->start[a] = -1;
1622 old_regs->end[a] = -1;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001623 }
Guido van Rossum004c1e11997-05-09 02:35:58 +00001624 }
1625 else
1626 {
Guido van Rossumdb25f321997-07-10 14:31:32 +00001627 for (a = 1; a < bufp->num_registers; a++)
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001628 {
Guido van Rossum004c1e11997-05-09 02:35:58 +00001629 if ((GET_REG_START(state, a) == NULL) ||
1630 (GET_REG_END(state, a) == NULL))
1631 {
1632 old_regs->start[a] = -1;
1633 old_regs->end[a] = -1;
1634 continue;
1635 }
1636 old_regs->start[a] = GET_REG_START(state, a) - textstart;
1637 old_regs->end[a] = GET_REG_END(state, a) - textstart;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001638 }
Guido van Rossumdb25f321997-07-10 14:31:32 +00001639 for (; a < RE_NREGS; a++)
1640 {
1641 old_regs->start[a] = -1;
1642 old_regs->end[a] = -1;
1643 }
Guido van Rossum004c1e11997-05-09 02:35:58 +00001644 }
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001645 }
Guido van Rossum004c1e11997-05-09 02:35:58 +00001646 FREE_STATE(state);
1647 return match_end - pos;
1648 }
1649 case Cbol:
1650 {
1651 if (text == textstart || text[-1] == '\n')
1652 goto continue_matching;
1653 goto fail;
1654 }
1655 case Ceol:
1656 {
1657 if (text == textend || *text == '\n')
1658 goto continue_matching;
1659 goto fail;
1660 }
1661 case Cset:
1662 {
1663 NEXTCHAR(ch);
1664 if (code[ch/8] & (1<<(ch & 7)))
1665 {
1666 code += 256/8;
1667 goto continue_matching;
1668 }
1669 goto fail;
1670 }
1671 case Cexact:
1672 {
1673 NEXTCHAR(ch);
1674 if (ch != (unsigned char)*code++)
1675 goto fail;
1676/* { */
1677/* char *p1 = code - 2; */
1678/* ch = *(code - 1); */
1679/* POP_FAILURE(state, code, text, goto done_matching, goto error); */
1680/* while ((code == p1) && (*text != ch)) */
1681/* POP_FAILURE(state, code, text, goto done_matching, goto error); */
1682/* if ((code == p1) && (*text == ch)) */
1683/* { */
1684/* code += 2; */
1685/* text++; */
1686/* } */
1687/* } */
1688 goto continue_matching;
1689 }
1690 case Canychar:
1691 {
1692 NEXTCHAR(ch);
1693 if (ch == '\n')
1694 goto fail;
1695 goto continue_matching;
1696 }
1697 case Cstart_memory:
1698 {
1699 reg = *code++;
1700 SET_REG_START(state, reg, text, goto error);
1701 goto continue_matching;
1702 }
1703 case Cend_memory:
1704 {
1705 reg = *code++;
1706 SET_REG_END(state, reg, text, goto error);
1707 goto continue_matching;
1708 }
1709 case Cmatch_memory:
1710 {
1711 reg = *code++;
1712 regstart = GET_REG_START(state, reg);
1713 regend = GET_REG_END(state, reg);
1714 if ((regstart == NULL) || (regend == NULL))
1715 goto fail; /* or should we just match nothing? */
1716 regsize = regend - regstart;
1717
1718 if (regsize > (textend - text))
1719 goto fail;
1720 if(translate)
1721 {
1722 for (; regstart < regend; regstart++, text++)
1723 if (translate[*regstart] != translate[*text])
1724 goto fail;
1725 }
1726 else
1727 for (; regstart < regend; regstart++, text++)
1728 if (*regstart != *text)
1729 goto fail;
1730/* if (memcmp(text, regstart, regsize) != 0)
1731 goto fail;
1732 text += regsize; */
1733 goto continue_matching;
1734 }
1735 case Cupdate_failure_jump:
1736 {
1737 UPDATE_FAILURE(state, text, goto error);
1738 /* fall to next case */
1739 }
1740 /* treat Cstar_jump just like Cjump if it hasn't been optimized */
1741 case Cstar_jump:
1742 case Cjump:
1743 {
1744 a = (unsigned char)*code++;
1745 a |= (unsigned char)*code++ << 8;
Guido van Rossumdb25f321997-07-10 14:31:32 +00001746 code += (int)SHORT(a);
Guido van Rossum004c1e11997-05-09 02:35:58 +00001747 goto continue_matching;
1748 }
1749 case Cdummy_failure_jump:
1750 {
1751 a = (unsigned char)*code++;
1752 a |= (unsigned char)*code++ << 8;
Guido van Rossumdb25f321997-07-10 14:31:32 +00001753 a = (int)SHORT(a);
Guido van Rossum004c1e11997-05-09 02:35:58 +00001754 assert(*code == Cfailure_jump);
1755 b = (unsigned char)code[1];
1756 b |= (unsigned char)code[2] << 8;
Guido van Rossumdb25f321997-07-10 14:31:32 +00001757 PUSH_FAILURE(state, code + (int)SHORT(b) + 3, NULL, goto error);
Guido van Rossum004c1e11997-05-09 02:35:58 +00001758 code += a;
1759 goto continue_matching;
1760 }
1761 case Cfailure_jump:
1762 {
1763 a = (unsigned char)*code++;
1764 a |= (unsigned char)*code++ << 8;
Guido van Rossumdb25f321997-07-10 14:31:32 +00001765 a = (int)SHORT(a);
Guido van Rossum004c1e11997-05-09 02:35:58 +00001766 PUSH_FAILURE(state, code + a, text, goto error);
1767 goto continue_matching;
1768 }
Guido van Rossumdb25f321997-07-10 14:31:32 +00001769 case Crepeat1:
1770 {
1771 char *pinst;
1772 a = (unsigned char)*code++;
1773 a |= (unsigned char)*code++ << 8;
1774 a = (int)SHORT(a);
1775 pinst = code + a;
1776 /* pinst is sole instruction in loop, and it matches a
1777 * single character. Since Crepeat1 was originally a
1778 * Cupdate_failure_jump, we also know that backtracking is
1779 * useless: so long as the single-character expression matches,
1780 * it must be used. Also, in the case of +, we've already
1781 * matched one character, so + can't fail: nothing here can
1782 * cause a failure.
1783 */
1784 switch (*pinst++)
1785 {
1786 case Cset:
1787 {
1788 if (translate)
1789 {
1790 while (text < textend)
1791 {
1792 ch = translate[(unsigned char)*text];
1793 if (pinst[ch/8] & (1<<(ch & 7)))
1794 text++;
1795 else
1796 break;
1797 }
1798 }
1799 else
1800 {
1801 while (text < textend)
1802 {
1803 ch = (unsigned char)*text;
1804 if (pinst[ch/8] & (1<<(ch & 7)))
1805 text++;
1806 else
1807 break;
1808 }
1809 }
1810 break;
1811 }
1812 case Cexact:
1813 {
1814 ch = (unsigned char)*pinst;
1815 if (translate)
1816 {
1817 while (text < textend &&
1818 translate[(unsigned char)*text] == ch)
1819 text++;
1820 }
1821 else
1822 {
1823 while (text < textend && (unsigned char)*text == ch)
1824 text++;
1825 }
1826 break;
1827 }
1828 case Canychar:
1829 {
1830 while (text < textend && (unsigned char)*text != '\n')
1831 text++;
1832 break;
1833 }
1834 case Csyntaxspec:
1835 {
1836 a = (unsigned char)*pinst;
1837 if (translate)
1838 {
1839 while (text < textend &&
1840 translate[SYNTAX(*text)] == a)
1841 text++;
1842 }
1843 else
1844 {
1845 while (text < textend && SYNTAX(*text) == a)
1846 text++;
1847 }
1848 break;
1849 }
1850 case Cnotsyntaxspec:
1851 {
1852 a = (unsigned char)*pinst;
1853 if (translate)
1854 {
1855 while (text < textend &&
1856 translate[SYNTAX(*text)] != a)
1857 text++;
1858 }
1859 else
1860 {
1861 while (text < textend && SYNTAX(*text) != a)
1862 text++;
1863 }
1864 break;
1865 }
1866 default:
1867 {
1868 abort();
1869 /*NOTREACHED*/
1870 }
1871 }
1872 /* due to the funky way + and * are compiled, the top failure-
1873 * stack entry at this point is actually a success entry --
1874 * update it & pop it
1875 */
1876 UPDATE_FAILURE(state, text, goto error);
1877 goto fail; /* i.e., succeed <wink/sigh> */
1878 }
Guido van Rossum004c1e11997-05-09 02:35:58 +00001879 case Cbegbuf:
1880 {
1881 if (text == textstart)
1882 goto continue_matching;
1883 goto fail;
1884 }
1885 case Cendbuf:
1886 {
1887 if (text == textend)
1888 goto continue_matching;
1889 goto fail;
1890 }
1891 case Cwordbeg:
1892 {
1893 if (text == textend)
1894 goto fail;
1895 if (SYNTAX(*text) != Sword)
1896 goto fail;
1897 if (text == textstart)
1898 goto continue_matching;
1899 if (SYNTAX(text[-1]) != Sword)
1900 goto continue_matching;
1901 goto fail;
1902 }
1903 case Cwordend:
1904 {
1905 if (text == textstart)
1906 goto fail;
1907 if (SYNTAX(text[-1]) != Sword)
1908 goto fail;
1909 if (text == textend)
1910 goto continue_matching;
1911 if (SYNTAX(*text) == Sword)
1912 goto fail;
1913 goto continue_matching;
1914 }
1915 case Cwordbound:
1916 {
1917 /* Note: as in gnu regexp, this also matches at the beginning
1918 * and end of buffer. */
1919
1920 if (text == textstart || text == textend)
1921 goto continue_matching;
1922 if ((SYNTAX(text[-1]) == Sword) ^ (SYNTAX(*text) == Sword))
1923 goto continue_matching;
1924 goto fail;
1925 }
1926 case Cnotwordbound:
1927 {
1928 /* Note: as in gnu regexp, this never matches at the beginning
1929 * and end of buffer. */
1930 if (text == textstart || text == textend)
1931 goto fail;
1932 if (!((SYNTAX(text[-1]) == Sword) ^ (SYNTAX(*text) == Sword)))
1933 goto fail;
1934 goto continue_matching;
1935 }
1936 case Csyntaxspec:
1937 {
1938 NEXTCHAR(ch);
1939 if (SYNTAX(ch) != (unsigned char)*code++)
1940 goto fail;
1941 goto continue_matching;
1942 }
1943 case Cnotsyntaxspec:
1944 {
1945 NEXTCHAR(ch);
1946 if (SYNTAX(ch) != (unsigned char)*code++)
1947 break;
1948 goto continue_matching;
1949 }
1950 default:
1951 {
1952 abort();
1953 /*NOTREACHED*/
1954 }
1955 }
1956
Guido van Rossum3b1a57a1992-01-27 16:47:46 +00001957#if 0 /* This line is never reached --Guido */
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001958 abort();
Guido van Rossum5f21dd11992-01-19 16:49:14 +00001959#endif
Guido van Rossum004c1e11997-05-09 02:35:58 +00001960 /*
1961 *NOTREACHED
1962 */
1963
1964 fail:
1965 POP_FAILURE(state, code, text, goto done_matching, goto error);
1966 goto continue_matching;
1967
1968 done_matching:
1969/* if(translated != NULL) */
1970/* free(translated); */
1971 FREE_STATE(state);
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001972 return -1;
1973
Guido van Rossum004c1e11997-05-09 02:35:58 +00001974 error:
1975/* if (translated != NULL) */
1976/* free(translated); */
1977 FREE_STATE(state);
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001978 return -2;
1979}
1980
1981#undef PREFETCH
1982#undef NEXTCHAR
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001983
Guido van Rossum004c1e11997-05-09 02:35:58 +00001984int re_search(regexp_t bufp,
1985 char *string,
1986 int size,
1987 int pos,
1988 int range,
1989 regexp_registers_t regs)
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001990{
Guido van Rossum004c1e11997-05-09 02:35:58 +00001991 char *fastmap;
1992 char *translate;
1993 char *text;
1994 char *partstart;
1995 char *partend;
1996 int dir;
1997 int ret;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00001998 char anchor;
1999
Guido van Rossum004c1e11997-05-09 02:35:58 +00002000 assert(size >= 0 && pos >= 0);
2001 assert(pos + range >= 0 && pos + range <= size); /* Bugfix by ylo */
Guido van Rossumb674c3b1992-01-19 16:32:47 +00002002
2003 fastmap = bufp->fastmap;
2004 translate = bufp->translate;
2005 if (fastmap && !bufp->fastmap_accurate)
Guido van Rossum004c1e11997-05-09 02:35:58 +00002006 re_compile_fastmap(bufp);
Guido van Rossumb674c3b1992-01-19 16:32:47 +00002007 anchor = bufp->anchor;
2008 if (bufp->can_be_null == 1) /* can_be_null == 2: can match null at eob */
Guido van Rossum004c1e11997-05-09 02:35:58 +00002009 fastmap = NULL;
2010
Guido van Rossumb674c3b1992-01-19 16:32:47 +00002011 if (range < 0)
Guido van Rossum004c1e11997-05-09 02:35:58 +00002012 {
2013 dir = -1;
2014 range = -range;
2015 }
Guido van Rossumb674c3b1992-01-19 16:32:47 +00002016 else
Guido van Rossum004c1e11997-05-09 02:35:58 +00002017 dir = 1;
2018
Guido van Rossumb674c3b1992-01-19 16:32:47 +00002019 if (anchor == 2)
Guido van Rossum004c1e11997-05-09 02:35:58 +00002020 if (pos != 0)
2021 return -1;
2022 else
2023 range = 0;
2024
Guido van Rossumb674c3b1992-01-19 16:32:47 +00002025 for (; range >= 0; range--, pos += dir)
Guido van Rossum004c1e11997-05-09 02:35:58 +00002026 {
2027 if (fastmap)
2028 {
2029 if (dir == 1)
2030 { /* searching forwards */
2031
2032 text = string + pos;
2033 partend = string + size;
2034 partstart = text;
2035 if (translate)
2036 while (text != partend &&
2037 !fastmap[(unsigned char) translate[(unsigned char)*text]])
2038 text++;
2039 else
2040 while (text != partend && !fastmap[(unsigned char)*text])
2041 text++;
2042 pos += text - partstart;
2043 range -= text - partstart;
2044 if (pos == size && bufp->can_be_null == 0)
2045 return -1;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00002046 }
Guido van Rossum004c1e11997-05-09 02:35:58 +00002047 else
2048 { /* searching backwards */
2049 text = string + pos;
2050 partstart = string + pos - range;
2051 partend = text;
2052 if (translate)
2053 while (text != partstart &&
2054 !fastmap[(unsigned char)
2055 translate[(unsigned char)*text]])
2056 text--;
2057 else
2058 while (text != partstart &&
2059 !fastmap[(unsigned char)*text])
2060 text--;
2061 pos -= partend - text;
2062 range -= partend - text;
Guido van Rossumb674c3b1992-01-19 16:32:47 +00002063 }
Guido van Rossum004c1e11997-05-09 02:35:58 +00002064 }
2065 if (anchor == 1)
2066 { /* anchored to begline */
Guido van Rossumfc4f5031997-05-14 18:27:51 +00002067 if (pos > 0 && (string[pos - 1] != '\n'))
Guido van Rossum004c1e11997-05-09 02:35:58 +00002068 continue;
2069 }
2070 assert(pos >= 0 && pos <= size);
2071 ret = re_match(bufp, string, size, pos, regs);
2072 if (ret >= 0)
Guido van Rossumb674c3b1992-01-19 16:32:47 +00002073 return pos;
Guido van Rossum004c1e11997-05-09 02:35:58 +00002074 if (ret == -2)
Guido van Rossumb674c3b1992-01-19 16:32:47 +00002075 return -2;
Guido van Rossum004c1e11997-05-09 02:35:58 +00002076 }
Guido van Rossumb674c3b1992-01-19 16:32:47 +00002077 return -1;
2078}